├── tests ├── __init__.py ├── data │ ├── file2.txt │ ├── file.txt │ ├── ccc-example-coef.pkl │ ├── ccc-example-data.pkl │ ├── ccc-random_data-coef.pkl │ ├── ccc-random_data-data.pkl │ └── README.md ├── README.md ├── test_log.py ├── test_methods.py ├── test_pytorch_core.py ├── test_scipy_stats.py └── test_conf.py ├── libs └── ccc │ ├── numpy │ └── __init__.py │ ├── pytorch │ └── __init__.py │ ├── scipy │ └── __init__.py │ ├── sklearn │ └── __init__.py │ ├── utils │ └── __init__.py │ ├── __init__.py │ ├── coef │ └── __init__.py │ ├── methods.py │ ├── log_config.yaml │ ├── log.py │ ├── settings.py │ └── corr.py ├── .gitattributes ├── misc └── logo │ └── ccc.png ├── scripts ├── styler.r ├── jupytext_sync.sh ├── touch_pys.sh ├── env.sh ├── create_docker_image.sh ├── run_nbs_server.sh ├── convert_ipynb_to_py.sh ├── rsync.sh └── run_docker.sh ├── nbs ├── .jupytext ├── others │ └── 05_clustermatch_profiling │ │ ├── 10_cm_optimized │ │ ├── 07-n_samples_large.txt │ │ ├── 07-n_samples_small.txt │ │ ├── 04-n_samples_large.txt │ │ ├── 04-n_samples_small.txt │ │ ├── 00-n_samples_large.txt │ │ ├── 00-n_samples_small.txt │ │ ├── 01-n_samples_large.txt │ │ ├── 01-n_samples_small.txt │ │ ├── 02-n_samples_large.txt │ │ ├── 02-n_samples_small.txt │ │ ├── 03-n_samples_large.txt │ │ ├── 03-n_samples_small.txt │ │ ├── 08-n_samples_small_50.txt │ │ ├── 08-n_samples_large_50000.txt │ │ ├── 09-n_samples_large_50000.txt │ │ ├── 09-n_samples_small_1000.txt │ │ ├── 08-n_samples_small_100.txt │ │ ├── 08-n_samples_small_500.txt │ │ ├── 09-n_samples_small_100.txt │ │ ├── 09-n_samples_small_50.txt │ │ ├── 09-n_samples_small_500.txt │ │ ├── 08-n_samples_large_100000.txt │ │ ├── 09-n_samples_large_100000.txt │ │ ├── 08-n_samples_small_1000.txt │ │ ├── 10-n_samples_small_50.txt │ │ ├── 10-n_samples_large_50000.txt │ │ ├── 10-n_samples_small_100.txt │ │ ├── 10-n_samples_small_1000.txt │ │ ├── 10-n_samples_small_500.txt │ │ ├── 10-n_samples_large_100000.txt │ │ ├── 06-n_samples_large.txt │ │ ├── 05-n_samples_small.txt │ │ ├── 05-n_samples_large.txt │ │ ├── 06-n_samples_small.txt │ │ └── py │ │ │ ├── 01-cdist_parts_v00.py │ │ │ ├── 00-run_reference.py │ │ │ ├── 04-get_parts_v00.py │ │ │ ├── 02-cdist_parts_v01.py │ │ │ ├── 03-cdist_parts_v02.py │ │ │ ├── 05-get_parts_v01.py │ │ │ ├── 07-get_parts_v03.py │ │ │ └── 06-get_parts_v02.py │ │ ├── README.md │ │ ├── 05_cm_optimized │ │ ├── py │ │ │ ├── 06-many_genes.py │ │ │ ├── 07-many_samples.py │ │ │ ├── 05-compare_precomputing_of_parts.py │ │ │ └── 04-compare_numba_ari.py │ │ ├── 04-cm_ari_numba.txt │ │ ├── 04-cm_ari_sklearn.txt │ │ ├── 05-cm_precompute_parts_false.txt │ │ ├── 05-cm_precompute_parts_true.txt │ │ ├── 06-cm_many_genes.txt │ │ ├── 07-cm_many_samples-less_internal_n_clusters.txt │ │ └── 07-cm_many_samples-default_internal_n_clusters.txt │ │ ├── 06_cm_optimized │ │ ├── py │ │ │ ├── 06-many_genes.py │ │ │ ├── 04-compare_numba_ari.py │ │ │ └── 07-many_samples.py │ │ └── 06-cm_many_genes.txt │ │ ├── 07_cm_optimized │ │ ├── py │ │ │ ├── 06-many_genes.py │ │ │ ├── 04-compare_numba_ari.py │ │ │ └── 07-many_samples.py │ │ └── 04-cm_ari_numba.txt │ │ ├── 11_cm_optimized │ │ └── py │ │ │ ├── 06-many_genes.py │ │ │ ├── 08-many_genes.py │ │ │ ├── 07-many_samples.py │ │ │ └── 09-many_samples.py │ │ └── 12_cm_optimized │ │ └── py │ │ ├── 06-many_genes.py │ │ ├── 10-many_genes.py │ │ ├── 08-many_genes.py │ │ ├── 07-many_samples.py │ │ ├── 11-many_samples.py │ │ └── 09-many_samples.py ├── run_nbs.sh ├── 25_pvalue │ └── py │ │ ├── 00-ccc_pvalue_dist-generate-data_matrix.py │ │ └── 
01-ccc_pvalue_dist-generate-gene_pairs.py ├── 99_manuscript │ ├── k_max │ │ └── py │ │ │ └── 01-k_max-runs.py │ └── giant │ │ └── py │ │ └── 03_00-giant-get_gene_info.py └── 20_comparison_others │ └── py │ ├── 60-time_test-1_cpu_core.py │ ├── 61-time_test-3_cpu_cores.py │ └── 62-time_test-6_cpu_cores.py ├── .dockerignore ├── setup.cfg ├── entrypoint.sh ├── environment ├── scripts │ ├── install_other_packages.sh │ ├── environment_base.yml │ └── install_r_packages.r └── environment.yml ├── LICENSE_bundled ├── .github └── workflows │ └── lint.yaml ├── setup.py ├── Dockerfile ├── .gitignore └── LICENSE /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /libs/ccc/numpy/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /libs/ccc/pytorch/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /libs/ccc/scipy/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /libs/ccc/sklearn/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | tests/data/* -text 2 | -------------------------------------------------------------------------------- /tests/data/file2.txt: -------------------------------------------------------------------------------- 1 | another file 2 | -------------------------------------------------------------------------------- /tests/data/file.txt: -------------------------------------------------------------------------------- 1 | a file with some content 2 | -------------------------------------------------------------------------------- /misc/logo/ccc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/greenelab/ccc/HEAD/misc/logo/ccc.png -------------------------------------------------------------------------------- /libs/ccc/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from ccc.utils.utility_functions import * # noqa: F403, F401 2 | -------------------------------------------------------------------------------- /libs/ccc/__init__.py: -------------------------------------------------------------------------------- 1 | # Remember to also update the version in setup.py when changing it here 2 | __version__ = "0.2.2" 3 | -------------------------------------------------------------------------------- /tests/data/ccc-example-coef.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/greenelab/ccc/HEAD/tests/data/ccc-example-coef.pkl -------------------------------------------------------------------------------- /tests/data/ccc-example-data.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/greenelab/ccc/HEAD/tests/data/ccc-example-data.pkl
-------------------------------------------------------------------------------- /tests/data/ccc-random_data-coef.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/greenelab/ccc/HEAD/tests/data/ccc-random_data-coef.pkl -------------------------------------------------------------------------------- /tests/data/ccc-random_data-data.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/greenelab/ccc/HEAD/tests/data/ccc-random_data-data.pkl -------------------------------------------------------------------------------- /scripts/styler.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | args <- commandArgs(trailingOnly = TRUE) 4 | file_name <- args[1L] 5 | styler::style_file(file_name) 6 | -------------------------------------------------------------------------------- /nbs/.jupytext: -------------------------------------------------------------------------------- 1 | cell_metadata_filter = "all,-execution,-papermill,-trusted" 2 | notebook_metadata_filter = "-jupytext.text_representation.jupytext_version" 3 | formats = "ipynb,py//auto:percent" 4 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | # project specific 2 | data/ 3 | images/ 4 | base/ 5 | 6 | # git 7 | .git/ 8 | .gitignore 9 | .github/ 10 | 11 | # python 12 | .idea/ 13 | .pytest_cache/ 14 | **/__pycache__ 15 | *.py[cod] 16 | 17 | # other 18 | *.swp 19 | -------------------------------------------------------------------------------- /libs/ccc/coef/__init__.py: -------------------------------------------------------------------------------- 1 | from ccc.coef.impl import * # noqa: F403, F401 2 | 3 | # Run CCC once to initialize/compile its functions with numba 4 | from ccc.coef.impl import ccc 5 | import numpy as np 6 | 7 | ccc(np.random.rand(10), np.random.rand(10)) 8 | -------------------------------------------------------------------------------- /scripts/jupytext_sync.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This is used to go through all Jupyter notebooks, run black on the text 4 | # representation of the code, and sync it with the ipynb file. 5 | 6 | parallel 'jupytext --sync --pipe black {}' ::: nbs/**/*.ipynb 7 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = 3 | E501, 4 | W503 5 | exclude = 6 | # No need to traverse our git directory 7 | .git, 8 | setup.py, 9 | max-line-length = 88 10 | per-file-ignores = 11 | nbs/**/py/*.py:E302,E305,E402,F821 -------------------------------------------------------------------------------- /scripts/touch_pys.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Updates the timestamps of all python scripts (.py) converted from notebooks. 4 | # This is sometimes needed when git updates files after a pull; otherwise 5 | # Jupyter won't load the notebooks in the browser. 6 | 7 | find . 
-type f -wholename "**/py/*.py" -exec touch {} + 8 | -------------------------------------------------------------------------------- /entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash --login 2 | # Taken from here with modifications: https://pythonspeed.com/articles/activate-conda-dockerfile/ 3 | # The --login ensures the bash configuration is loaded, 4 | # enabling Conda. 5 | 6 | set +eu 7 | conda activate ccc 8 | set -euo pipefail 9 | 10 | # load environment variables 11 | eval `python libs/ccc/conf.py` 12 | 13 | exec "$@" 14 | 15 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | # Unit tests 2 | 3 | ## Run 4 | 5 | These are the instructions to run the unit tests. It is assumed that you have already 6 | followed the steps to set up the environment and download the needed data, and that 7 | your `PYTHONPATH` and `CM_ROOT_DIR` variables are set appropriately. 8 | 9 | Execute this command to run the unit tests: 10 | 11 | ```bash 12 | pytest -rs --color=yes tests/ 13 | ``` 14 | -------------------------------------------------------------------------------- /libs/ccc/methods.py: -------------------------------------------------------------------------------- 1 | """ 2 | Contains other correlation methods. 3 | """ 4 | import warnings 5 | 6 | from minepy.mine import MINE 7 | 8 | 9 | def mic(x, y, estimator="mic_approx"): 10 | """ 11 | Given two arrays (x and y), it computes MIC with the given estimator and default parameters. 12 | """ 13 | with warnings.catch_warnings(): 14 | warnings.filterwarnings("ignore", category=DeprecationWarning) 15 | 16 | mine = MINE(alpha=0.6, c=15, est=estimator) 17 | mine.compute_score(x, y) 18 | return mine.mic() 19 | -------------------------------------------------------------------------------- /tests/test_log.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests the log.py module. 3 | """ 4 | 5 | 6 | def test_log_module_load(): 7 | from ccc import log 8 | 9 | assert log is not None 10 | assert log.__file__ is not None 11 | 12 | 13 | def test_log_get_logger(): 14 | from ccc import log 15 | 16 | logger = log.get_logger("testing") 17 | assert logger is not None 18 | assert hasattr(logger, "info") 19 | assert hasattr(logger, "debug") 20 | assert hasattr(logger, "error") 21 | 22 | logger.info("test") 23 | logger.warning("test warn") 24 | -------------------------------------------------------------------------------- /scripts/env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This file exports some common environment variables needed to run the code. It 4 | # has to be customized for your needs by changing the BASE_DIR and CM_N_JOBS 5 | # below. 6 | 7 | # Your settings here 8 | # BASE_DIR is the parent directory where the code and manuscript repos are 9 | # located. 
10 | BASE_DIR=/home/miltondp/projects/ccc/greenelab/ 11 | export CM_N_JOBS=20 12 | 13 | export CM_ROOT_DIR=${BASE_DIR}/ccc/base 14 | export CM_MANUSCRIPT_DIR=${BASE_DIR}/ccc-manuscript/ 15 | 16 | export PYTHONPATH=${BASE_DIR}/ccc/libs/ 17 | 18 | -------------------------------------------------------------------------------- /environment/scripts/install_other_packages.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script installs other dependencies that cannot be directly installed using conda. 4 | 5 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 6 | 7 | # Fix tqdm with JupyterLab: https://github.com/tqdm/tqdm/issues/394#issuecomment-384743637 8 | # jupyter nbextension enable --py widgetsnbextension 9 | 10 | # jupyter labextension install @jupyter-widgets/jupyterlab-manager 11 | 12 | # 13 | # R dependencies 14 | # 15 | TAR=$(which tar) Rscript ${SCRIPT_DIR}/install_r_packages.r 16 | 17 | -------------------------------------------------------------------------------- /libs/ccc/log_config.yaml: -------------------------------------------------------------------------------- 1 | version: 1 2 | 3 | formatters: 4 | simple: 5 | format: "[%(asctime)s - %(name)s] %(levelname)s: %(message)s" 6 | 7 | handlers: 8 | empty: 9 | class: logging.NullHandler 10 | 11 | console: 12 | class: logging.StreamHandler 13 | level: INFO 14 | formatter: simple 15 | 16 | file_handler: 17 | class: logging.FileHandler 18 | level: DEBUG 19 | filename: logging.txt 20 | formatter: simple 21 | delay: true 22 | 23 | loggers: 24 | none: 25 | handlers: [empty] 26 | propagate: false 27 | 28 | root: 29 | handlers: [console] 30 | level: INFO 31 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/07-n_samples_large.txt: -------------------------------------------------------------------------------- 1 | 14 function calls in 5.605 seconds 2 | 3 | Ordered by: cumulative time 4 | 5 | ncalls tottime percall cumtime percall filename:lineno(function) 6 | 1 0.000 0.000 5.605 5.605 {built-in method builtins.exec} 7 | 1 0.000 0.000 5.605 5.605 :1() 8 | 1 0.000 0.000 5.605 5.605 691993785.py:1(func) 9 | 10 5.605 0.560 5.605 0.560 coef.py:254(_cm) 10 | 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects} -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/07-n_samples_small.txt: -------------------------------------------------------------------------------- 1 | 14 function calls in 0.034 seconds 2 | 3 | Ordered by: cumulative time 4 | 5 | ncalls tottime percall cumtime percall filename:lineno(function) 6 | 1 0.000 0.000 0.034 0.034 {built-in method builtins.exec} 7 | 1 0.000 0.000 0.034 0.034 :1() 8 | 1 0.000 0.000 0.034 0.034 691993785.py:1(func) 9 | 10 0.033 0.003 0.033 0.003 coef.py:254(_cm) 10 | 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects} -------------------------------------------------------------------------------- /LICENSE_bundled: -------------------------------------------------------------------------------- 1 | The Clustermatch Correlation Coefficient (CCC) repository and source 2 | distributions bundle a number of libraries that are compatibly 3 | licensed. We list these here. 
4 | 5 | Name: scikit-learn 6 | Files: libs/ccc/sklearn/* 7 | License: BSD 3-Clause License 8 | For details, see the header inside libs/ccc/sklearn/metrics.py 9 | 10 | Name: SciPy 11 | Files: libs/ccc/scipy/* 12 | License: BSD 3-Clause License 13 | For details, see the header inside libs/ccc/scipy/stats.py 14 | 15 | Name: PyTorch 16 | Files: libs/ccc/pytorch/* 17 | License: BSD License 18 | For details, see the header inside libs/ccc/pytorch/core.py 19 | -------------------------------------------------------------------------------- /environment/scripts/environment_base.yml: -------------------------------------------------------------------------------- 1 | name: ccc 2 | channels: 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | - ipython 7 | - ipywidgets 8 | - jupyterlab 9 | - jupytext 10 | - matplotlib 11 | - minepy 12 | - numba 13 | - numpy 14 | - openpyxl 15 | - pandas 16 | - papermill 17 | - pip 18 | - pytables 19 | - pytest 20 | - python=3.9.* 21 | - pyyaml 22 | - requests 23 | - r-base 24 | - r-devtools 25 | - r-essentials 26 | - r-reticulate 27 | - r-svglite 28 | - rpy2 29 | - scikit-learn 30 | - scipy 31 | - seaborn 32 | - svgutils 33 | - tabulate 34 | - tqdm 35 | - upsetplot 36 | 37 | -------------------------------------------------------------------------------- /libs/ccc/log.py: -------------------------------------------------------------------------------- 1 | """ 2 | Provides logging functions. 3 | """ 4 | import logging 5 | import logging.config 6 | import yaml 7 | 8 | from ccc import conf 9 | 10 | 11 | def _get_logger_config(): 12 | """Reads the logging config file in YAML format.""" 13 | with open(conf.GENERAL["LOG_CONFIG_FILE"], "r") as f: 14 | return yaml.safe_load(f.read()) 15 | 16 | 17 | logging.config.dictConfig(_get_logger_config()) 18 | 19 | 20 | def get_logger(log_name: str = None) -> logging.Logger: 21 | """ 22 | Returns a Logger instance. 23 | 24 | Args: 25 | log_name: logger name. 26 | 27 | Returns: 28 | A Logger instance configured with default settings. 29 | """ 30 | return logging.getLogger(log_name) 31 | -------------------------------------------------------------------------------- /environment/environment.yml: -------------------------------------------------------------------------------- 1 | name: ccc 2 | channels: 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | - ipython=7.* 7 | - ipywidgets 8 | - jupyterlab=3.3.* 9 | - jupytext=1.11.* 10 | - matplotlib=3.4.* 11 | - minepy=1.2.* 12 | - numba=0.53.* 13 | - numpy=1.21.* 14 | - openpyxl=3.0.* 15 | - pandas=1.3.* 16 | - papermill=2.3.* 17 | - pip 18 | - pytables=3.7.* 19 | - pytest=6.* 20 | - python=3.9.* 21 | - pyyaml=5.4.* 22 | - requests=2.* 23 | - r-base=4.1.* 24 | - r-devtools 25 | - r-essentials 26 | - r-reticulate=1.* 27 | - r-svglite=2.* 28 | - rpy2=3.4.* 29 | - scikit-learn=0.24.* 30 | - scipy=1.7.* 31 | - seaborn=0.11.* 32 | - svgutils=0.3.* 33 | - tabulate=0.8.* 34 | - tqdm=4.* 35 | - upsetplot=0.6.* -------------------------------------------------------------------------------- /scripts/create_docker_image.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PROJECT_NAME="ccc" 4 | VERSION="1.0" 5 | 6 | CURRENT_IMAGE_ID=$(docker images --filter=reference=miltondp/${PROJECT_NAME}:latest --format "{{.ID}}") 7 | 8 | docker build -t miltondp/${PROJECT_NAME}:${VERSION} -t miltondp/${PROJECT_NAME}:latest . 9 | 10 | read -p "'docker push' new image and retag? 
" -r 11 | echo # (optional) move to a new line 12 | if [[ $REPLY =~ ^[Yy]$ ]]; then 13 | # push version label 14 | echo "Pushing new image to miltondp/${PROJECT_NAME}:${VERSION}" 15 | docker push miltondp/${PROJECT_NAME}:${VERSION} 16 | 17 | # push latest label 18 | echo "Pushing new image as latest" 19 | docker push miltondp/${PROJECT_NAME}:latest 20 | 21 | # retag previous version 22 | docker tag ${CURRENT_IMAGE_ID} miltondp/${PROJECT_NAME}:prev 23 | fi 24 | 25 | -------------------------------------------------------------------------------- /scripts/run_nbs_server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # The script allows to run a JupyterLab server, listening to local connections 4 | # only by default. 5 | # It accepts only one argument, which could be: 6 | # * "--container-mode": it sets some parameters when starting the jupyter server 7 | # to make it work inside a Docker container. 8 | # * any other value: it is the token that the server will request from users; 9 | # in addition, it will listen to any address (*). 10 | 11 | PORT=8893 12 | 13 | IP="127.0.0.1" 14 | TOKEN="" 15 | EXTRA_ARGS="" 16 | 17 | if [ "$1" = "--container-mode" ]; then 18 | IP="*" 19 | # EXTRA_ARGS="--allow-root" 20 | elif [ ! -z "$1" ]; then 21 | IP="*" 22 | TOKEN="${1}" 23 | fi 24 | 25 | exec jupyter lab \ 26 | --ip="${IP}" \ 27 | --port="${PORT}" \ 28 | --ContentsManager.allow_hidden=True \ 29 | --no-browser \ 30 | --ServerApp.token="${TOKEN}" ${EXTRA_ARGS} 31 | 32 | -------------------------------------------------------------------------------- /scripts/convert_ipynb_to_py.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # show commands being executed (for debugging purposes) 4 | #set -x 5 | 6 | SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 7 | 8 | NOTEBOOK="${1}" 9 | if [ -z "${NOTEBOOK}" ]; then 10 | echo "Provide the notebook path" 11 | exit 1 12 | fi 13 | 14 | # capture whether notebook has a python or R kernel 15 | regex="\"file_extension\": \"(\.[a-zA-Z]+)\"\," 16 | value=`cat ${NOTEBOOK} | grep "file_extension"` 17 | if [[ $value =~ $regex ]]; then 18 | fext="${BASH_REMATCH[1]}" 19 | else 20 | echo "ERROR: file extension not found" 21 | exit 1 22 | fi 23 | 24 | # select code formatter according to file extension 25 | PIPE_CMD=("black {}") 26 | if [ "$fext" = ".r" ] || [ "$fext" = ".R" ]; then 27 | PIPE_CMD=("${SCRIPT_DIR}/styler.r {}") 28 | fi 29 | 30 | jupytext \ 31 | --sync \ 32 | --pipe "${PIPE_CMD[@]}" \ 33 | ${NOTEBOOK} 34 | 35 | -------------------------------------------------------------------------------- /environment/scripts/install_r_packages.r: -------------------------------------------------------------------------------- 1 | # This script installs R packages. When installing BiocManager, the script updates all R packages 2 | # currently installed (options update=TRUE, ask=FALSE in BiocManager::install). 
3 | 4 | 5 | default_repo <- "http://cran.us.r-project.org" 6 | 7 | # install BiocManager but do not update R packages so we keep those installed 8 | # with conda 9 | if (!requireNamespace("BiocManager", quietly = TRUE)) { 10 | install.packages("BiocManager", repos = default_repo) 11 | } 12 | BiocManager::install(version = "3.13", update = FALSE, ask = FALSE) 13 | 14 | # styler 15 | BiocManager::install("styler", update = FALSE, ask = FALSE) 16 | 17 | # org.Hs.eg.db 18 | BiocManager::install("org.Hs.eg.db", update = FALSE, ask = FALSE) 19 | 20 | # clusterProfiler 21 | # BiocManager::install("clusterProfiler", update = FALSE, ask = FALSE) 22 | 23 | # ReactomePA 24 | # BiocManager::install("ReactomePA", update = FALSE, ask = FALSE) 25 | 26 | # library(devtools) 27 | 28 | # fgsea 29 | # install_github("ctlab/fgsea", ref="v1.17.0") -------------------------------------------------------------------------------- /.github/workflows/lint.yaml: -------------------------------------------------------------------------------- 1 | name: lint 2 | on: 3 | push: 4 | pull_request: 5 | types: [opened, reopened] 6 | jobs: 7 | run-linters: 8 | name: Run linters 9 | runs-on: ubuntu-latest 10 | 11 | steps: 12 | - name: Check out Git repository 13 | uses: actions/checkout@v2 14 | 15 | - name: Set up Python 16 | uses: actions/setup-python@v1 17 | with: 18 | python-version: 3.9 19 | 20 | - name: Install Python dependencies 21 | run: pip install black flake8 22 | 23 | - name: Run linters 24 | uses: wearerequired/lint-action@v1 25 | with: 26 | github_token: ${{ secrets.github_token }} 27 | # Enable linters 28 | black: true 29 | flake8: true 30 | # Mark the following line true if you want linters to attempt to 31 | # autocorrect your code 32 | auto_fix: true 33 | git_name: "Greene Lab Linter" 34 | git_email: "miltondp@gmail.com" 35 | commit_message: "fix code style issues with ${linter}" 36 | 37 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/README.md: -------------------------------------------------------------------------------- 1 | # Clustermatch profiling 2 | 3 | This folder contains profiling results (with cProfile) of different 4 | optimizations of the clustermatch code. A brief description of each subfolder is 5 | below. 6 | 7 | * `05_cm_optimized`: 8 | * ARI implementation with numba 9 | * precomputing of internal partitions 10 | 11 | * `06_cm_optimized`: 12 | * cm function fully implemented in numba 13 | 14 | * `07_cm_optimized`: 15 | * cm function now supports parallelization (from numba) 16 | 17 | * `10_cm_optimized`: 18 | * optimization for computing ARI in parallel (function `cdist_parts`; a sketch of this numba pattern is shown below) 19 | * many optimizations in other functions associated with `_get_parts`, such as rank, run_quantile_clustering, etc. 20 | * the idea here is to optimize the processing of a single variable pair 21 | 22 | * `11_cm_optimized`: 23 | * after all the optimizations in `10_cm_optimized`, this is a copy of `07_cm_optimized` to check 24 | that the matrix data input still works correctly. 25 | 26 | * `12_cm_optimized`: 27 | * a copy of `11_cm_optimized` with some other optimizations. 
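28 | 
29 | The numba pattern behind these optimizations looks roughly like the sketch below.
30 | This is a minimal illustration with made-up names (`compare_parts` and
31 | `cdist_parts_sketch` are placeholders), not the actual CCC implementation, which
32 | lives in `libs/ccc/coef`:
33 | 
34 | ```python
35 | import numpy as np
36 | from numba import njit, prange
37 | 
38 | 
39 | @njit(cache=True)
40 | def compare_parts(part_i, part_j):
41 |     # stand-in for the ARI computation between two partitions
42 |     return float((part_i == part_j).sum()) / part_i.shape[0]
43 | 
44 | 
45 | @njit(cache=True, parallel=True)
46 | def cdist_parts_sketch(parts_i, parts_j):
47 |     # compare all pairs of partitions; prange runs the outer loop in
48 |     # parallel, similar in spirit to the cdist_parts function mentioned above
49 |     res = np.zeros((parts_i.shape[0], parts_j.shape[0]))
50 |     for i in prange(parts_i.shape[0]):
51 |         for j in range(parts_j.shape[0]):
52 |             res[i, j] = compare_parts(parts_i[i], parts_j[j])
53 |     return res
54 | 
55 | 
56 | # example: compare 9 partitions of 100 objects each against themselves
57 | parts = np.random.randint(0, 3, size=(9, 100))
58 | cdist_parts_sketch(parts, parts)
59 | ```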
60 | -------------------------------------------------------------------------------- /tests/test_methods.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from ccc.methods import mic 4 | 5 | 6 | def test_mic_basic(): 7 | # Prepare 8 | np.random.seed(123) 9 | 10 | # two features on 100 objects (random data) 11 | feature0 = np.random.rand(100) 12 | feature1 = np.random.rand(100) 13 | 14 | # Run 15 | mic_value = mic(feature0, feature1) 16 | assert mic_value is not None 17 | assert isinstance(mic_value, float) 18 | assert 1.0 > mic_value > 0.0 19 | 20 | 21 | def test_mic_use_estimator_mic_e(): 22 | # Prepare 23 | np.random.seed(123) 24 | 25 | # two features on 100 objects (random data) 26 | feature0 = np.random.rand(100) 27 | feature1 = np.random.rand(100) 28 | 29 | # Run default estimator 30 | mic_value = mic(feature0, feature1) 31 | 32 | # Run with mic_e estimator 33 | mic_e_value = mic(feature0, feature1, estimator="mic_e") 34 | 35 | assert mic_e_value is not None 36 | assert isinstance(mic_e_value, float) 37 | assert 1.0 > mic_e_value > 0.0 38 | 39 | # make sure the estimator parameter is being used 40 | assert mic_value != mic_e_value 41 | -------------------------------------------------------------------------------- /scripts/rsync.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # Intended for internal use only, with very personalized settings. 5 | # 6 | # This script runs rsync with some common parameters to sync with a remote 7 | # machine. For instance, it checks files' hashes instead of timestamps, and 8 | # excludes some huge files that are not needed. 9 | # 10 | # It accepts one argument: the absolute path of the remote directory where 11 | # the base directory is located. 12 | 13 | GIT_ROOT_DIR=$(git rev-parse --show-toplevel) 14 | LOCAL_DIR="${GIT_ROOT_DIR}/base/" 15 | 16 | REMOTE_DIR="${1}" 17 | if [ -z "${REMOTE_DIR}" ]; then 18 | # if the remote dir is not given, use the same path as the local one 19 | REMOTE_DIR=${LOCAL_DIR} 20 | fi 21 | 22 | rsync \ 23 | -chavzP \ 24 | --stats \ 25 | --exclude 'GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz' \ 26 | --exclude 'GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt' \ 27 | --exclude 'recount_data_prep_PLIER.*' \ 28 | --exclude 'recount2_PLIER_data.zip' \ 29 | pcgreene:${REMOTE_DIR} \ 30 | ${LOCAL_DIR} 31 | 32 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/05_cm_optimized/py/06-many_genes.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # text_representation: 6 | # extension: .py 7 | # format_name: percent 8 | # format_version: '1.3' 9 | # jupytext_version: 1.11.5 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] 20 | # Clustermatch run using a larger number of genes. 
21 | 22 | # %% [markdown] 23 | # # Modules 24 | 25 | # %% 26 | import numpy as np 27 | 28 | from ccc.coef import ccc 29 | 30 | # %% [markdown] 31 | # # Data 32 | 33 | # %% tags=[] 34 | n_genes, n_samples = 500, 1000 35 | 36 | # %% tags=[] 37 | np.random.seed(0) 38 | 39 | # %% tags=[] 40 | data = np.random.rand(n_genes, n_samples) 41 | 42 | # %% tags=[] 43 | data.shape 44 | 45 | 46 | # %% [markdown] tags=[] 47 | # # Profile 48 | 49 | # %% tags=[] 50 | def func(): 51 | return ccc(data, internal_n_clusters=range(2, 10 + 1), precompute_parts=True) 52 | 53 | 54 | # %% tags=[] 55 | # %%timeit func() 56 | func() 57 | 58 | # %% tags=[] 59 | # %%prun -s cumulative -l 20 -T 06-cm_many_genes.txt 60 | func() 61 | 62 | # %% tags=[] 63 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/04-n_samples_large.txt: -------------------------------------------------------------------------------- 1 | 134 function calls in 15.269 seconds 2 | 3 | Ordered by: cumulative time 4 | 5 | ncalls tottime percall cumtime percall filename:lineno(function) 6 | 1 0.000 0.000 15.269 15.269 {built-in method builtins.exec} 7 | 1 0.000 0.000 15.269 15.269 :1() 8 | 1 0.007 0.007 15.269 15.269 1556911885.py:1(func) 9 | 10 0.026 0.003 15.262 1.526 coef.py:266(_cm) 10 | 20 11.375 0.569 11.375 0.569 coef.py:169(_get_parts) 11 | 10 3.860 0.386 3.860 0.386 coef.py:199(cdist_parts) 12 | 30 0.001 0.000 0.001 0.000 {built-in method numpy.zeros} 13 | 10 0.000 0.000 0.000 0.000 {method 'argmax' of 'numpy.ndarray' objects} 14 | 10 0.000 0.000 0.000 0.000 {built-in method numpy.empty} 15 | 10 0.000 0.000 0.000 0.000 coef.py:119(_get_range_n_clusters) 16 | 10 0.000 0.000 0.000 0.000 coef.py:249(unravel_index_2d) 17 | 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects} 18 | 10 0.000 0.000 0.000 0.000 special.py:18(__new__) 19 | 10 0.000 0.000 0.000 0.000 coef.py:225(get_coords_from_index) -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/04-n_samples_small.txt: -------------------------------------------------------------------------------- 1 | 134 function calls in 0.028 seconds 2 | 3 | Ordered by: cumulative time 4 | 5 | ncalls tottime percall cumtime percall filename:lineno(function) 6 | 1 0.000 0.000 0.028 0.028 {built-in method builtins.exec} 7 | 1 0.000 0.000 0.028 0.028 :1() 8 | 1 0.000 0.000 0.028 0.028 1556911885.py:1(func) 9 | 10 0.001 0.000 0.027 0.003 coef.py:266(_cm) 10 | 10 0.019 0.002 0.019 0.002 coef.py:199(cdist_parts) 11 | 20 0.008 0.000 0.008 0.000 coef.py:169(_get_parts) 12 | 30 0.000 0.000 0.000 0.000 {built-in method numpy.zeros} 13 | 10 0.000 0.000 0.000 0.000 {method 'argmax' of 'numpy.ndarray' objects} 14 | 10 0.000 0.000 0.000 0.000 coef.py:119(_get_range_n_clusters) 15 | 10 0.000 0.000 0.000 0.000 {built-in method numpy.empty} 16 | 10 0.000 0.000 0.000 0.000 coef.py:249(unravel_index_2d) 17 | 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects} 18 | 10 0.000 0.000 0.000 0.000 special.py:18(__new__) 19 | 10 0.000 0.000 0.000 0.000 coef.py:225(get_coords_from_index) -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/06_cm_optimized/py/06-many_genes.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # 
text_representation: 6 | # extension: .py 7 | # format_name: percent 8 | # format_version: '1.3' 9 | # jupytext_version: 1.11.5 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] 20 | # Clustermatch run using a larger number of genes. 21 | 22 | # %% [markdown] tags=[] 23 | # # Modules 24 | 25 | # %% tags=[] 26 | import numpy as np 27 | 28 | from ccc.coef import ccc 29 | 30 | # %% [markdown] tags=[] 31 | # # Data 32 | 33 | # %% tags=[] 34 | n_genes, n_samples = 500, 1000 35 | 36 | # %% tags=[] 37 | np.random.seed(0) 38 | 39 | # %% tags=[] 40 | data = np.random.rand(n_genes, n_samples) 41 | 42 | # %% tags=[] 43 | data.shape 44 | 45 | 46 | # %% [markdown] tags=[] 47 | # # Profile 48 | 49 | # %% tags=[] 50 | def func(): 51 | n_clust = list(range(2, 10 + 1)) 52 | return ccc(data, internal_n_clusters=n_clust) 53 | 54 | 55 | # %% tags=[] 56 | # %%timeit func() 57 | func() 58 | 59 | # %% tags=[] 60 | # %%prun -s cumulative -l 50 -T 06-cm_many_genes.txt 61 | func() 62 | 63 | # %% tags=[] 64 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/07_cm_optimized/py/06-many_genes.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # text_representation: 6 | # extension: .py 7 | # format_name: percent 8 | # format_version: '1.3' 9 | # jupytext_version: 1.11.5 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] 20 | # Clustermatch run using a larger number of genes. 
21 | 22 | # %% [markdown] tags=[] 23 | # # Modules 24 | 25 | # %% tags=[] 26 | import numpy as np 27 | 28 | from ccc.coef import ccc 29 | 30 | # %% [markdown] tags=[] 31 | # # Data 32 | 33 | # %% tags=[] 34 | n_genes, n_samples = 500, 1000 35 | 36 | # %% tags=[] 37 | np.random.seed(0) 38 | 39 | # %% tags=[] 40 | data = np.random.rand(n_genes, n_samples) 41 | 42 | # %% tags=[] 43 | data.shape 44 | 45 | 46 | # %% [markdown] tags=[] 47 | # # Profile 48 | 49 | # %% tags=[] 50 | def func(): 51 | n_clust = list(range(2, 10 + 1)) 52 | return ccc(data, internal_n_clusters=n_clust) 53 | 54 | 55 | # %% tags=[] 56 | # %%timeit func() 57 | func() 58 | 59 | # %% tags=[] 60 | # %%prun -s cumulative -l 50 -T 06-cm_many_genes.txt 61 | func() 62 | 63 | # %% tags=[] 64 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/00-n_samples_large.txt: -------------------------------------------------------------------------------- 1 | 154 function calls in 18.817 seconds 2 | 3 | Ordered by: cumulative time 4 | 5 | ncalls tottime percall cumtime percall filename:lineno(function) 6 | 1 0.000 0.000 18.817 18.817 {built-in method builtins.exec} 7 | 1 0.000 0.000 18.817 18.817 :1() 8 | 1 0.004 0.004 18.817 18.817 1556911885.py:1(func) 9 | 10 0.008 0.001 18.813 1.881 coef.py:265(_cm) 10 | 20 10.568 0.528 10.568 0.528 coef.py:169(_get_parts) 11 | 10 8.237 0.824 8.237 0.824 coef.py:198(cdist_parts) 12 | 20 0.001 0.000 0.001 0.000 {built-in method numpy.zeros} 13 | 10 0.000 0.000 0.000 0.000 {method 'argmax' of 'numpy.ndarray' objects} 14 | 20 0.000 0.000 0.000 0.000 coef.py:119(_get_range_n_clusters) 15 | 10 0.000 0.000 0.000 0.000 {built-in method numpy.empty} 16 | 10 0.000 0.000 0.000 0.000 special.py:18(__new__) 17 | 10 0.000 0.000 0.000 0.000 coef.py:248(unravel_index_2d) 18 | 20 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects} 19 | 10 0.000 0.000 0.000 0.000 coef.py:224(get_coords_from_index) 20 | 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects} -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/00-n_samples_small.txt: -------------------------------------------------------------------------------- 1 | 154 function calls in 0.034 seconds 2 | 3 | Ordered by: cumulative time 4 | 5 | ncalls tottime percall cumtime percall filename:lineno(function) 6 | 1 0.000 0.000 0.034 0.034 {built-in method builtins.exec} 7 | 1 0.000 0.000 0.034 0.034 :1() 8 | 1 0.000 0.000 0.034 0.034 1556911885.py:1(func) 9 | 10 0.001 0.000 0.034 0.003 coef.py:265(_cm) 10 | 10 0.024 0.002 0.024 0.002 coef.py:198(cdist_parts) 11 | 20 0.009 0.000 0.009 0.000 coef.py:169(_get_parts) 12 | 10 0.000 0.000 0.000 0.000 {method 'argmax' of 'numpy.ndarray' objects} 13 | 20 0.000 0.000 0.000 0.000 coef.py:119(_get_range_n_clusters) 14 | 20 0.000 0.000 0.000 0.000 {built-in method numpy.zeros} 15 | 10 0.000 0.000 0.000 0.000 {built-in method numpy.empty} 16 | 10 0.000 0.000 0.000 0.000 coef.py:248(unravel_index_2d) 17 | 10 0.000 0.000 0.000 0.000 special.py:18(__new__) 18 | 10 0.000 0.000 0.000 0.000 coef.py:224(get_coords_from_index) 19 | 20 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects} 20 | 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects} -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/01-n_samples_large.txt: 
-------------------------------------------------------------------------------- 1 | 154 function calls in 14.819 seconds 2 | 3 | Ordered by: cumulative time 4 | 5 | ncalls tottime percall cumtime percall filename:lineno(function) 6 | 1 0.000 0.000 14.819 14.819 {built-in method builtins.exec} 7 | 1 0.000 0.000 14.819 14.819 :1() 8 | 1 0.005 0.005 14.819 14.819 1556911885.py:1(func) 9 | 10 0.009 0.001 14.815 1.481 coef.py:265(_cm) 10 | 20 11.105 0.555 11.105 0.555 coef.py:169(_get_parts) 11 | 10 3.700 0.370 3.700 0.370 coef.py:198(cdist_parts) 12 | 20 0.000 0.000 0.000 0.000 {built-in method numpy.zeros} 13 | 20 0.000 0.000 0.000 0.000 coef.py:119(_get_range_n_clusters) 14 | 10 0.000 0.000 0.000 0.000 {method 'argmax' of 'numpy.ndarray' objects} 15 | 10 0.000 0.000 0.000 0.000 {built-in method numpy.empty} 16 | 10 0.000 0.000 0.000 0.000 coef.py:248(unravel_index_2d) 17 | 10 0.000 0.000 0.000 0.000 special.py:18(__new__) 18 | 20 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects} 19 | 10 0.000 0.000 0.000 0.000 coef.py:224(get_coords_from_index) 20 | 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects} -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/01-n_samples_small.txt: -------------------------------------------------------------------------------- 1 | 154 function calls in 0.032 seconds 2 | 3 | Ordered by: cumulative time 4 | 5 | ncalls tottime percall cumtime percall filename:lineno(function) 6 | 1 0.000 0.000 0.032 0.032 {built-in method builtins.exec} 7 | 1 0.000 0.000 0.032 0.032 :1() 8 | 1 0.000 0.000 0.032 0.032 1556911885.py:1(func) 9 | 10 0.001 0.000 0.032 0.003 coef.py:265(_cm) 10 | 10 0.021 0.002 0.021 0.002 coef.py:198(cdist_parts) 11 | 20 0.010 0.000 0.010 0.000 coef.py:169(_get_parts) 12 | 20 0.000 0.000 0.000 0.000 coef.py:119(_get_range_n_clusters) 13 | 10 0.000 0.000 0.000 0.000 {method 'argmax' of 'numpy.ndarray' objects} 14 | 20 0.000 0.000 0.000 0.000 {built-in method numpy.zeros} 15 | 10 0.000 0.000 0.000 0.000 {built-in method numpy.empty} 16 | 10 0.000 0.000 0.000 0.000 coef.py:248(unravel_index_2d) 17 | 10 0.000 0.000 0.000 0.000 special.py:18(__new__) 18 | 20 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects} 19 | 10 0.000 0.000 0.000 0.000 coef.py:224(get_coords_from_index) 20 | 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects} -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/02-n_samples_large.txt: -------------------------------------------------------------------------------- 1 | 154 function calls in 15.669 seconds 2 | 3 | Ordered by: cumulative time 4 | 5 | ncalls tottime percall cumtime percall filename:lineno(function) 6 | 1 0.000 0.000 15.669 15.669 {built-in method builtins.exec} 7 | 1 0.000 0.000 15.669 15.669 :1() 8 | 1 0.004 0.004 15.669 15.669 1556911885.py:1(func) 9 | 10 0.010 0.001 15.665 1.566 coef.py:265(_cm) 10 | 20 11.799 0.590 11.799 0.590 coef.py:169(_get_parts) 11 | 10 3.854 0.385 3.854 0.385 coef.py:198(cdist_parts) 12 | 20 0.000 0.000 0.000 0.000 {built-in method numpy.zeros} 13 | 20 0.000 0.000 0.000 0.000 coef.py:119(_get_range_n_clusters) 14 | 10 0.000 0.000 0.000 0.000 {method 'argmax' of 'numpy.ndarray' objects} 15 | 10 0.000 0.000 0.000 0.000 {built-in method numpy.empty} 16 | 10 0.000 0.000 0.000 0.000 special.py:18(__new__) 17 | 10 0.000 0.000 0.000 0.000 coef.py:248(unravel_index_2d) 18 | 20 
0.000 0.000 0.000 0.000 {method 'append' of 'list' objects} 19 | 10 0.000 0.000 0.000 0.000 coef.py:224(get_coords_from_index) 20 | 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects} -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/02-n_samples_small.txt: -------------------------------------------------------------------------------- 1 | 154 function calls in 0.034 seconds 2 | 3 | Ordered by: cumulative time 4 | 5 | ncalls tottime percall cumtime percall filename:lineno(function) 6 | 1 0.000 0.000 0.034 0.034 {built-in method builtins.exec} 7 | 1 0.000 0.000 0.034 0.034 :1() 8 | 1 0.000 0.000 0.034 0.034 1556911885.py:1(func) 9 | 10 0.001 0.000 0.034 0.003 coef.py:265(_cm) 10 | 10 0.024 0.002 0.024 0.002 coef.py:198(cdist_parts) 11 | 20 0.009 0.000 0.009 0.000 coef.py:169(_get_parts) 12 | 20 0.000 0.000 0.000 0.000 {built-in method numpy.zeros} 13 | 10 0.000 0.000 0.000 0.000 {method 'argmax' of 'numpy.ndarray' objects} 14 | 20 0.000 0.000 0.000 0.000 coef.py:119(_get_range_n_clusters) 15 | 10 0.000 0.000 0.000 0.000 {built-in method numpy.empty} 16 | 10 0.000 0.000 0.000 0.000 coef.py:248(unravel_index_2d) 17 | 10 0.000 0.000 0.000 0.000 special.py:18(__new__) 18 | 20 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects} 19 | 10 0.000 0.000 0.000 0.000 coef.py:224(get_coords_from_index) 20 | 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects} -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/03-n_samples_large.txt: -------------------------------------------------------------------------------- 1 | 154 function calls in 15.245 seconds 2 | 3 | Ordered by: cumulative time 4 | 5 | ncalls tottime percall cumtime percall filename:lineno(function) 6 | 1 0.000 0.000 15.245 15.245 {built-in method builtins.exec} 7 | 1 0.000 0.000 15.245 15.245 :1() 8 | 1 0.004 0.004 15.245 15.245 1556911885.py:1(func) 9 | 10 0.011 0.001 15.241 1.524 coef.py:265(_cm) 10 | 20 11.407 0.570 11.407 0.570 coef.py:169(_get_parts) 11 | 10 3.823 0.382 3.823 0.382 coef.py:198(cdist_parts) 12 | 20 0.000 0.000 0.000 0.000 {built-in method numpy.zeros} 13 | 20 0.000 0.000 0.000 0.000 coef.py:119(_get_range_n_clusters) 14 | 10 0.000 0.000 0.000 0.000 {method 'argmax' of 'numpy.ndarray' objects} 15 | 10 0.000 0.000 0.000 0.000 {built-in method numpy.empty} 16 | 10 0.000 0.000 0.000 0.000 coef.py:248(unravel_index_2d) 17 | 10 0.000 0.000 0.000 0.000 special.py:18(__new__) 18 | 20 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects} 19 | 10 0.000 0.000 0.000 0.000 coef.py:224(get_coords_from_index) 20 | 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects} -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/03-n_samples_small.txt: -------------------------------------------------------------------------------- 1 | 154 function calls in 0.032 seconds 2 | 3 | Ordered by: cumulative time 4 | 5 | ncalls tottime percall cumtime percall filename:lineno(function) 6 | 1 0.000 0.000 0.032 0.032 {built-in method builtins.exec} 7 | 1 0.000 0.000 0.032 0.032 :1() 8 | 1 0.000 0.000 0.032 0.032 1556911885.py:1(func) 9 | 10 0.001 0.000 0.032 0.003 coef.py:265(_cm) 10 | 10 0.021 0.002 0.021 0.002 coef.py:198(cdist_parts) 11 | 20 0.010 0.001 0.010 0.001 coef.py:169(_get_parts) 12 | 10 0.000 0.000 0.000 0.000 {method 
'argmax' of 'numpy.ndarray' objects} 13 | 20 0.000 0.000 0.000 0.000 coef.py:119(_get_range_n_clusters) 14 | 20 0.000 0.000 0.000 0.000 {built-in method numpy.zeros} 15 | 10 0.000 0.000 0.000 0.000 {built-in method numpy.empty} 16 | 10 0.000 0.000 0.000 0.000 coef.py:248(unravel_index_2d) 17 | 10 0.000 0.000 0.000 0.000 special.py:18(__new__) 18 | 10 0.000 0.000 0.000 0.000 coef.py:224(get_coords_from_index) 19 | 20 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects} 20 | 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects} -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/06_cm_optimized/py/04-compare_numba_ari.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # text_representation: 6 | # extension: .py 7 | # format_name: percent 8 | # format_version: '1.3' 9 | # jupytext_version: 1.11.5 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] tags=[] 20 | # This file actually does not compare different ARI implementations. The name is kept to ease comparison with the previous runs from `05_cm_optimized` 21 | 22 | # %% [markdown] tags=[] 23 | # # Modules 24 | 25 | # %% tags=[] 26 | import numpy as np 27 | 28 | from ccc import coef 29 | 30 | # %% [markdown] tags=[] 31 | # # Data 32 | 33 | # %% tags=[] 34 | n_genes, n_samples = 100, 1000 35 | 36 | # %% tags=[] 37 | np.random.seed(0) 38 | 39 | # %% tags=[] 40 | data = np.random.rand(n_genes, n_samples) 41 | 42 | # %% tags=[] 43 | data.shape 44 | 45 | 46 | # %% [markdown] tags=[] 47 | # # Improved implementation (ARI implemented in numba) 48 | 49 | # %% tags=[] 50 | def func(): 51 | n_clust = list(range(2, 10 + 1)) 52 | return coef.ccc(data, internal_n_clusters=n_clust) 53 | 54 | 55 | # %% tags=[] 56 | # %%timeit func() 57 | func() 58 | 59 | # %% tags=[] 60 | # %%prun -s cumulative -l 50 -T 04-cm_ari_numba.txt 61 | func() 62 | 63 | # %% tags=[] 64 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/07_cm_optimized/py/04-compare_numba_ari.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # text_representation: 6 | # extension: .py 7 | # format_name: percent 8 | # format_version: '1.3' 9 | # jupytext_version: 1.11.5 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] tags=[] 20 | # This file actually does not compare different ari implementations. The name is kept to ease comparison with the previous runs from `05_cm_optimized` and `06_cm_optimized`. 
21 | 22 | # %% [markdown] tags=[] 23 | # # Modules 24 | 25 | # %% tags=[] 26 | import numpy as np 27 | 28 | from ccc import coef 29 | 30 | # %% [markdown] tags=[] 31 | # # Data 32 | 33 | # %% tags=[] 34 | n_genes, n_samples = 100, 1000 35 | 36 | # %% tags=[] 37 | np.random.seed(0) 38 | 39 | # %% tags=[] 40 | data = np.random.rand(n_genes, n_samples) 41 | 42 | # %% tags=[] 43 | data.shape 44 | 45 | 46 | # %% [markdown] tags=[] 47 | # # Improved implementation (ARI implemented in numba) 48 | 49 | # %% tags=[] 50 | def func(): 51 | n_clust = list(range(2, 10 + 1)) 52 | return coef.ccc(data, internal_n_clusters=n_clust) 53 | 54 | 55 | # %% tags=[] 56 | # %%timeit func() 57 | func() 58 | 59 | # %% tags=[] 60 | # %%prun -s cumulative -l 50 -T 04-cm_ari_numba.txt 61 | func() 62 | 63 | # %% tags=[] 64 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | # Commands to publish a new package: 4 | # 5 | # rm -rf dist/ 6 | # python setup.py sdist 7 | # twine upload dist/* 8 | 9 | with open("README.md", "r") as fh: 10 | long_description = fh.read() 11 | 12 | setuptools.setup( 13 | name="ccc-coef", 14 | version="0.2.2", # remember to also update the version in libs/ccc/__init__.py 15 | author="Milton Pividori", 16 | author_email="miltondp@gmail.com", 17 | description="The Clustermatch Correlation Coefficient (CCC) is a highly-efficient, next-generation not-only-linear correlation coefficient that can work on numerical and categorical data types.", 18 | license="BSD-2-Clause Plus Patent", 19 | long_description=long_description, 20 | long_description_content_type="text/markdown", 21 | url="https://github.com/greenelab/ccc", 22 | package_dir={"": "libs"}, 23 | packages=[ 24 | "ccc/coef", 25 | "ccc/numpy", 26 | "ccc/pytorch", 27 | "ccc/scipy", 28 | "ccc/sklearn", 29 | "ccc/utils", 30 | ], 31 | python_requires=">=3.9", 32 | install_requires=[ 33 | # numpy.typing is only available in numpy>=1.21.0 34 | "numpy>=1.21.0", 35 | "scipy", 36 | "numba", 37 | ], 38 | classifiers=[ 39 | "Programming Language :: Python :: 3", 40 | "License :: OSI Approved :: BSD License", 41 | "Operating System :: OS Independent", 42 | "Development Status :: 5 - Production/Stable", 43 | "Environment :: Console", 44 | ], 45 | ) 46 | -------------------------------------------------------------------------------- /libs/ccc/settings.py: -------------------------------------------------------------------------------- 1 | """ 2 | General settings. This file is intended to be modified by the user. Each entry 3 | also provides an alternative way to specify its value using an environment 4 | variable. 5 | """ 6 | 7 | # Instead of changing this file, you can also use the environment variable name 8 | # specified for each entry (environment variables supersede these settings). 9 | 10 | # Specifies the main directory where all data and results generated are stored. 11 | # When setting up the environment for the first time, input data will be 12 | # automatically downloaded into a subfolder of ROOT_DIR. 13 | # 14 | # Default: if not specified (None), it defaults to the 'cm_gene_expr' subfolder 15 | # in the temporary directory of the operating system (i.e. '/tmp/cm_gene_expr' 16 | # in Unix systems). 17 | # Environment variable: CM_ROOT_DIR 18 | ROOT_DIR = None 19 | 20 | # Specifies the directory where the manuscript git repository was 21 | # cloned/downloaded. 
If None, manuscript figures and other related files will 22 | # not be generated. 23 | # 24 | # Default: None 25 | # Environment variable: CM_MANUSCRIPT_DIR 26 | MANUSCRIPT_DIR = None 27 | 28 | 29 | # 30 | # CPU usage 31 | # 32 | 33 | # Number of cores to use for general tasks. 34 | # 35 | # Default: half of available cores. 36 | # Environment variable: CM_N_JOBS 37 | N_JOBS = None 38 | 39 | # Number of cores to use for low-computational tasks (IO, etc). This number 40 | # can be greater than N_JOBS. 41 | # 42 | # Default: same as N_JOBS. 43 | # Environment variable: CM_N_JOBS_LOW 44 | N_JOBS_LOW = None 45 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM continuumio/miniconda3 2 | 3 | EXPOSE 8893/tcp 4 | 5 | ENV CODE_DIR=/opt/code 6 | ENV CM_CONDA_ENV_NAME="ccc" 7 | ENV CM_N_JOBS=1 8 | ENV CM_ROOT_DIR=/opt/data 9 | ENV CM_USER_HOME=${CM_ROOT_DIR}/user_home 10 | ENV CM_MANUSCRIPT_DIR=/opt/manuscript 11 | 12 | VOLUME ${CM_ROOT_DIR} 13 | VOLUME ${CM_MANUSCRIPT_DIR} 14 | 15 | # install GNU parallel 16 | RUN DEBIAN_FRONTEND=noninteractive apt-get update \ 17 | && apt-get install -y --no-install-recommends parallel \ 18 | && apt-get clean \ 19 | && rm -rf /var/lib/apt/lists/* 20 | 21 | # set up the ccc conda environment 22 | COPY environment/environment.yml environment/scripts/install_other_packages.sh environment/scripts/install_r_packages.r /tmp/ 23 | RUN conda env create --name ${CM_CONDA_ENV_NAME} --file /tmp/environment.yml \ 24 | && conda run -n ${CM_CONDA_ENV_NAME} --no-capture-output /bin/bash /tmp/install_other_packages.sh \ 25 | && conda clean --all --yes 26 | 27 | # activate the environment when starting bash 28 | RUN echo "conda activate ${CM_CONDA_ENV_NAME}" >> ~/.bashrc 29 | SHELL ["/bin/bash", "--login", "-c"] 30 | 31 | ENV PYTHONPATH=${CODE_DIR}/libs:${PYTHONPATH} 32 | 33 | RUN echo "Make sure packages can be loaded" 34 | RUN python -c "import papermill" 35 | 36 | COPY . ${CODE_DIR} 37 | WORKDIR ${CODE_DIR} 38 | 39 | RUN echo "Make sure modules can be loaded" 40 | RUN python -c "from ccc import conf" 41 | 42 | # setup user home directory 43 | RUN mkdir ${CM_USER_HOME} && chmod -R 0777 ${CM_USER_HOME} 44 | ENV HOME=${CM_USER_HOME} 45 | 46 | ENTRYPOINT ["/opt/code/entrypoint.sh"] 47 | CMD ["scripts/run_nbs_server.sh", "--container-mode"] 48 | 49 | -------------------------------------------------------------------------------- /nbs/run_nbs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | GIT_ROOT_DIR=$(git rev-parse --show-toplevel) 5 | 6 | # This script runs a Jupyter notebook (.ipynb) from the command line using 7 | # papermill. 8 | # 9 | # This script must be run within the nbs/ folder. 10 | 11 | if [ -z "${1}" ]; then 12 | echo "Specify notebook to run" 13 | exit 1 14 | fi 15 | 16 | # If the notebook is an "output notebook" (*.run.ipynb), such as those generated 17 | # by papermill, then do not run it. 
18 | pattern="*.run.ipynb" 19 | 20 | input_notebook=$1 21 | shift 22 | 23 | if [[ $input_notebook == $pattern ]]; then 24 | echo "Not running output notebook" 25 | exit 0 26 | fi 27 | 28 | override_nbs=${CM_RUN_NBS_OVERRIDE} 29 | 30 | # if the second argument is a notebook, then it is the output 31 | # notebook filename 32 | if [[ $1 == *.ipynb ]]; then 33 | output_notebook=${input_notebook%/*}/$1 34 | shift 35 | 36 | # do not override if output was specified 37 | override_nbs=0 38 | else 39 | output_notebook="${input_notebook%.*}.run.ipynb" 40 | fi 41 | 42 | # run papermill 43 | papermill \ 44 | --log-output \ 45 | --request-save-on-cell-execute \ 46 | $@ \ 47 | $input_notebook \ 48 | $output_notebook 49 | 50 | # Convert to notebook 51 | # 52 | # This is to reduce the notebook's final size, which is huge after 53 | # running it with papermill. 54 | jupyter nbconvert --to notebook ${output_notebook} --output ${output_notebook##*/} 55 | 56 | if [ "${override_nbs}" != "0" ]; then 57 | mv $output_notebook $input_notebook 58 | bash ${GIT_ROOT_DIR}/scripts/convert_ipynb_to_py.sh ${input_notebook} 59 | fi 60 | 61 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/08-n_samples_small_50.txt: -------------------------------------------------------------------------------- 1 | 6815 function calls in 0.028 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 120 to 20 due to restriction <20> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 0.028 0.028 {built-in method builtins.exec} 8 | 1 0.000 0.000 0.028 0.028 :1() 9 | 1 0.000 0.000 0.028 0.028 1517976664.py:1(func) 10 | 10 0.001 0.000 0.028 0.003 coef.py:275(cm) 11 | 10 0.001 0.000 0.020 0.002 coef.py:414(compute_coef) 12 | 10 0.000 0.000 0.019 0.002 coef.py:407(cdist_func) 13 | 10 0.002 0.000 0.019 0.002 coef.py:168(cdist_parts_parallel) 14 | 132 0.001 0.000 0.015 0.000 threading.py:280(wait) 15 | 540 0.015 0.000 0.015 0.000 {method 'acquire' of '_thread.lock' objects} 16 | 65 0.000 0.000 0.014 0.000 threading.py:556(wait) 17 | 70 0.000 0.000 0.012 0.000 _base.py:201(as_completed) 18 | 80 0.000 0.000 0.007 0.000 thread.py:155(submit) 19 | 80 0.000 0.000 0.006 0.000 thread.py:174(_adjust_thread_count) 20 | 30 0.000 0.000 0.004 0.000 threading.py:873(start) 21 | 10 0.000 0.000 0.004 0.000 coef.py:186() 22 | 10 0.000 0.000 0.004 0.000 _base.py:572(map) 23 | 10 0.000 0.000 0.004 0.000 _base.py:597() 24 | 80 0.000 0.000 0.001 0.000 _base.py:417(result) 25 | 10 0.000 0.000 0.001 0.000 _base.py:635(__exit__) 26 | 10 0.000 0.000 0.001 0.000 thread.py:210(shutdown) -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/08-n_samples_large_50000.txt: -------------------------------------------------------------------------------- 1 | 9633 function calls in 2.469 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 120 to 20 due to restriction <20> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 2.469 2.469 {built-in method builtins.exec} 8 | 1 0.000 0.000 2.469 2.469 :1() 9 | 1 0.000 0.000 2.469 2.469 1517976664.py:1(func) 10 | 10 0.003 0.000 2.469 0.247 coef.py:275(cm) 11 | 220 0.001 0.000 2.448 0.011 threading.py:280(wait) 12 | 890 2.448 0.003 2.448 0.003 {method 'acquire' of '_thread.lock' objects} 13 | 10 0.001 0.000 1.676 0.168 coef.py:414(compute_coef) 14 | 10 0.000 0.000 1.675 0.168 
coef.py:407(cdist_func) 15 | 10 0.001 0.000 1.675 0.167 coef.py:168(cdist_parts_parallel) 16 | 120 0.000 0.000 1.668 0.014 threading.py:556(wait) 17 | 100 0.001 0.000 1.667 0.017 _base.py:201(as_completed) 18 | 110 0.000 0.000 0.781 0.007 _base.py:417(result) 19 | 30 0.000 0.000 0.780 0.026 _base.py:601(result_iterator) 20 | 110 0.001 0.000 0.008 0.000 thread.py:155(submit) 21 | 110 0.000 0.000 0.007 0.000 thread.py:174(_adjust_thread_count) 22 | 10 0.002 0.000 0.006 0.001 coef.py:186() 23 | 30 0.000 0.000 0.004 0.000 threading.py:873(start) 24 | 10 0.000 0.000 0.004 0.000 _base.py:572(map) 25 | 10 0.000 0.000 0.004 0.000 _base.py:597() 26 | 10 0.000 0.000 0.002 0.000 _base.py:635(__exit__) -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/09-n_samples_large_50000.txt: -------------------------------------------------------------------------------- 1 | 9632 function calls in 2.252 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 120 to 20 due to restriction <20> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 2.252 2.252 {built-in method builtins.exec} 8 | 1 0.000 0.000 2.252 2.252 :1() 9 | 1 0.000 0.000 2.252 2.252 1517976664.py:1(func) 10 | 10 0.003 0.000 2.252 0.225 coef.py:275(cm) 11 | 890 2.231 0.003 2.231 0.003 {method 'acquire' of '_thread.lock' objects} 12 | 220 0.001 0.000 2.231 0.010 threading.py:280(wait) 13 | 10 0.001 0.000 1.547 0.155 coef.py:414(compute_coef) 14 | 10 0.000 0.000 1.546 0.155 coef.py:407(cdist_func) 15 | 10 0.001 0.000 1.546 0.155 coef.py:168(cdist_parts_parallel) 16 | 120 0.000 0.000 1.538 0.013 threading.py:556(wait) 17 | 100 0.001 0.000 1.537 0.015 _base.py:201(as_completed) 18 | 110 0.000 0.000 0.693 0.006 _base.py:417(result) 19 | 30 0.000 0.000 0.693 0.023 _base.py:601(result_iterator) 20 | 110 0.001 0.000 0.008 0.000 thread.py:155(submit) 21 | 10 0.002 0.000 0.007 0.001 coef.py:186() 22 | 110 0.001 0.000 0.007 0.000 thread.py:174(_adjust_thread_count) 23 | 30 0.000 0.000 0.004 0.000 threading.py:873(start) 24 | 10 0.000 0.000 0.003 0.000 _base.py:572(map) 25 | 10 0.000 0.000 0.003 0.000 _base.py:597() 26 | 10 0.000 0.000 0.003 0.000 _base.py:635(__exit__) -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/09-n_samples_small_1000.txt: -------------------------------------------------------------------------------- 1 | 9576 function calls in 0.069 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 120 to 20 due to restriction <20> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 0.069 0.069 {built-in method builtins.exec} 8 | 1 0.000 0.000 0.068 0.068 :1() 9 | 1 0.000 0.000 0.068 0.068 1517976664.py:1(func) 10 | 10 0.001 0.000 0.068 0.007 coef.py:275(cm) 11 | 884 0.055 0.000 0.055 0.000 {method 'acquire' of '_thread.lock' objects} 12 | 223 0.001 0.000 0.054 0.000 threading.py:280(wait) 13 | 10 0.000 0.000 0.051 0.005 coef.py:414(compute_coef) 14 | 10 0.000 0.000 0.051 0.005 coef.py:407(cdist_func) 15 | 10 0.001 0.000 0.051 0.005 coef.py:168(cdist_parts_parallel) 16 | 116 0.000 0.000 0.045 0.000 threading.py:556(wait) 17 | 100 0.001 0.000 0.042 0.000 _base.py:201(as_completed) 18 | 110 0.000 0.000 0.010 0.000 _base.py:417(result) 19 | 30 0.000 0.000 0.010 0.000 _base.py:601(result_iterator) 20 | 110 0.000 0.000 0.008 0.000 thread.py:155(submit) 21 | 110 0.000 0.000 0.007 0.000 
thread.py:174(_adjust_thread_count) 22 | 10 0.001 0.000 0.007 0.001 coef.py:186() 23 | 30 0.000 0.000 0.005 0.000 threading.py:873(start) 24 | 10 0.000 0.000 0.003 0.000 _base.py:572(map) 25 | 10 0.000 0.000 0.003 0.000 _base.py:597() 26 | 10 0.000 0.000 0.002 0.000 _base.py:635(__exit__) -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/08-n_samples_small_100.txt: -------------------------------------------------------------------------------- 1 | 9175 function calls in 0.046 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 120 to 20 due to restriction <20> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 0.046 0.046 {built-in method builtins.exec} 8 | 1 0.000 0.000 0.046 0.046 :1() 9 | 1 0.000 0.000 0.046 0.046 1517976664.py:1(func) 10 | 10 0.001 0.000 0.045 0.005 coef.py:275(cm) 11 | 10 0.001 0.000 0.037 0.004 coef.py:414(compute_coef) 12 | 10 0.000 0.000 0.036 0.004 coef.py:407(cdist_func) 13 | 10 0.002 0.000 0.036 0.004 coef.py:168(cdist_parts_parallel) 14 | 203 0.001 0.000 0.030 0.000 threading.py:280(wait) 15 | 810 0.029 0.000 0.029 0.000 {method 'acquire' of '_thread.lock' objects} 16 | 100 0.000 0.000 0.028 0.000 threading.py:556(wait) 17 | 100 0.001 0.000 0.027 0.000 _base.py:201(as_completed) 18 | 110 0.001 0.000 0.009 0.000 thread.py:155(submit) 19 | 110 0.000 0.000 0.007 0.000 thread.py:174(_adjust_thread_count) 20 | 10 0.001 0.000 0.006 0.001 coef.py:186() 21 | 30 0.000 0.000 0.005 0.000 threading.py:873(start) 22 | 10 0.000 0.000 0.004 0.000 _base.py:572(map) 23 | 10 0.000 0.000 0.004 0.000 _base.py:597() 24 | 110 0.000 0.000 0.002 0.000 _base.py:417(result) 25 | 110 0.000 0.000 0.002 0.000 threading.py:404(acquire) 26 | 30 0.000 0.000 0.001 0.000 _base.py:601(result_iterator) -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/08-n_samples_small_500.txt: -------------------------------------------------------------------------------- 1 | 9391 function calls in 0.062 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 120 to 20 due to restriction <20> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 0.062 0.062 {built-in method builtins.exec} 8 | 1 0.000 0.000 0.062 0.062 :1() 9 | 1 0.000 0.000 0.062 0.062 1517976664.py:1(func) 10 | 10 0.001 0.000 0.062 0.006 coef.py:275(cm) 11 | 10 0.001 0.000 0.048 0.005 coef.py:414(compute_coef) 12 | 10 0.000 0.000 0.047 0.005 coef.py:407(cdist_func) 13 | 10 0.002 0.000 0.047 0.005 coef.py:168(cdist_parts_parallel) 14 | 215 0.001 0.000 0.045 0.000 threading.py:280(wait) 15 | 850 0.045 0.000 0.045 0.000 {method 'acquire' of '_thread.lock' objects} 16 | 108 0.000 0.000 0.040 0.000 threading.py:556(wait) 17 | 100 0.001 0.000 0.039 0.000 _base.py:201(as_completed) 18 | 110 0.001 0.000 0.008 0.000 thread.py:155(submit) 19 | 110 0.000 0.000 0.007 0.000 thread.py:174(_adjust_thread_count) 20 | 110 0.000 0.000 0.006 0.000 _base.py:417(result) 21 | 30 0.000 0.000 0.006 0.000 _base.py:601(result_iterator) 22 | 10 0.001 0.000 0.006 0.001 coef.py:186() 23 | 30 0.000 0.000 0.004 0.000 threading.py:873(start) 24 | 10 0.000 0.000 0.004 0.000 _base.py:572(map) 25 | 10 0.000 0.000 0.004 0.000 _base.py:597() 26 | 110 0.000 0.000 0.001 0.000 threading.py:404(acquire) -------------------------------------------------------------------------------- 
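These .txt listings are cProfile tables saved by IPython's %%prun cell magic; the profiling notebooks later in this document write them with, e.g., `%%prun -s cumulative -l 20 -T <output file>`. A minimal sketch of how one such listing is produced (the data shape, the body of `func`, and the output filename below are illustrative placeholders, not the notebooks' exact code):

import numpy as np
from ccc.coef import ccc

np.random.seed(0)
data = np.random.rand(10, 50)  # n_genes x n_samples (placeholder sizes)

def func():
    # same call pattern the profiling notebooks use
    return ccc(data, internal_n_clusters=range(2, 10 + 1))

# in a Jupyter/IPython cell: profile func(), sort by cumulative time,
# keep the top 20 entries, and save the table to a text file
# %%prun -s cumulative -l 20 -T profile-output.txt
# func()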
/nbs/others/05_clustermatch_profiling/10_cm_optimized/09-n_samples_small_100.txt: -------------------------------------------------------------------------------- 1 | 9212 function calls in 0.034 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 120 to 20 due to restriction <20> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 0.034 0.034 {built-in method builtins.exec} 8 | 1 0.000 0.000 0.034 0.034 :1() 9 | 1 0.000 0.000 0.034 0.034 1517976664.py:1(func) 10 | 10 0.001 0.000 0.034 0.003 coef.py:275(cm) 11 | 10 0.000 0.000 0.028 0.003 coef.py:414(compute_coef) 12 | 10 0.000 0.000 0.027 0.003 coef.py:407(cdist_func) 13 | 10 0.003 0.000 0.027 0.003 coef.py:168(cdist_parts_parallel) 14 | 199 0.001 0.000 0.022 0.000 threading.py:280(wait) 15 | 802 0.022 0.000 0.022 0.000 {method 'acquire' of '_thread.lock' objects} 16 | 106 0.000 0.000 0.021 0.000 threading.py:556(wait) 17 | 100 0.000 0.000 0.020 0.000 _base.py:201(as_completed) 18 | 110 0.000 0.000 0.007 0.000 thread.py:155(submit) 19 | 110 0.000 0.000 0.006 0.000 thread.py:174(_adjust_thread_count) 20 | 10 0.001 0.000 0.004 0.000 coef.py:186() 21 | 30 0.000 0.000 0.004 0.000 threading.py:873(start) 22 | 10 0.000 0.000 0.003 0.000 _base.py:572(map) 23 | 10 0.000 0.000 0.003 0.000 _base.py:597() 24 | 110 0.000 0.000 0.001 0.000 _base.py:417(result) 25 | 110 0.000 0.000 0.001 0.000 threading.py:404(acquire) 26 | 30 0.000 0.000 0.001 0.000 _base.py:601(result_iterator) -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/09-n_samples_small_50.txt: -------------------------------------------------------------------------------- 1 | 6936 function calls in 0.020 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 120 to 20 due to restriction <20> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 0.020 0.020 {built-in method builtins.exec} 8 | 1 0.000 0.000 0.020 0.020 :1() 9 | 1 0.000 0.000 0.020 0.020 1517976664.py:1(func) 10 | 10 0.001 0.000 0.020 0.002 coef.py:275(cm) 11 | 10 0.000 0.000 0.014 0.001 coef.py:414(compute_coef) 12 | 10 0.000 0.000 0.013 0.001 coef.py:407(cdist_func) 13 | 10 0.002 0.000 0.013 0.001 coef.py:168(cdist_parts_parallel) 14 | 136 0.000 0.000 0.009 0.000 threading.py:280(wait) 15 | 554 0.009 0.000 0.009 0.000 {method 'acquire' of '_thread.lock' objects} 16 | 71 0.000 0.000 0.009 0.000 threading.py:556(wait) 17 | 70 0.000 0.000 0.007 0.000 _base.py:201(as_completed) 18 | 80 0.000 0.000 0.005 0.000 thread.py:155(submit) 19 | 80 0.000 0.000 0.005 0.000 thread.py:174(_adjust_thread_count) 20 | 10 0.000 0.000 0.003 0.000 coef.py:186() 21 | 30 0.000 0.000 0.003 0.000 threading.py:873(start) 22 | 10 0.000 0.000 0.003 0.000 _base.py:572(map) 23 | 10 0.000 0.000 0.003 0.000 _base.py:597() 24 | 10 0.001 0.000 0.001 0.000 parallel.py:596(get_num_threads) 25 | 10 0.000 0.000 0.001 0.000 _base.py:635(__exit__) 26 | 10 0.000 0.000 0.001 0.000 thread.py:210(shutdown) -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/09-n_samples_small_500.txt: -------------------------------------------------------------------------------- 1 | 9477 function calls in 0.044 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 120 to 20 due to restriction <20> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 0.044 0.044 {built-in 
method builtins.exec} 8 | 1 0.000 0.000 0.044 0.044 :1() 9 | 1 0.000 0.000 0.044 0.044 1517976664.py:1(func) 10 | 10 0.001 0.000 0.044 0.004 coef.py:275(cm) 11 | 10 0.000 0.000 0.034 0.003 coef.py:414(compute_coef) 12 | 10 0.000 0.000 0.033 0.003 coef.py:407(cdist_func) 13 | 10 0.002 0.000 0.033 0.003 coef.py:168(cdist_parts_parallel) 14 | 217 0.001 0.000 0.032 0.000 threading.py:280(wait) 15 | 858 0.031 0.000 0.031 0.000 {method 'acquire' of '_thread.lock' objects} 16 | 113 0.000 0.000 0.027 0.000 threading.py:556(wait) 17 | 100 0.001 0.000 0.026 0.000 _base.py:201(as_completed) 18 | 110 0.000 0.000 0.006 0.000 thread.py:155(submit) 19 | 110 0.000 0.000 0.005 0.000 _base.py:417(result) 20 | 110 0.000 0.000 0.005 0.000 thread.py:174(_adjust_thread_count) 21 | 30 0.000 0.000 0.005 0.000 _base.py:601(result_iterator) 22 | 10 0.001 0.000 0.004 0.000 coef.py:186() 23 | 30 0.000 0.000 0.003 0.000 threading.py:873(start) 24 | 10 0.000 0.000 0.002 0.000 _base.py:572(map) 25 | 10 0.000 0.000 0.002 0.000 _base.py:597() 26 | 110 0.000 0.000 0.001 0.000 threading.py:404(acquire) -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/05_cm_optimized/py/07-many_samples.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # text_representation: 6 | # extension: .py 7 | # format_name: percent 8 | # format_version: '1.3' 9 | # jupytext_version: 1.11.5 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] 20 | # Clustermatch run using a larger number of samples. 
21 | 
22 | # %% [markdown] tags=[]
23 | # # Modules
24 | 
25 | # %% tags=[]
26 | import numpy as np
27 | 
28 | from ccc.coef import ccc
29 | 
30 | # %% [markdown] tags=[]
31 | # # Data
32 | 
33 | # %% tags=[]
34 | n_genes, n_samples = 10, 30000
35 | 
36 | # %% tags=[]
37 | np.random.seed(0)
38 | 
39 | # %% tags=[]
40 | data = np.random.rand(n_genes, n_samples)
41 | 
42 | # %% tags=[]
43 | data.shape
44 | 
45 | 
46 | # %% [markdown] tags=[]
47 | # # With default `internal_n_clusters`
48 | 
49 | # %% tags=[]
50 | def func():
51 |     return ccc(data, internal_n_clusters=range(2, 10 + 1), precompute_parts=True)
52 | 
53 | 
54 | # %% tags=[]
55 | # %%timeit func()
56 | func()
57 | 
58 | # %% tags=[]
59 | # %%prun -s cumulative -l 20 -T 07-cm_many_samples-default_internal_n_clusters.txt
60 | func()
61 | 
62 | 
63 | # %% [markdown] tags=[]
64 | # # With reduced `internal_n_clusters`
65 | 
66 | # %% tags=[]
67 | def func():
68 |     return ccc(data, internal_n_clusters=range(2, 5 + 1), precompute_parts=True)
69 | 
70 | 
71 | # %% tags=[]
72 | # %%timeit func()
73 | func()
74 | 
75 | # %% tags=[]
76 | # %%prun -s cumulative -l 20 -T 07-cm_many_samples-less_internal_n_clusters.txt
77 | func()
78 | 
79 | # %% tags=[]
80 | 
--------------------------------------------------------------------------------
/nbs/others/05_clustermatch_profiling/10_cm_optimized/08-n_samples_large_100000.txt:
--------------------------------------------------------------------------------
1 | 9647 function calls in 5.917 seconds
2 | 
3 | Ordered by: cumulative time
4 | List reduced from 120 to 20 due to restriction <20>
5 | 
6 | ncalls tottime percall cumtime percall filename:lineno(function)
7 | 1 0.000 0.000 5.917 5.917 {built-in method builtins.exec}
8 | 1 0.000 0.000 5.917 5.917 <string>:1(<module>)
9 | 1 0.000 0.000 5.917 5.917 1517976664.py:1(func)
10 | 10 0.005 0.001 5.917 0.592 coef.py:275(cm)
11 | 222 0.001 0.000 5.890 0.027 threading.py:280(wait)
12 | 894 5.889 0.007 5.889 0.007 {method 'acquire' of '_thread.lock' objects}
13 | 10 0.001 0.000 4.013 0.401 coef.py:414(compute_coef)
14 | 10 0.000 0.000 4.011 0.401 coef.py:407(cdist_func)
15 | 10 0.002 0.000 4.011 0.401 coef.py:168(cdist_parts_parallel)
16 | 120 0.000 0.000 4.002 0.033 threading.py:556(wait)
17 | 100 0.001 0.000 4.001 0.040 _base.py:201(as_completed)
18 | 110 0.000 0.000 1.888 0.017 _base.py:417(result)
19 | 30 0.000 0.000 1.888 0.063 _base.py:601(result_iterator)
20 | 110 0.001 0.000 0.010 0.000 thread.py:155(submit)
21 | 110 0.001 0.000 0.008 0.000 thread.py:174(_adjust_thread_count)
22 | 10 0.003 0.000 0.008 0.001 coef.py:186()
23 | 30 0.000 0.000 0.005 0.000 threading.py:873(start)
24 | 10 0.000 0.000 0.005 0.000 _base.py:572(map)
25 | 10 0.000 0.000 0.005 0.000 _base.py:597()
26 | 50 0.002 0.000 0.002 0.000 {built-in method numpy.zeros}
--------------------------------------------------------------------------------
/nbs/others/05_clustermatch_profiling/10_cm_optimized/09-n_samples_large_100000.txt:
--------------------------------------------------------------------------------
1 | 9634 function calls in 4.652 seconds
2 | 
3 | Ordered by: cumulative time
4 | List reduced from 120 to 20 due to restriction <20>
5 | 
6 | ncalls tottime percall cumtime percall filename:lineno(function)
7 | 1 0.000 0.000 4.652 4.652 {built-in method builtins.exec}
8 | 1 0.000 0.000 4.652 4.652 <string>:1(<module>)
9 | 1 0.000 0.000 4.652 4.652 1517976664.py:1(func)
10 | 10 0.005 0.001 4.652 0.465 coef.py:275(cm)
11 | 221 0.001 0.000 4.626 0.021 threading.py:280(wait)
12 | 892 4.626 0.005 4.626 0.005 {method 'acquire' of
'_thread.lock' objects} 13 | 10 0.001 0.000 3.121 0.312 coef.py:414(compute_coef) 14 | 10 0.000 0.000 3.120 0.312 coef.py:407(cdist_func) 15 | 10 0.001 0.000 3.119 0.312 coef.py:168(cdist_parts_parallel) 16 | 120 0.000 0.000 3.110 0.026 threading.py:556(wait) 17 | 100 0.001 0.000 3.110 0.031 _base.py:201(as_completed) 18 | 110 0.000 0.000 1.516 0.014 _base.py:417(result) 19 | 30 0.000 0.000 1.516 0.051 _base.py:601(result_iterator) 20 | 10 0.004 0.000 0.008 0.001 coef.py:186() 21 | 110 0.001 0.000 0.008 0.000 thread.py:155(submit) 22 | 110 0.000 0.000 0.006 0.000 thread.py:174(_adjust_thread_count) 23 | 30 0.000 0.000 0.004 0.000 threading.py:873(start) 24 | 10 0.000 0.000 0.004 0.000 _base.py:572(map) 25 | 10 0.000 0.000 0.004 0.000 _base.py:597() 26 | 50 0.003 0.000 0.003 0.000 {built-in method numpy.zeros} -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/08-n_samples_small_1000.txt: -------------------------------------------------------------------------------- 1 | 9577 function calls in 0.083 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 120 to 20 due to restriction <20> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 0.083 0.083 {built-in method builtins.exec} 8 | 1 0.000 0.000 0.083 0.083 :1() 9 | 1 0.000 0.000 0.083 0.083 1517976664.py:1(func) 10 | 10 0.001 0.000 0.083 0.008 coef.py:275(cm) 11 | 223 0.001 0.000 0.066 0.000 threading.py:280(wait) 12 | 882 0.065 0.000 0.065 0.000 {method 'acquire' of '_thread.lock' objects} 13 | 10 0.001 0.000 0.064 0.006 coef.py:414(compute_coef) 14 | 10 0.000 0.000 0.063 0.006 coef.py:407(cdist_func) 15 | 10 0.002 0.000 0.062 0.006 coef.py:168(cdist_parts_parallel) 16 | 115 0.000 0.000 0.055 0.000 threading.py:556(wait) 17 | 100 0.001 0.000 0.054 0.001 _base.py:201(as_completed) 18 | 110 0.000 0.000 0.012 0.000 _base.py:417(result) 19 | 30 0.000 0.000 0.011 0.000 _base.py:601(result_iterator) 20 | 110 0.001 0.000 0.008 0.000 thread.py:155(submit) 21 | 110 0.000 0.000 0.007 0.000 thread.py:174(_adjust_thread_count) 22 | 10 0.002 0.000 0.006 0.001 coef.py:186() 23 | 30 0.000 0.000 0.005 0.000 threading.py:873(start) 24 | 10 0.000 0.000 0.004 0.000 _base.py:572(map) 25 | 10 0.000 0.000 0.004 0.000 _base.py:597() 26 | 185 0.001 0.000 0.002 0.000 _base.py:179(_yield_finished_futures) -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/10-n_samples_small_50.txt: -------------------------------------------------------------------------------- 1 | 7071 function calls (7061 primitive calls) in 0.123 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 131 to 20 due to restriction <20> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 0.123 0.123 {built-in method builtins.exec} 8 | 1 0.000 0.000 0.123 0.123 :1() 9 | 1 0.000 0.000 0.123 0.123 1517976664.py:1(func) 10 | 10 0.001 0.000 0.123 0.012 coef.py:275(cm) 11 | 10 0.000 0.000 0.103 0.010 coef.py:414(compute_coef) 12 | 10 0.000 0.000 0.102 0.010 coef.py:407(cdist_func) 13 | 10 0.015 0.002 0.102 0.010 coef.py:168(cdist_parts_parallel) 14 | 131 0.000 0.000 0.099 0.001 threading.py:280(wait) 15 | 534 0.099 0.000 0.099 0.000 {method 'acquire' of '_thread.lock' objects} 16 | 72 0.000 0.000 0.095 0.001 threading.py:556(wait) 17 | 70 0.000 0.000 0.074 0.001 _base.py:201(as_completed) 18 | 80 0.000 0.000 0.025 0.000 
thread.py:155(submit) 19 | 80 0.000 0.000 0.024 0.000 thread.py:174(_adjust_thread_count) 20 | 30 0.000 0.000 0.023 0.001 threading.py:873(start) 21 | 10 0.000 0.000 0.013 0.001 coef.py:186() 22 | 10 0.000 0.000 0.013 0.001 _base.py:572(map) 23 | 10 0.000 0.000 0.013 0.001 _base.py:597() 24 | 80 0.000 0.000 0.004 0.000 _base.py:417(result) 25 | 30 0.000 0.000 0.004 0.000 _base.py:601(result_iterator) 26 | 10 0.000 0.000 0.001 0.000 _base.py:635(__exit__) -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/10-n_samples_large_50000.txt: -------------------------------------------------------------------------------- 1 | 9867 function calls (9857 primitive calls) in 2.227 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 131 to 20 due to restriction <20> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 2.227 2.227 {built-in method builtins.exec} 8 | 1 0.000 0.000 2.227 2.227 :1() 9 | 1 0.001 0.001 2.227 2.227 1517976664.py:1(func) 10 | 10 0.003 0.000 2.227 0.223 coef.py:275(cm) 11 | 225 0.001 0.000 2.204 0.010 threading.py:280(wait) 12 | 900 2.203 0.002 2.203 0.002 {method 'acquire' of '_thread.lock' objects} 13 | 10 0.000 0.000 1.502 0.150 coef.py:414(compute_coef) 14 | 10 0.000 0.000 1.501 0.150 coef.py:407(cdist_func) 15 | 10 0.003 0.000 1.501 0.150 coef.py:168(cdist_parts_parallel) 16 | 120 0.000 0.000 1.491 0.012 threading.py:556(wait) 17 | 100 0.001 0.000 1.490 0.015 _base.py:201(as_completed) 18 | 110 0.000 0.000 0.713 0.006 _base.py:417(result) 19 | 30 0.000 0.000 0.713 0.024 _base.py:601(result_iterator) 20 | 110 0.001 0.000 0.009 0.000 thread.py:155(submit) 21 | 10 0.003 0.000 0.008 0.001 coef.py:186() 22 | 110 0.000 0.000 0.007 0.000 thread.py:174(_adjust_thread_count) 23 | 30 0.000 0.000 0.005 0.000 threading.py:873(start) 24 | 10 0.000 0.000 0.004 0.000 _base.py:572(map) 25 | 10 0.000 0.000 0.004 0.000 _base.py:597() 26 | 10 0.000 0.000 0.002 0.000 _base.py:635(__exit__) -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/10-n_samples_small_100.txt: -------------------------------------------------------------------------------- 1 | 9146 function calls (9136 primitive calls) in 0.359 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 131 to 20 due to restriction <20> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 0.359 0.359 {built-in method builtins.exec} 8 | 1 0.000 0.000 0.359 0.359 :1() 9 | 1 0.000 0.000 0.359 0.359 1517976664.py:1(func) 10 | 10 0.001 0.000 0.359 0.036 coef.py:275(cm) 11 | 10 0.000 0.000 0.331 0.033 coef.py:414(compute_coef) 12 | 10 0.000 0.000 0.330 0.033 coef.py:407(cdist_func) 13 | 10 0.050 0.005 0.330 0.033 coef.py:168(cdist_parts_parallel) 14 | 744 0.299 0.000 0.299 0.000 {method 'acquire' of '_thread.lock' objects} 15 | 184 0.001 0.000 0.298 0.002 threading.py:280(wait) 16 | 98 0.000 0.000 0.284 0.003 threading.py:556(wait) 17 | 100 0.001 0.000 0.270 0.003 _base.py:201(as_completed) 18 | 110 0.000 0.000 0.020 0.000 thread.py:155(submit) 19 | 110 0.000 0.000 0.019 0.000 thread.py:174(_adjust_thread_count) 20 | 30 0.000 0.000 0.017 0.001 threading.py:873(start) 21 | 110 0.000 0.000 0.014 0.000 _base.py:417(result) 22 | 30 0.000 0.000 0.014 0.000 _base.py:601(result_iterator) 23 | 10 0.000 0.000 0.010 0.001 _base.py:572(map) 24 | 10 0.000 0.000 0.010 0.001 _base.py:597() 25 | 10 
0.000 0.000 0.010 0.001 coef.py:186() 26 | 10 0.000 0.000 0.002 0.000 _base.py:635(__exit__) -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/10-n_samples_small_1000.txt: -------------------------------------------------------------------------------- 1 | 9875 function calls (9865 primitive calls) in 0.867 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 131 to 20 due to restriction <20> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 0.867 0.867 {built-in method builtins.exec} 8 | 1 0.000 0.000 0.867 0.867 :1() 9 | 1 0.000 0.000 0.867 0.867 1517976664.py:1(func) 10 | 10 0.001 0.000 0.866 0.087 coef.py:275(cm) 11 | 226 0.001 0.000 0.845 0.004 threading.py:280(wait) 12 | 902 0.845 0.001 0.845 0.001 {method 'acquire' of '_thread.lock' objects} 13 | 10 0.000 0.000 0.831 0.083 coef.py:414(compute_coef) 14 | 10 0.000 0.000 0.830 0.083 coef.py:407(cdist_func) 15 | 10 0.007 0.001 0.830 0.083 coef.py:168(cdist_parts_parallel) 16 | 120 0.000 0.000 0.818 0.007 threading.py:556(wait) 17 | 100 0.001 0.000 0.815 0.008 _base.py:201(as_completed) 18 | 110 0.000 0.000 0.028 0.000 _base.py:417(result) 19 | 30 0.000 0.000 0.027 0.001 _base.py:601(result_iterator) 20 | 110 0.001 0.000 0.009 0.000 thread.py:155(submit) 21 | 110 0.001 0.000 0.008 0.000 thread.py:174(_adjust_thread_count) 22 | 10 0.002 0.000 0.007 0.001 coef.py:186() 23 | 30 0.000 0.000 0.006 0.000 threading.py:873(start) 24 | 10 0.000 0.000 0.004 0.000 _base.py:572(map) 25 | 10 0.000 0.000 0.004 0.000 _base.py:597() 26 | 10 0.000 0.000 0.002 0.000 _base.py:635(__exit__) -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/10-n_samples_small_500.txt: -------------------------------------------------------------------------------- 1 | 9462 function calls (9452 primitive calls) in 0.392 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 131 to 20 due to restriction <20> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 0.392 0.392 {built-in method builtins.exec} 8 | 1 0.000 0.000 0.392 0.392 :1() 9 | 1 0.000 0.000 0.392 0.392 1517976664.py:1(func) 10 | 10 0.001 0.000 0.392 0.039 coef.py:275(cm) 11 | 10 0.000 0.000 0.363 0.036 coef.py:414(compute_coef) 12 | 10 0.000 0.000 0.362 0.036 coef.py:407(cdist_func) 13 | 10 0.043 0.004 0.362 0.036 coef.py:168(cdist_parts_parallel) 14 | 201 0.001 0.000 0.338 0.002 threading.py:280(wait) 15 | 810 0.338 0.000 0.338 0.000 {method 'acquire' of '_thread.lock' objects} 16 | 108 0.000 0.000 0.317 0.003 threading.py:556(wait) 17 | 100 0.001 0.000 0.312 0.003 _base.py:201(as_completed) 18 | 110 0.000 0.000 0.021 0.000 _base.py:417(result) 19 | 30 0.000 0.000 0.021 0.001 _base.py:601(result_iterator) 20 | 110 0.000 0.000 0.011 0.000 thread.py:155(submit) 21 | 110 0.000 0.000 0.010 0.000 thread.py:174(_adjust_thread_count) 22 | 30 0.000 0.000 0.008 0.000 threading.py:873(start) 23 | 10 0.001 0.000 0.007 0.001 coef.py:186() 24 | 10 0.000 0.000 0.004 0.000 _base.py:572(map) 25 | 10 0.000 0.000 0.004 0.000 _base.py:597() 26 | 10 0.000 0.000 0.002 0.000 _base.py:635(__exit__) -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/06_cm_optimized/py/07-many_samples.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # 
jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # text_representation: 6 | # extension: .py 7 | # format_name: percent 8 | # format_version: '1.3' 9 | # jupytext_version: 1.11.5 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] 20 | # Clustermatch run using a larger number of samples. 21 | 22 | # %% [markdown] tags=[] 23 | # # Modules 24 | 25 | # %% tags=[] 26 | import numpy as np 27 | 28 | from ccc.coef import ccc 29 | 30 | # %% [markdown] tags=[] 31 | # # Data 32 | 33 | # %% tags=[] 34 | n_genes, n_samples = 10, 30000 35 | 36 | # %% tags=[] 37 | np.random.seed(0) 38 | 39 | # %% tags=[] 40 | data = np.random.rand(n_genes, n_samples) 41 | 42 | # %% tags=[] 43 | data.shape 44 | 45 | 46 | # %% [markdown] tags=[] 47 | # # With default `internal_n_clusters` 48 | 49 | # %% tags=[] 50 | def func(): 51 | n_clust = list(range(2, 10 + 1)) 52 | return ccc(data, internal_n_clusters=n_clust) 53 | 54 | 55 | # %% tags=[] 56 | # %%timeit func() 57 | func() 58 | 59 | # %% tags=[] 60 | # %%prun -s cumulative -l 50 -T 07-cm_many_samples-default_internal_n_clusters.txt 61 | func() 62 | 63 | 64 | # %% [markdown] tags=[] 65 | # # With reduced `internal_n_clusters` 66 | 67 | # %% tags=[] 68 | def func(): 69 | n_clust = list(range(2, 5 + 1)) 70 | return ccc(data, internal_n_clusters=n_clust) 71 | 72 | 73 | # %% tags=[] 74 | # %%timeit func() 75 | func() 76 | 77 | # %% tags=[] 78 | # %%prun -s cumulative -l 50 -T 07-cm_many_samples-less_internal_n_clusters.txt 79 | func() 80 | 81 | # %% tags=[] 82 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/07_cm_optimized/py/07-many_samples.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # text_representation: 6 | # extension: .py 7 | # format_name: percent 8 | # format_version: '1.3' 9 | # jupytext_version: 1.11.5 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] 20 | # Clustermatch run using a larger number of samples. 
21 | 22 | # %% [markdown] tags=[] 23 | # # Modules 24 | 25 | # %% tags=[] 26 | import numpy as np 27 | 28 | from ccc.coef import ccc 29 | 30 | # %% [markdown] tags=[] 31 | # # Data 32 | 33 | # %% tags=[] 34 | n_genes, n_samples = 10, 30000 35 | 36 | # %% tags=[] 37 | np.random.seed(0) 38 | 39 | # %% tags=[] 40 | data = np.random.rand(n_genes, n_samples) 41 | 42 | # %% tags=[] 43 | data.shape 44 | 45 | 46 | # %% [markdown] tags=[] 47 | # # With default `internal_n_clusters` 48 | 49 | # %% tags=[] 50 | def func(): 51 | n_clust = list(range(2, 10 + 1)) 52 | return ccc(data, internal_n_clusters=n_clust) 53 | 54 | 55 | # %% tags=[] 56 | # %%timeit func() 57 | func() 58 | 59 | # %% tags=[] 60 | # %%prun -s cumulative -l 50 -T 07-cm_many_samples-default_internal_n_clusters.txt 61 | func() 62 | 63 | 64 | # %% [markdown] tags=[] 65 | # # With reduced `internal_n_clusters` 66 | 67 | # %% tags=[] 68 | def func(): 69 | n_clust = list(range(2, 5 + 1)) 70 | return ccc(data, internal_n_clusters=n_clust) 71 | 72 | 73 | # %% tags=[] 74 | # %%timeit func() 75 | func() 76 | 77 | # %% tags=[] 78 | # %%prun -s cumulative -l 50 -T 07-cm_many_samples-less_internal_n_clusters.txt 79 | func() 80 | 81 | # %% tags=[] 82 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/10-n_samples_large_100000.txt: -------------------------------------------------------------------------------- 1 | 9838 function calls (9828 primitive calls) in 4.263 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 131 to 20 due to restriction <20> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 4.263 4.263 {built-in method builtins.exec} 8 | 1 0.000 0.000 4.263 4.263 :1() 9 | 1 0.000 0.000 4.263 4.263 1517976664.py:1(func) 10 | 10 0.005 0.001 4.263 0.426 coef.py:275(cm) 11 | 221 0.001 0.000 4.234 0.019 threading.py:280(wait) 12 | 892 4.234 0.005 4.234 0.005 {method 'acquire' of '_thread.lock' objects} 13 | 10 0.000 0.000 2.689 0.269 coef.py:414(compute_coef) 14 | 10 0.000 0.000 2.688 0.269 coef.py:407(cdist_func) 15 | 10 0.003 0.000 2.688 0.269 coef.py:168(cdist_parts_parallel) 16 | 120 0.000 0.000 2.676 0.022 threading.py:556(wait) 17 | 100 0.001 0.000 2.675 0.027 _base.py:201(as_completed) 18 | 110 0.000 0.000 1.559 0.014 _base.py:417(result) 19 | 30 0.000 0.000 1.559 0.052 _base.py:601(result_iterator) 20 | 10 0.004 0.000 0.009 0.001 coef.py:186() 21 | 110 0.001 0.000 0.009 0.000 thread.py:155(submit) 22 | 110 0.001 0.000 0.007 0.000 thread.py:174(_adjust_thread_count) 23 | 30 0.000 0.000 0.005 0.000 threading.py:873(start) 24 | 10 0.000 0.000 0.004 0.000 _base.py:572(map) 25 | 10 0.000 0.000 0.004 0.000 _base.py:597() 26 | 50 0.003 0.000 0.003 0.000 {built-in method numpy.zeros} -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/11_cm_optimized/py/06-many_genes.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # text_representation: 6 | # extension: .py 7 | # format_name: percent 8 | # format_version: '1.3' 9 | # jupytext_version: 1.11.5 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] tags=[] 20 | # Clustermatch run using a larger number of 
genes. 21 | 22 | # %% [markdown] tags=[] 23 | # # Remove pycache dir 24 | 25 | # %% tags=[] 26 | # !echo ${CODE_DIR} 27 | 28 | # %% tags=[] 29 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -print 30 | 31 | # %% tags=[] 32 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -exec rm -rf {} \; 33 | 34 | # %% tags=[] 35 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -print 36 | 37 | # %% [markdown] tags=[] 38 | # # Modules 39 | 40 | # %% tags=[] 41 | import numpy as np 42 | 43 | from ccc.coef import ccc 44 | 45 | # %% tags=[] 46 | # let numba compile all the code before profiling 47 | ccc(np.random.rand(10), np.random.rand(10)) 48 | 49 | # %% [markdown] tags=[] 50 | # # Data 51 | 52 | # %% tags=[] 53 | n_genes, n_samples = 500, 1000 54 | 55 | # %% tags=[] 56 | np.random.seed(0) 57 | 58 | # %% tags=[] 59 | data = np.random.rand(n_genes, n_samples) 60 | 61 | # %% tags=[] 62 | data.shape 63 | 64 | 65 | # %% [markdown] tags=[] 66 | # # Profile 67 | 68 | # %% tags=[] 69 | def func(): 70 | n_clust = list(range(2, 10 + 1)) 71 | return ccc(data, internal_n_clusters=n_clust) 72 | 73 | 74 | # %% tags=[] 75 | # %%timeit func() 76 | func() 77 | 78 | # %% tags=[] 79 | # %%prun -s cumulative -l 50 -T 06-cm_many_genes.txt 80 | func() 81 | 82 | # %% tags=[] 83 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/12_cm_optimized/py/06-many_genes.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # notebook_metadata_filter: -jupytext.text_representation.jupytext_version 6 | # text_representation: 7 | # extension: .py 8 | # format_name: percent 9 | # format_version: '1.3' 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] tags=[] 20 | # Clustermatch run using a larger number of genes. 
21 | 22 | # %% [markdown] tags=[] 23 | # # Remove pycache dir 24 | 25 | # %% tags=[] 26 | # !echo ${CODE_DIR} 27 | 28 | # %% tags=[] 29 | # !find ${CODE_DIR}/libs -regex '^.*\(__pycache__\)$' -print 30 | 31 | # %% tags=[] 32 | # !find ${CODE_DIR}/libs -regex '^.*\(__pycache__\)$' -exec rm -rf {} \; 33 | 34 | # %% tags=[] 35 | # !find ${CODE_DIR}/libs -regex '^.*\(__pycache__\)$' -print 36 | 37 | # %% [markdown] tags=[] 38 | # # Modules 39 | 40 | # %% tags=[] 41 | import numpy as np 42 | 43 | from ccc.coef import ccc 44 | 45 | # %% tags=[] 46 | # let numba compile all the code before profiling 47 | ccc(np.random.rand(10), np.random.rand(10)) 48 | 49 | # %% [markdown] tags=[] 50 | # # Data 51 | 52 | # %% tags=[] 53 | n_genes, n_samples = 500, 1000 54 | 55 | # %% tags=[] 56 | np.random.seed(0) 57 | 58 | # %% tags=[] 59 | data = np.random.rand(n_genes, n_samples) 60 | 61 | # %% tags=[] 62 | data.shape 63 | 64 | 65 | # %% [markdown] tags=[] 66 | # # Profile 67 | 68 | # %% tags=[] 69 | def func(): 70 | n_clust = list(range(2, 10 + 1)) 71 | return ccc(data, internal_n_clusters=n_clust) 72 | 73 | 74 | # %% tags=[] 75 | # %%timeit func() 76 | func() 77 | 78 | # %% tags=[] 79 | # %%prun -s cumulative -l 50 -T 06-cm_many_genes.txt 80 | func() 81 | 82 | # %% tags=[] 83 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/06-n_samples_large.txt: -------------------------------------------------------------------------------- 1 | 21654 function calls in 7.834 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 64 to 20 due to restriction <20> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 7.834 7.834 {built-in method builtins.exec} 8 | 1 0.000 0.000 7.834 7.834 :1() 9 | 1 0.000 0.000 7.834 7.834 691993785.py:1(func) 10 | 10 0.016 0.002 7.834 0.783 coef.py:251(_cm) 11 | 20 0.015 0.001 3.914 0.196 coef.py:154(_get_parts) 12 | 10 3.902 0.390 3.902 0.390 coef.py:183(cdist_parts) 13 | 180 0.134 0.001 3.602 0.020 coef.py:63(run_quantile_clustering) 14 | 360 3.045 0.008 3.045 0.008 {method 'argsort' of 'numpy.ndarray' objects} 15 | 180 0.300 0.002 1.943 0.011 stats.py:8631(rankdata) 16 | 1620 0.019 0.000 1.858 0.001 {built-in method numpy.core._multiarray_umath.implement_array_function} 17 | 540 0.001 0.000 1.543 0.003 fromnumeric.py:51(_wrapfunc) 18 | 180 0.000 0.000 1.528 0.008 <__array_function__ internals>:2(argsort) 19 | 180 0.000 0.000 1.528 0.008 fromnumeric.py:1006(argsort) 20 | 20 0.001 0.000 0.296 0.015 coef.py:177() 21 | 180 0.000 0.000 0.295 0.002 <__array_function__ internals>:2(unique) 22 | 180 0.000 0.000 0.295 0.002 arraysetops.py:138(unique) 23 | 180 0.007 0.000 0.294 0.002 arraysetops.py:320(_unique1d) 24 | 180 0.284 0.002 0.284 0.002 {method 'sort' of 'numpy.ndarray' objects} 25 | 180 0.052 0.000 0.052 0.000 {method 'cumsum' of 'numpy.ndarray' objects} 26 | 360 0.005 0.000 0.031 0.000 index_tricks.py:323(__getitem__) -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/05_cm_optimized/04-cm_ari_numba.txt: -------------------------------------------------------------------------------- 1 | 592106 function calls in 53.048 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 70 to 20 due to restriction <20> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 53.048 53.048 {built-in method builtins.exec} 8 | 1 0.000 0.000 53.048 53.048 :1() 
9 | 1 0.000 0.000 53.048 53.048 4139949497.py:1(func) 10 | 1 0.024 0.024 53.048 53.048 coef.py:163(cm) 11 | 4950 0.020 0.000 52.745 0.011 distance.py:2616(cdist) 12 | 4950 0.954 0.000 52.720 0.011 distance.py:2606(_cdist_callable) 13 | 400950 51.761 0.000 51.761 0.000 metrics.py:46(adjusted_rand_index) 14 | 100 0.003 0.000 0.239 0.002 coef.py:113(_get_parts) 15 | 900 0.021 0.000 0.199 0.000 coef.py:29(run_quantile_clustering) 16 | 13950 0.032 0.000 0.158 0.000 {built-in method numpy.core._multiarray_umath.implement_array_function} 17 | 900 0.026 0.000 0.125 0.000 stats.py:8631(rankdata) 18 | 3600 0.003 0.000 0.086 0.000 fromnumeric.py:51(_wrapfunc) 19 | 1800 0.002 0.000 0.083 0.000 <__array_function__ internals>:2(argsort) 20 | 1800 0.001 0.000 0.080 0.000 fromnumeric.py:1006(argsort) 21 | 1800 0.077 0.000 0.077 0.000 {method 'argsort' of 'numpy.ndarray' objects} 22 | 1800 0.015 0.000 0.039 0.000 index_tricks.py:323(__getitem__) 23 | 900 0.001 0.000 0.037 0.000 <__array_function__ internals>:2(unique) 24 | 900 0.001 0.000 0.035 0.000 arraysetops.py:138(unique) 25 | 900 0.012 0.000 0.033 0.000 arraysetops.py:320(_unique1d) 26 | 4950 0.005 0.000 0.029 0.000 <__array_function__ internals>:2(unravel_index) -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/05_cm_optimized/04-cm_ari_sklearn.txt: -------------------------------------------------------------------------------- 1 | 365055656 function calls (362649956 primitive calls) in 431.008 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 209 to 20 due to restriction <20> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 431.008 431.008 {built-in method builtins.exec} 8 | 1 0.000 0.000 431.008 431.008 :1() 9 | 1 0.000 0.000 431.008 431.008 4139949497.py:1(func) 10 | 1 0.028 0.028 431.008 431.008 coef.py:163(cm) 11 | 4950 0.022 0.000 430.705 0.087 distance.py:2616(cdist) 12 | 4950 0.881 0.000 430.677 0.087 distance.py:2606(_cdist_callable) 13 | 400950 5.395 0.000 429.789 0.001 _supervised.py:302(adjusted_rand_score) 14 | 400950 6.795 0.000 424.394 0.001 _supervised.py:154(pair_confusion_matrix) 15 | 1202850 2.658 0.000 201.502 0.000 validation.py:59(inner_f) 16 | 400950 3.032 0.000 161.605 0.000 _supervised.py:87(contingency_matrix) 17 | 15651000/14047200 17.349 0.000 126.910 0.000 {built-in method numpy.core._multiarray_umath.implement_array_function} 18 | 400950 1.945 0.000 121.473 0.000 _supervised.py:32(check_clusterings) 19 | 2005650 1.842 0.000 91.816 0.000 <__array_function__ internals>:2(unique) 20 | 801900 3.315 0.000 88.057 0.000 compressed.py:588(sum) 21 | 2005650 3.214 0.000 87.180 0.000 arraysetops.py:138(unique) 22 | 2005650 30.384 0.000 81.455 0.000 arraysetops.py:320(_unique1d) 23 | 1202850 7.100 0.000 78.486 0.000 compressed.py:27(__init__) 24 | 400950 5.167 0.000 71.729 0.000 coo.py:372(tocsr) 25 | 801900 9.618 0.000 68.168 0.000 multiclass.py:186(type_of_target) 26 | 400950 2.239 0.000 61.496 0.000 base.py:968(sum) -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/05_cm_optimized/py/05-compare_precomputing_of_parts.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # text_representation: 6 | # extension: .py 7 | # format_name: percent 8 | # format_version: '1.3' 9 | # jupytext_version: 1.11.5 10 
| # kernelspec:
11 | # display_name: Python 3 (ipykernel)
12 | # language: python
13 | # name: python3
14 | # ---
15 | 
16 | # %% [markdown] tags=[]
17 | # # Description
18 | 
19 | # %% [markdown]
20 | # Compares two ccc implementations: one that precomputes the internal clusterings (partitions), and the original one, which does not precompute them.
21 | 
22 | # %% [markdown]
23 | # # Modules
24 | 
25 | # %% tags=[]
26 | import numpy as np
27 | 
28 | from ccc.coef import ccc
29 | 
30 | # %% [markdown]
31 | # # Data
32 | 
33 | # %% tags=[]
34 | n_genes, n_samples = 100, 1000
35 | 
36 | # %% tags=[]
37 | np.random.seed(0)
38 | 
39 | # %% tags=[]
40 | data = np.random.rand(n_genes, n_samples)
41 | 
42 | # %% tags=[]
43 | data.shape
44 | 
45 | 
46 | # %% [markdown] tags=[]
47 | # # Improved implementation (`precompute_parts=True`)
48 | 
49 | # %% tags=[]
50 | def func():
51 |     return ccc(data, internal_n_clusters=range(2, 10 + 1), precompute_parts=True)
52 | 
53 | 
54 | # %% tags=[]
55 | # %%timeit func()
56 | func()
57 | 
58 | # %% tags=[]
59 | # %%prun -s cumulative -l 20 -T 05-cm_precompute_parts_true.txt
60 | func()
61 | 
62 | 
63 | # %% [markdown] tags=[]
64 | # # Original implementation (`precompute_parts=False`)
65 | 
66 | # %% tags=[]
67 | def func():
68 |     return ccc(data, internal_n_clusters=range(2, 10 + 1), precompute_parts=False)
69 | 
70 | 
71 | # %% tags=[]
72 | # %%timeit func()
73 | func()
74 | 
75 | # %% tags=[]
76 | # %%prun -s cumulative -l 20 -T 05-cm_precompute_parts_false.txt
77 | func()
78 | 
79 | # %% tags=[]
80 | 
--------------------------------------------------------------------------------
/nbs/others/05_clustermatch_profiling/10_cm_optimized/05-n_samples_small.txt:
--------------------------------------------------------------------------------
1 | 5094 function calls in 0.047 seconds
2 | 
3 | Ordered by: cumulative time
4 | List reduced from 36 to 20 due to restriction <20>
5 | 
6 | ncalls tottime percall cumtime percall filename:lineno(function)
7 | 1 0.000 0.000 0.047 0.047 {built-in method builtins.exec}
8 | 1 0.000 0.000 0.047 0.047 <string>:1(<module>)
9 | 1 0.000 0.000 0.047 0.047 1556911885.py:1(func)
10 | 10 0.001 0.000 0.047 0.005 coef.py:266(_cm)
11 | 20 0.001 0.000 0.026 0.001 coef.py:170(_get_parts)
12 | 10 0.020 0.002 0.020 0.002 coef.py:199(cdist_parts)
13 | 180 0.007 0.000 0.020 0.000 coef.py:81(run_quantile_clustering)
14 | 180 0.009 0.000 0.009 0.000 coef.py:32(rank)
15 | 360 0.001 0.000 0.006 0.000 {built-in method numpy.core._multiarray_umath.implement_array_function}
16 | 20 0.000 0.000 0.005 0.000 coef.py:193()
17 | 180 0.000 0.000 0.004 0.000 <__array_function__ internals>:2(unique)
18 | 180 0.000 0.000 0.004 0.000 arraysetops.py:138(unique)
19 | 180 0.002 0.000 0.003 0.000 arraysetops.py:320(_unique1d)
20 | 180 0.000 0.000 0.002 0.000 <__array_function__ internals>:2(searchsorted)
21 | 180 0.000 0.000 0.002 0.000 fromnumeric.py:1283(searchsorted)
22 | 180 0.000 0.000 0.001 0.000 fromnumeric.py:51(_wrapfunc)
23 | 180 0.001 0.000 0.001 0.000 {method 'sort' of 'numpy.ndarray' objects}
24 | 180 0.001 0.000 0.001 0.000 {method 'searchsorted' of 'numpy.ndarray' objects}
25 | 180 0.001 0.000 0.001 0.000 {method 'argsort' of 'numpy.ndarray' objects}
26 | 230 0.000 0.000 0.000 0.000 {built-in method numpy.zeros}
--------------------------------------------------------------------------------
/nbs/others/05_clustermatch_profiling/10_cm_optimized/05-n_samples_large.txt:
--------------------------------------------------------------------------------
1 | 
5094 function calls in 19.355 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 36 to 20 due to restriction <20> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 19.355 19.355 {built-in method builtins.exec} 8 | 1 0.000 0.000 19.355 19.355 :1() 9 | 1 0.009 0.009 19.355 19.355 1556911885.py:1(func) 10 | 10 0.032 0.003 19.346 1.935 coef.py:266(_cm) 11 | 20 0.013 0.001 14.474 0.724 coef.py:170(_get_parts) 12 | 180 0.210 0.001 14.050 0.078 coef.py:81(run_quantile_clustering) 13 | 180 11.764 0.065 11.764 0.065 coef.py:32(rank) 14 | 10 4.839 0.484 4.839 0.484 coef.py:199(cdist_parts) 15 | 180 2.066 0.011 2.066 0.011 {method 'argsort' of 'numpy.ndarray' objects} 16 | 360 0.001 0.000 0.412 0.001 {built-in method numpy.core._multiarray_umath.implement_array_function} 17 | 20 0.001 0.000 0.408 0.020 coef.py:193() 18 | 180 0.000 0.000 0.407 0.002 <__array_function__ internals>:2(unique) 19 | 180 0.001 0.000 0.406 0.002 arraysetops.py:138(unique) 20 | 180 0.013 0.000 0.405 0.002 arraysetops.py:320(_unique1d) 21 | 180 0.387 0.002 0.387 0.002 {method 'sort' of 'numpy.ndarray' objects} 22 | 180 0.001 0.000 0.007 0.000 <__array_function__ internals>:2(searchsorted) 23 | 230 0.005 0.000 0.005 0.000 {built-in method numpy.zeros} 24 | 180 0.001 0.000 0.005 0.000 fromnumeric.py:1283(searchsorted) 25 | 180 0.000 0.000 0.004 0.000 fromnumeric.py:51(_wrapfunc) 26 | 180 0.004 0.000 0.004 0.000 {method 'searchsorted' of 'numpy.ndarray' objects} -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/05_cm_optimized/05-cm_precompute_parts_false.txt: -------------------------------------------------------------------------------- 1 | 91218606 function calls (81318606 primitive calls) in 121.734 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 82 to 20 due to restriction <20> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 121.734 121.734 {built-in method builtins.exec} 8 | 1 0.000 0.000 121.734 121.734 :1() 9 | 1 0.000 0.000 121.734 121.734 1745201673.py:1(func) 10 | 1 0.086 0.086 121.734 121.734 coef.py:163(cm) 11 | 4950 0.021 0.000 49.955 0.010 distance.py:2616(cdist) 12 | 4950 0.988 0.000 49.929 0.010 distance.py:2606(_cdist_callable) 13 | 400950 48.935 0.000 48.935 0.000 metrics.py:46(adjusted_rand_index) 14 | 4950 0.028 0.000 48.169 0.010 coef.py:153(_get_common_features) 15 | 9900 0.038 0.000 48.098 0.005 coef.py:149(_isempty) 16 | 9900 19.278 0.002 47.668 0.005 coef.py:150() 17 | 20695950/10795950 12.181 0.000 36.403 0.000 {built-in method numpy.core._multiarray_umath.implement_array_function} 18 | 9900000 4.311 0.000 28.390 0.000 <__array_function__ internals>:2(isreal) 19 | 9900 0.247 0.000 23.440 0.002 coef.py:113(_get_parts) 20 | 89100 2.080 0.000 19.487 0.000 coef.py:29(run_quantile_clustering) 21 | 9900000 4.959 0.000 17.102 0.000 type_check.py:247(isreal) 22 | 89100 2.589 0.000 12.277 0.000 stats.py:8631(rankdata) 23 | 9900000 3.887 0.000 12.144 0.000 <__array_function__ internals>:2(imag) 24 | 356400 0.272 0.000 8.465 0.000 fromnumeric.py:51(_wrapfunc) 25 | 178200 0.152 0.000 8.200 0.000 <__array_function__ internals>:2(argsort) 26 | 178200 0.149 0.000 7.904 0.000 fromnumeric.py:1006(argsort) -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/05_cm_optimized/05-cm_precompute_parts_true.txt: 
-------------------------------------------------------------------------------- 1 | 592106 function calls in 50.073 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 70 to 20 due to restriction <20> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 50.073 50.073 {built-in method builtins.exec} 8 | 1 0.000 0.000 50.073 50.073 :1() 9 | 1 0.000 0.000 50.073 50.073 674090675.py:1(func) 10 | 1 0.023 0.023 50.073 50.073 coef.py:163(cm) 11 | 4950 0.018 0.000 49.776 0.010 distance.py:2616(cdist) 12 | 4950 0.953 0.000 49.753 0.010 distance.py:2606(_cdist_callable) 13 | 400950 48.794 0.000 48.794 0.000 metrics.py:46(adjusted_rand_index) 14 | 100 0.003 0.000 0.238 0.002 coef.py:113(_get_parts) 15 | 900 0.021 0.000 0.198 0.000 coef.py:29(run_quantile_clustering) 16 | 13950 0.030 0.000 0.156 0.000 {built-in method numpy.core._multiarray_umath.implement_array_function} 17 | 900 0.026 0.000 0.125 0.000 stats.py:8631(rankdata) 18 | 3600 0.003 0.000 0.086 0.000 fromnumeric.py:51(_wrapfunc) 19 | 1800 0.002 0.000 0.083 0.000 <__array_function__ internals>:2(argsort) 20 | 1800 0.002 0.000 0.080 0.000 fromnumeric.py:1006(argsort) 21 | 1800 0.077 0.000 0.077 0.000 {method 'argsort' of 'numpy.ndarray' objects} 22 | 1800 0.015 0.000 0.039 0.000 index_tricks.py:323(__getitem__) 23 | 900 0.001 0.000 0.037 0.000 <__array_function__ internals>:2(unique) 24 | 900 0.002 0.000 0.035 0.000 arraysetops.py:138(unique) 25 | 900 0.012 0.000 0.032 0.000 arraysetops.py:320(_unique1d) 26 | 4950 0.005 0.000 0.027 0.000 <__array_function__ internals>:2(unravel_index) -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/06-n_samples_small.txt: -------------------------------------------------------------------------------- 1 | 21654 function calls in 0.059 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 64 to 20 due to restriction <20> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 0.059 0.059 {built-in method builtins.exec} 8 | 1 0.000 0.000 0.059 0.059 :1() 9 | 1 0.000 0.000 0.059 0.059 691993785.py:1(func) 10 | 10 0.001 0.000 0.059 0.006 coef.py:251(_cm) 11 | 20 0.001 0.000 0.036 0.002 coef.py:154(_get_parts) 12 | 180 0.006 0.000 0.032 0.000 coef.py:63(run_quantile_clustering) 13 | 180 0.005 0.000 0.022 0.000 stats.py:8631(rankdata) 14 | 10 0.022 0.002 0.022 0.002 coef.py:183(cdist_parts) 15 | 360 0.004 0.000 0.012 0.000 index_tricks.py:323(__getitem__) 16 | 1620 0.002 0.000 0.010 0.000 {built-in method numpy.core._multiarray_umath.implement_array_function} 17 | 20 0.000 0.000 0.004 0.000 coef.py:177() 18 | 540 0.001 0.000 0.003 0.000 fromnumeric.py:51(_wrapfunc) 19 | 180 0.000 0.000 0.003 0.000 <__array_function__ internals>:2(unique) 20 | 180 0.000 0.000 0.003 0.000 arraysetops.py:138(unique) 21 | 360 0.001 0.000 0.003 0.000 numerictypes.py:599(find_common_type) 22 | 360 0.002 0.000 0.002 0.000 {method 'argsort' of 'numpy.ndarray' objects} 23 | 180 0.001 0.000 0.002 0.000 arraysetops.py:320(_unique1d) 24 | 180 0.000 0.000 0.002 0.000 <__array_function__ internals>:2(searchsorted) 25 | 180 0.000 0.000 0.002 0.000 <__array_function__ internals>:2(argsort) 26 | 360 0.001 0.000 0.002 0.000 <__array_function__ internals>:2(concatenate) -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/05_cm_optimized/06-cm_many_genes.txt: 
-------------------------------------------------------------------------------- 1 | 12560506 function calls in 1263.543 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 70 to 20 due to restriction <20> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 1263.543 1263.543 {built-in method builtins.exec} 8 | 1 0.000 0.000 1263.543 1263.543 :1() 9 | 1 0.000 0.000 1263.543 1263.543 674090675.py:1(func) 10 | 1 0.538 0.538 1263.543 1263.543 coef.py:163(cm) 11 | 124750 0.457 0.000 1260.921 0.010 distance.py:2616(cdist) 12 | 124750 25.373 0.000 1260.340 0.010 distance.py:2606(_cdist_callable) 13 | 10104750 1234.826 0.000 1234.826 0.000 metrics.py:46(adjusted_rand_index) 14 | 169750 0.576 0.000 1.202 0.000 {built-in method numpy.core._multiarray_umath.implement_array_function} 15 | 500 0.012 0.000 1.184 0.002 coef.py:113(_get_parts) 16 | 4500 0.106 0.000 0.983 0.000 coef.py:29(run_quantile_clustering) 17 | 124750 0.126 0.000 0.681 0.000 <__array_function__ internals>:2(unravel_index) 18 | 4500 0.130 0.000 0.616 0.000 stats.py:8631(rankdata) 19 | 18000 0.013 0.000 0.429 0.000 fromnumeric.py:51(_wrapfunc) 20 | 9000 0.008 0.000 0.415 0.000 <__array_function__ internals>:2(argsort) 21 | 9000 0.007 0.000 0.401 0.000 fromnumeric.py:1006(argsort) 22 | 9000 0.384 0.000 0.384 0.000 {method 'argsort' of 'numpy.ndarray' objects} 23 | 124750 0.216 0.000 0.216 0.000 {method 'argmax' of 'numpy.ndarray' objects} 24 | 9000 0.073 0.000 0.193 0.000 index_tricks.py:323(__getitem__) 25 | 4500 0.004 0.000 0.183 0.000 <__array_function__ internals>:2(unique) 26 | 4500 0.007 0.000 0.173 0.000 arraysetops.py:138(unique) -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/05_cm_optimized/07-cm_many_samples-less_internal_n_clusters.txt: -------------------------------------------------------------------------------- 1 | 6641 function calls in 2.164 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 70 to 20 due to restriction <20> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 2.164 2.164 {built-in method builtins.exec} 8 | 1 0.000 0.000 2.164 2.164 :1() 9 | 1 0.001 0.001 2.164 2.164 3897795364.py:1(func) 10 | 1 0.001 0.001 2.163 2.163 coef.py:163(cm) 11 | 45 0.000 0.000 1.908 0.042 distance.py:2616(cdist) 12 | 45 0.004 0.000 1.908 0.042 distance.py:2606(_cdist_callable) 13 | 720 1.904 0.003 1.904 0.003 metrics.py:46(adjusted_rand_index) 14 | 10 0.000 0.000 0.253 0.025 coef.py:113(_get_parts) 15 | 40 0.010 0.000 0.222 0.006 coef.py:29(run_quantile_clustering) 16 | 445 0.002 0.000 0.213 0.000 {built-in method numpy.core._multiarray_umath.implement_array_function} 17 | 160 0.000 0.000 0.181 0.001 fromnumeric.py:51(_wrapfunc) 18 | 80 0.000 0.000 0.180 0.002 <__array_function__ internals>:2(argsort) 19 | 80 0.000 0.000 0.180 0.002 fromnumeric.py:1006(argsort) 20 | 80 0.179 0.002 0.179 0.002 {method 'argsort' of 'numpy.ndarray' objects} 21 | 40 0.022 0.001 0.121 0.003 stats.py:8631(rankdata) 22 | 40 0.000 0.000 0.030 0.001 <__array_function__ internals>:2(unique) 23 | 40 0.000 0.000 0.030 0.001 arraysetops.py:138(unique) 24 | 40 0.002 0.000 0.030 0.001 arraysetops.py:320(_unique1d) 25 | 40 0.028 0.001 0.028 0.001 {method 'sort' of 'numpy.ndarray' objects} 26 | 40 0.003 0.000 0.003 0.000 {method 'cumsum' of 'numpy.ndarray' objects} -------------------------------------------------------------------------------- 
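A consistency check on this pair of profiles (the `less_internal_n_clusters` listing above and the `default_internal_n_clusters` listing that follows): with 10 genes there are 10 * 9 / 2 = 45 gene pairs, matching the 45 `cdist` calls in both listings, and each pair compares every partition of one feature against every partition of the other. The default `internal_n_clusters` (2..10) therefore costs 9 * 9 = 81 ARI evaluations per pair, while the reduced setting (2..5) costs 4 * 4 = 16 — roughly a 5x drop in ARI calls. The measured time in `adjusted_rand_index` falls even more (15.371 s to 1.904 s), since partitions with more clusters are also individually costlier to compare. In Python:

# call counts implied by the loop structure, matching the two listings
n_pairs = 10 * 9 // 2            # 45 gene pairs
assert n_pairs * 9 * 9 == 3645   # ARI calls with internal_n_clusters = 2..10
assert n_pairs * 4 * 4 == 720    # ARI calls with internal_n_clusters = 2..5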
/nbs/others/05_clustermatch_profiling/05_cm_optimized/07-cm_many_samples-default_internal_n_clusters.txt:
--------------------------------------------------------------------------------
1 | 16016 function calls in 15.982 seconds
2 | 
3 | Ordered by: cumulative time
4 | List reduced from 70 to 20 due to restriction <20>
5 | 
6 | ncalls tottime percall cumtime percall filename:lineno(function)
7 | 1 0.000 0.000 15.982 15.982 {built-in method builtins.exec}
8 | 1 0.000 0.000 15.982 15.982 <string>:1(<module>)
9 | 1 0.002 0.002 15.982 15.982 674090675.py:1(func)
10 | 1 0.001 0.001 15.980 15.980 coef.py:163(cm)
11 | 45 0.000 0.000 15.391 0.342 distance.py:2616(cdist)
12 | 45 0.020 0.000 15.390 0.342 distance.py:2606(_cdist_callable)
13 | 3645 15.371 0.004 15.371 0.004 metrics.py:46(adjusted_rand_index)
14 | 10 0.001 0.000 0.587 0.059 coef.py:113(_get_parts)
15 | 90 0.024 0.000 0.503 0.006 coef.py:29(run_quantile_clustering)
16 | 945 0.003 0.000 0.493 0.001 {built-in method numpy.core._multiarray_umath.implement_array_function}
17 | 360 0.000 0.000 0.413 0.001 fromnumeric.py:51(_wrapfunc)
18 | 180 0.000 0.000 0.410 0.002 <__array_function__ internals>:2(argsort)
19 | 180 0.000 0.000 0.410 0.002 fromnumeric.py:1006(argsort)
20 | 180 0.409 0.002 0.409 0.002 {method 'argsort' of 'numpy.ndarray' objects}
21 | 90 0.047 0.001 0.271 0.003 stats.py:8631(rankdata)
22 | 90 0.000 0.000 0.076 0.001 <__array_function__ internals>:2(unique)
23 | 90 0.000 0.000 0.076 0.001 arraysetops.py:138(unique)
24 | 90 0.004 0.000 0.076 0.001 arraysetops.py:320(_unique1d)
25 | 90 0.071 0.001 0.071 0.001 {method 'sort' of 'numpy.ndarray' objects}
26 | 90 0.008 0.000 0.008 0.000 {method 'cumsum' of 'numpy.ndarray' objects}
--------------------------------------------------------------------------------
/nbs/others/05_clustermatch_profiling/05_cm_optimized/py/04-compare_numba_ari.py:
--------------------------------------------------------------------------------
1 | # ---
2 | # jupyter:
3 | #   jupytext:
4 | #     cell_metadata_filter: all,-execution,-papermill,-trusted
5 | #     text_representation:
6 | #       extension: .py
7 | #       format_name: percent
8 | #       format_version: '1.3'
9 | #     jupytext_version: 1.11.5
10 | #   kernelspec:
11 | #     display_name: Python 3 (ipykernel)
12 | #     language: python
13 | #     name: python3
14 | # ---
15 | 
16 | # %% [markdown] tags=[]
17 | # # Description
18 | 
19 | # %% [markdown]
20 | # Compares two ccc implementations: one using the new numba-optimized adjusted Rand index (ARI), and the other using the ARI from scikit-learn.
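# %% [markdown] tags=[]
# For reference, a numba-jitted ARI boils down to compiling the classic
# contingency-table formulation of the index. The sketch below is illustrative
# only — it is not the implementation in ccc's `metrics.py`, and the names are
# invented for this example — but it computes the same quantity:

# %% tags=[]
import numpy as np
from numba import njit


@njit(cache=True)
def ari_sketch(part0, part1):
    n = part0.shape[0]
    k0, k1 = part0.max() + 1, part1.max() + 1
    # contingency table of the two integer-labeled partitions
    cont = np.zeros((k0, k1), dtype=np.int64)
    for idx in range(n):
        cont[part0[idx], part1[idx]] += 1
    # sum of C(n_ij, 2) over the cells and over the two marginals
    sum_comb = 0.0
    for i in range(k0):
        for j in range(k1):
            sum_comb += cont[i, j] * (cont[i, j] - 1) / 2.0
    sum_a = 0.0
    for i in range(k0):
        ai = cont[i, :].sum()
        sum_a += ai * (ai - 1) / 2.0
    sum_b = 0.0
    for j in range(k1):
        bj = cont[:, j].sum()
        sum_b += bj * (bj - 1) / 2.0
    expected = sum_a * sum_b / (n * (n - 1) / 2.0)
    max_index = (sum_a + sum_b) / 2.0
    if max_index == expected:  # degenerate case (e.g., single-cluster partitions)
        return 1.0
    return (sum_comb - expected) / (max_index - expected)


# %% tags=[]
# sanity check of the sketch against scikit-learn's ARI on random labelings
from sklearn.metrics import adjusted_rand_score

p0 = np.random.randint(0, 3, 100)
p1 = np.random.randint(0, 4, 100)
assert np.isclose(ari_sketch(p0, p1), adjusted_rand_score(p0, p1))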
21 | 22 | # %% [markdown] tags=[] 23 | # # Modules 24 | 25 | # %% tags=[] 26 | import numpy as np 27 | 28 | from ccc import coef 29 | 30 | # %% [markdown] tags=[] 31 | # # Data 32 | 33 | # %% tags=[] 34 | n_genes, n_samples = 100, 1000 35 | 36 | # %% tags=[] 37 | np.random.seed(0) 38 | 39 | # %% tags=[] 40 | data = np.random.rand(n_genes, n_samples) 41 | 42 | # %% tags=[] 43 | data.shape 44 | 45 | 46 | # %% [markdown] tags=[] 47 | # # Improved implementation (ARI implemented in numba) 48 | 49 | # %% tags=[] 50 | def func(): 51 | return coef.ccc(data, internal_n_clusters=range(2, 10 + 1), precompute_parts=True) 52 | 53 | 54 | # %% tags=[] 55 | # %%timeit func() 56 | func() 57 | 58 | # %% tags=[] 59 | # %%prun -s cumulative -l 20 -T 04-cm_ari_numba.txt 60 | func() 61 | 62 | # %% [markdown] tags=[] 63 | # # Original implementation (ARI from sklearn) 64 | 65 | # %% tags=[] 66 | from sklearn.metrics import adjusted_rand_score 67 | 68 | # %% tags=[] 69 | coef.ari = adjusted_rand_score 70 | 71 | 72 | # %% tags=[] 73 | def func(): 74 | return coef.ccc(data, internal_n_clusters=range(2, 10 + 1), precompute_parts=True) 75 | 76 | 77 | # %% tags=[] 78 | # %%timeit func() 79 | func() 80 | 81 | # %% tags=[] 82 | # %%prun -s cumulative -l 20 -T 04-cm_ari_sklearn.txt 83 | func() 84 | 85 | # %% tags=[] 86 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/12_cm_optimized/py/10-many_genes.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # notebook_metadata_filter: -jupytext.text_representation.jupytext_version 6 | # text_representation: 7 | # extension: .py 8 | # format_name: percent 9 | # format_version: '1.3' 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] tags=[] 20 | # Similar as `06` but it computes across gene pairs instead of data matrix. 
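# %% [markdown] tags=[]
# (Added sketch) the profiled loop below fills a condensed upper-triangular vector; a hypothetical helper (not part of ccc) makes the pair-to-index mapping explicit:

# %% tags=[]
def condensed_index(n, i, j):
    # index of pair (i, j), with i < j, in the row-major upper-triangular order used below
    return n * i - i * (i + 1) // 2 + (j - i - 1)


assert condensed_index(4, 0, 1) == 0
assert condensed_index(4, 2, 3) == 5  # last of the 4 * 3 / 2 = 6 pairs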
21 | 22 | # %% [markdown] tags=[] 23 | # # Remove pycache dir 24 | 25 | # %% tags=[] 26 | # !echo ${CODE_DIR} 27 | 28 | # %% tags=[] 29 | # !find ${CODE_DIR}/libs -regex '^.*\(__pycache__\)$' -print 30 | 31 | # %% tags=[] 32 | # !find ${CODE_DIR}/libs -regex '^.*\(__pycache__\)$' -exec rm -rf {} \; 33 | 34 | # %% tags=[] 35 | # !find ${CODE_DIR}/libs -regex '^.*\(__pycache__\)$' -print 36 | 37 | # %% [markdown] tags=[] 38 | # # Modules 39 | 40 | # %% tags=[] 41 | import numpy as np 42 | 43 | from ccc.coef import ccc 44 | 45 | # %% tags=[] 46 | # let numba compile all the code before profiling 47 | ccc(np.random.rand(10), np.random.rand(10)) 48 | 49 | # %% [markdown] tags=[] 50 | # # Data 51 | 52 | # %% tags=[] 53 | n_genes, n_samples = 500, 1000 54 | 55 | # %% tags=[] 56 | np.random.seed(0) 57 | 58 | # %% tags=[] 59 | data = np.random.rand(n_genes, n_samples) 60 | 61 | # %% tags=[] 62 | data.shape 63 | 64 | 65 | # %% [markdown] tags=[] 66 | # # Profile 67 | 68 | # %% tags=[] 69 | def func(): 70 | res = np.full(int((data.shape[0] * (data.shape[0] - 1)) / 2), np.nan) 71 | 72 | n_clust = list(range(2, 10 + 1)) 73 | idx = 0 74 | for i in range(data.shape[0] - 1): 75 | for j in range(i + 1, data.shape[0]): 76 | res[idx] = ccc(data[i], data[j], internal_n_clusters=n_clust) 77 | idx += 1 78 | 79 | 80 | # %% tags=[] 81 | # %%timeit func() 82 | func() 83 | 84 | # %% tags=[] 85 | # %%prun -s cumulative -l 50 -T 10-cm_many_genes.txt 86 | func() 87 | 88 | # %% tags=[] 89 | -------------------------------------------------------------------------------- /nbs/25_pvalue/py/00-ccc_pvalue_dist-generate-data_matrix.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # notebook_metadata_filter: -jupytext.text_representation.jupytext_version 6 | # text_representation: 7 | # extension: .py 8 | # format_name: percent 9 | # format_version: '1.3' 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] tags=[] 20 | # Generates a distribution of pvalues under the null hypothesis of no association. 21 | # 22 | # This notebook uses a data matrix as input for CCC and parallelizes computation across gene pairs. 
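# %% [markdown] tags=[]
# (Added sketch, assuming the standard add-one permutation scheme; the internals of `pvalue_n_perms` may differ in details) conceptually, the null p-value for a pair is estimated by re-computing the coefficient on permuted copies of one feature:

# %% tags=[]
import numpy as np
from ccc.coef import ccc


def ccc_perm_pvalue(x, y, n_perms, rs):
    # add-one smoothing so the estimated p-value is never exactly zero
    obs = ccc(x, y)
    null = np.array([ccc(x, rs.permutation(y)) for _ in range(n_perms)])
    return obs, (np.sum(null >= obs) + 1) / (n_perms + 1)


ccc_perm_pvalue(np.random.rand(100), np.random.rand(100), 100, np.random.RandomState(0))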
23 | 24 | # %% [markdown] tags=[] 25 | # # Modules loading 26 | 27 | # %% tags=[] 28 | import numpy as np 29 | 30 | from ccc.coef import ccc 31 | from ccc import conf 32 | 33 | # %% [markdown] tags=[] 34 | # # Settings 35 | 36 | # %% tags=[] 37 | rs = np.random.RandomState(0) 38 | 39 | # %% tags=[] 40 | DATA_N_OBJS, DATA_N_FEATURES = 100, 1000 41 | PVALUE_N_PERMS = 1000 42 | 43 | # %% [markdown] tags=[] 44 | # # Paths 45 | 46 | # %% tags=[] 47 | OUTPUT_DIR = conf.RESULTS_DIR / "ccc_null-pvalues" 48 | OUTPUT_DIR.mkdir(parents=True, exist_ok=True) 49 | 50 | # %% tags=[] 51 | OUTPUT_DIR 52 | 53 | # %% [markdown] tags=[] 54 | # # Generate random data 55 | 56 | # %% tags=[] 57 | data = rs.rand(DATA_N_OBJS, DATA_N_FEATURES) 58 | 59 | # %% tags=[] 60 | data.shape 61 | 62 | # %% [markdown] tags=[] 63 | # # Run CCC 64 | 65 | # %% tags=[] 66 | res = ccc( 67 | data, 68 | n_jobs=conf.GENERAL["N_JOBS"], 69 | pvalue_n_perms=PVALUE_N_PERMS, 70 | ) 71 | 72 | # %% tags=[] 73 | cm_values, cm_pvalues = res 74 | 75 | # %% tags=[] 76 | cm_values.shape 77 | 78 | # %% tags=[] 79 | cm_pvalues.shape 80 | 81 | # %% [markdown] tags=[] 82 | # # Save 83 | 84 | # %% tags=[] 85 | output_file = OUTPUT_DIR / "data_matrix-cm_values.npy" 86 | display(output_file) 87 | 88 | np.save(output_file, cm_values) 89 | 90 | # %% tags=[] 91 | output_file = OUTPUT_DIR / "data_matrix-cm_pvalues.npy" 92 | display(output_file) 93 | 94 | np.save(output_file, cm_pvalues) 95 | 96 | # %% tags=[] 97 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/11_cm_optimized/py/08-many_genes.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # text_representation: 6 | # extension: .py 7 | # format_name: percent 8 | # format_version: '1.3' 9 | # jupytext_version: 1.11.5 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] tags=[] 20 | # Similar as `06` but with numba disabled to compare with a pure Python implementation. 21 | # 22 | # Here I had to reduce the number of `n_genes`, since it takes too much otherwise. 
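# %% [markdown] tags=[]
# (Added check) once the `%env NUMBA_DISABLE_JIT=1` cell below has run, numba should report JIT as disabled; note that numba must be imported only after the variable is set for the switch to take effect:

# %% tags=[]
# import numba
# assert numba.config.DISABLE_JIT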
23 | 24 | # %% [markdown] tags=[] 25 | # # Disable numba 26 | 27 | # %% tags=[] 28 | # %env NUMBA_DISABLE_JIT=1 29 | 30 | # %% [markdown] tags=[] 31 | # # Remove pycache dir 32 | 33 | # %% tags=[] 34 | # !echo ${CODE_DIR} 35 | 36 | # %% tags=[] 37 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -print 38 | 39 | # %% tags=[] 40 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -prune -exec rm -rf {} \; 41 | 42 | # %% tags=[] 43 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -print 44 | 45 | # %% [markdown] tags=[] 46 | # # Modules 47 | 48 | # %% tags=[] 49 | import numpy as np 50 | 51 | from ccc.coef import ccc 52 | 53 | # %% tags=[] 54 | # let numba compile all the code before profiling 55 | ccc(np.random.rand(10), np.random.rand(10)) 56 | 57 | # %% [markdown] tags=[] 58 | # # Data 59 | 60 | # %% tags=[] 61 | n_genes, n_samples = 50, 1000 62 | 63 | # %% tags=[] 64 | np.random.seed(0) 65 | 66 | # %% tags=[] 67 | data = np.random.rand(n_genes, n_samples) 68 | 69 | # %% tags=[] 70 | data.shape 71 | 72 | 73 | # %% [markdown] tags=[] 74 | # # Profile 75 | 76 | # %% tags=[] 77 | def func(): 78 | n_clust = list(range(2, 10 + 1)) 79 | return ccc(data, internal_n_clusters=n_clust) 80 | 81 | 82 | # %% tags=[] 83 | # %%timeit func() 84 | func() 85 | 86 | # %% tags=[] 87 | # %%prun -s cumulative -l 50 -T 08-cm_many_genes.txt 88 | func() 89 | 90 | # %% [markdown] tags=[] 91 | # **CONCLUSIONS:** compared with notebook `06` (which uses 500 rows (`n_genes`) instead of the 50 used here), this run extrapolates to roughly 2.80 hours for 500 rows based on these results, whereas the numba-compiled version took ~7 minutes. 92 | 93 | # %% tags=[] 94 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/11_cm_optimized/py/07-many_samples.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # text_representation: 6 | # extension: .py 7 | # format_name: percent 8 | # format_version: '1.3' 9 | # jupytext_version: 1.11.5 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] tags=[] 20 | # Clustermatch run using a larger number of samples.
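# %% [markdown] tags=[]
# (Added sketch) the `adjusted_rand_index` call counts in the saved profiles can be predicted from the setup: every feature pair compares each clustering of one feature against each clustering of the other. The two asserted values match the `ncalls` reported in the `05_cm_optimized` profile outputs shown earlier.

# %% tags=[]
def expected_ari_calls(n_features, n_clusterings):
    n_pairs = n_features * (n_features - 1) // 2
    return n_pairs * n_clusterings**2


assert expected_ari_calls(10, 9) == 3645  # k in 2..10 -> 9 clusterings per feature
assert expected_ari_calls(10, 4) == 720  # k in 2..5 -> 4 clusterings per feature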
21 | 22 | # %% [markdown] tags=[] 23 | # # Remove pycache dir 24 | 25 | # %% tags=[] 26 | # !echo ${CODE_DIR} 27 | 28 | # %% tags=[] 29 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -print 30 | 31 | # %% tags=[] 32 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -prune -exec rm -rf {} \; 33 | 34 | # %% tags=[] 35 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -print 36 | 37 | # %% [markdown] tags=[] 38 | # # Modules 39 | 40 | # %% tags=[] 41 | import numpy as np 42 | 43 | from ccc.coef import ccc 44 | 45 | # %% tags=[] 46 | # let numba compile all the code before profiling 47 | ccc(np.random.rand(10), np.random.rand(10)) 48 | 49 | # %% [markdown] tags=[] 50 | # # Data 51 | 52 | # %% tags=[] 53 | n_genes, n_samples = 10, 30000 54 | 55 | # %% tags=[] 56 | np.random.seed(0) 57 | 58 | # %% tags=[] 59 | data = np.random.rand(n_genes, n_samples) 60 | 61 | # %% tags=[] 62 | data.shape 63 | 64 | 65 | # %% [markdown] tags=[] 66 | # # With default `internal_n_clusters` 67 | 68 | # %% tags=[] 69 | def func(): 70 | n_clust = list(range(2, 10 + 1)) 71 | return ccc(data, internal_n_clusters=n_clust) 72 | 73 | 74 | # %% tags=[] 75 | # %%timeit func() 76 | func() 77 | 78 | # %% tags=[] 79 | # %%prun -s cumulative -l 50 -T 07-cm_many_samples-default_internal_n_clusters.txt 80 | func() 81 | 82 | 83 | # %% [markdown] tags=[] 84 | # # With reduced `internal_n_clusters` 85 | 86 | # %% tags=[] 87 | def func(): 88 | n_clust = list(range(2, 5 + 1)) 89 | return ccc(data, internal_n_clusters=n_clust) 90 | 91 | 92 | # %% tags=[] 93 | # %%timeit func() 94 | func() 95 | 96 | # %% tags=[] 97 | # %%prun -s cumulative -l 50 -T 07-cm_many_samples-less_internal_n_clusters.txt 98 | func() 99 | 100 | # %% tags=[] 101 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/12_cm_optimized/py/08-many_genes.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # notebook_metadata_filter: -jupytext.text_representation.jupytext_version 6 | # text_representation: 7 | # extension: .py 8 | # format_name: percent 9 | # format_version: '1.3' 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] tags=[] 20 | # Similar as `06` but with numba disabled to compare with a pure Python implementation. 21 | # 22 | # Here I had to reduce the number of `n_genes`, since it takes too much otherwise. 
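# %% [markdown] tags=[]
# (Added sketch) the reduction in `n_genes` is justified by the quadratic growth in the number of feature pairs; see the extrapolation in the conclusions at the end of this notebook.

# %% tags=[]
def n_pairs(n):
    return n * (n - 1) // 2


assert n_pairs(50) == 1225
assert n_pairs(500) == 124750  # ~101.8x more pairs than with 50 genes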
23 | 24 | # %% [markdown] tags=[] 25 | # # Disable numba 26 | 27 | # %% tags=[] 28 | # %env NUMBA_DISABLE_JIT=1 29 | 30 | # %% [markdown] tags=[] 31 | # # Remove pycache dir 32 | 33 | # %% tags=[] 34 | # !echo ${CODE_DIR} 35 | 36 | # %% tags=[] 37 | # !find ${CODE_DIR}/libs -regex '^.*\(__pycache__\)$' -print 38 | 39 | # %% tags=[] 40 | # !find ${CODE_DIR}/libs -regex '^.*\(__pycache__\)$' -prune -exec rm -rf {} \; 41 | 42 | # %% tags=[] 43 | # !find ${CODE_DIR}/libs -regex '^.*\(__pycache__\)$' -print 44 | 45 | # %% [markdown] tags=[] 46 | # # Modules 47 | 48 | # %% tags=[] 49 | import numpy as np 50 | 51 | from ccc.coef import ccc 52 | 53 | # %% tags=[] 54 | # let numba compile all the code before profiling 55 | ccc(np.random.rand(10), np.random.rand(10)) 56 | 57 | # %% [markdown] tags=[] 58 | # # Data 59 | 60 | # %% tags=[] 61 | n_genes, n_samples = 50, 1000 62 | 63 | # %% tags=[] 64 | np.random.seed(0) 65 | 66 | # %% tags=[] 67 | data = np.random.rand(n_genes, n_samples) 68 | 69 | # %% tags=[] 70 | data.shape 71 | 72 | 73 | # %% [markdown] tags=[] 74 | # # Profile 75 | 76 | # %% tags=[] 77 | def func(): 78 | n_clust = list(range(2, 10 + 1)) 79 | return ccc(data, internal_n_clusters=n_clust) 80 | 81 | 82 | # %% tags=[] 83 | # %%timeit func() 84 | func() 85 | 86 | # %% tags=[] 87 | # %%prun -s cumulative -l 50 -T 08-cm_many_genes.txt 88 | func() 89 | 90 | # %% [markdown] tags=[] 91 | # **CONCLUSIONS:** compared with notebook `06` (which uses 500 rows (`n_genes`) instead of the 50 used here), this run extrapolates to roughly 2.80 hours for 500 rows based on these results, whereas the numba-compiled version took ~7 minutes. 92 | 93 | # %% tags=[] 94 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/12_cm_optimized/py/07-many_samples.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # notebook_metadata_filter: -jupytext.text_representation.jupytext_version 6 | # text_representation: 7 | # extension: .py 8 | # format_name: percent 9 | # format_version: '1.3' 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] tags=[] 20 | # Clustermatch run using a larger number of samples.
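# %% [markdown] tags=[]
# (Added sketch) with many samples the cost shifts toward ranking and partitioning; below is a minimal, assumed-equivalent stand-in for `run_quantile_clustering` (the real one is numba-compiled inside `ccc.coef`):

# %% tags=[]
import numpy as np
from scipy import stats


def quantile_clustering_sketch(x, k):
    # rank the values, then cut the ranks into k equal-frequency bins (labels 0..k-1)
    ranks = stats.rankdata(x, "average")
    return (np.ceil(ranks / (len(x) / k)) - 1).astype(int)


quantile_clustering_sketch(np.random.rand(12), 3)  # -> three bins of four samples each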
21 | 22 | # %% [markdown] tags=[] 23 | # # Remove pycache dir 24 | 25 | # %% tags=[] 26 | # !echo ${CODE_DIR} 27 | 28 | # %% tags=[] 29 | # !find ${CODE_DIR}/libs -regex '^.*\(__pycache__\)$' -print 30 | 31 | # %% tags=[] 32 | # !find ${CODE_DIR}/libs -regex '^.*\(__pycache__\)$' -prune -exec rm -rf {} \; 33 | 34 | # %% tags=[] 35 | # !find ${CODE_DIR}/libs -regex '^.*\(__pycache__\)$' -print 36 | 37 | # %% [markdown] tags=[] 38 | # # Modules 39 | 40 | # %% tags=[] 41 | import numpy as np 42 | 43 | from ccc.coef import ccc 44 | 45 | # %% tags=[] 46 | # let numba compile all the code before profiling 47 | ccc(np.random.rand(10), np.random.rand(10)) 48 | 49 | # %% [markdown] tags=[] 50 | # # Data 51 | 52 | # %% tags=[] 53 | n_genes, n_samples = 10, 30000 54 | 55 | # %% tags=[] 56 | np.random.seed(0) 57 | 58 | # %% tags=[] 59 | data = np.random.rand(n_genes, n_samples) 60 | 61 | # %% tags=[] 62 | data.shape 63 | 64 | 65 | # %% [markdown] tags=[] 66 | # # With default `internal_n_clusters` 67 | 68 | # %% tags=[] 69 | def func(): 70 | n_clust = list(range(2, 10 + 1)) 71 | return ccc(data, internal_n_clusters=n_clust) 72 | 73 | 74 | # %% tags=[] 75 | # %%timeit func() 76 | func() 77 | 78 | # %% tags=[] 79 | # %%prun -s cumulative -l 50 -T 07-cm_many_samples-default_internal_n_clusters.txt 80 | func() 81 | 82 | 83 | # %% [markdown] tags=[] 84 | # # With reduced `internal_n_clusters` 85 | 86 | # %% tags=[] 87 | def func(): 88 | n_clust = list(range(2, 5 + 1)) 89 | return ccc(data, internal_n_clusters=n_clust) 90 | 91 | 92 | # %% tags=[] 93 | # %%timeit func() 94 | func() 95 | 96 | # %% tags=[] 97 | # %%prun -s cumulative -l 50 -T 07-cm_many_samples-less_internal_n_clusters.txt 98 | func() 99 | 100 | # %% tags=[] 101 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD-2-Clause Plus Patent License 2 | 3 | Copyright (c) 2020-2021, Contributors & the Greene Laboratory at the University of Pennsylvania 4 | 5 | Redistribution and use in source and binary forms, with or without modification, are permitted 6 | provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions 9 | and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions 12 | and the following disclaimer in the documentation and/or other materials provided with the 13 | distribution. 14 | 15 | Subject to the terms and conditions of this license, each copyright holder and contributor hereby 16 | grants to those receiving rights under this license a perpetual, worldwide, non-exclusive, 17 | no-charge, royalty-free, irrevocable (except for failure to satisfy the conditions of this license) 18 | patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer this 19 | software, where such license applies only to those patent claims, already acquired or hereafter 20 | acquired, licensable by such copyright holder or contributor that are necessarily infringed by: 21 | 22 | (a) their Contribution(s) (the licensed copyrights of copyright holders and non-copyrightable 23 | additions of contributors, in source or binary form) alone; or 24 | 25 | (b) combination of their Contribution(s) with the work of authorship to which such Contribution(s) 26 | was added by such copyright holder or contributor, if, at the time the Contribution is added, 27 | such addition causes such combination to be necessarily infringed. The patent license shall not 28 | apply to any other combinations which include the Contribution. 29 | 30 | Except as expressly stated above, no rights or licenses from any copyright holder or contributor is 31 | granted under this license, whether expressly, by implication, estoppel or otherwise. 32 | 33 | DISCLAIMER 34 | 35 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 36 | IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 37 | FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR 38 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 39 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 40 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER 41 | IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF 42 | THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 43 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/11_cm_optimized/py/09-many_samples.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # text_representation: 6 | # extension: .py 7 | # format_name: percent 8 | # format_version: '1.3' 9 | # jupytext_version: 1.11.5 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] tags=[] 20 | # Similar as `07` but with numba disabled to compare with a pure Python implementation. 21 | 22 | # %% [markdown] tags=[] 23 | # # Disable numba 24 | 25 | # %% tags=[] 26 | # %env NUMBA_DISABLE_JIT=1 27 | 28 | # %% [markdown] tags=[] 29 | # # Remove pycache dir 30 | 31 | # %% tags=[] 32 | # !echo ${CODE_DIR} 33 | 34 | # %% tags=[] 35 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -print 36 | 37 | # %% tags=[] 38 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -prune -exec rm -rf {} \; 39 | 40 | # %% tags=[] 41 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -print 42 | 43 | # %% [markdown] tags=[] 44 | # # Modules 45 | 46 | # %% tags=[] 47 | import numpy as np 48 | 49 | from ccc.coef import ccc 50 | 51 | # %% tags=[] 52 | # let numba compile all the code before profiling 53 | ccc(np.random.rand(10), np.random.rand(10)) 54 | 55 | # %% [markdown] tags=[] 56 | # # Data 57 | 58 | # %% tags=[] 59 | n_genes, n_samples = 10, 30000 60 | 61 | # %% tags=[] 62 | np.random.seed(0) 63 | 64 | # %% tags=[] 65 | data = np.random.rand(n_genes, n_samples) 66 | 67 | # %% tags=[] 68 | data.shape 69 | 70 | 71 | # %% [markdown] tags=[] 72 | # # With default `internal_n_clusters` 73 | 74 | # %% tags=[] 75 | def func(): 76 | n_clust = list(range(2, 10 + 1)) 77 | return ccc(data, internal_n_clusters=n_clust) 78 | 79 | 80 | # %% tags=[] 81 | # %%timeit func() 82 | func() 83 | 84 | # %% tags=[] 85 | # %%prun -s cumulative -l 50 -T 09-cm_many_samples-default_internal_n_clusters.txt 86 | func() 87 | 88 | 89 | # %% [markdown] tags=[] 90 | # These results are just slightly worse than the numba-compiled version (notebook `07`). 91 | 92 | # %% [markdown] tags=[] 93 | # # With reduced `internal_n_clusters` 94 | 95 | # %% tags=[] 96 | def func(): 97 | n_clust = list(range(2, 5 + 1)) 98 | return ccc(data, internal_n_clusters=n_clust) 99 | 100 | 101 | # %% tags=[] 102 | # %%timeit func() 103 | func() 104 | 105 | # %% tags=[] 106 | # %%prun -s cumulative -l 50 -T 09-cm_many_samples-less_internal_n_clusters.txt 107 | func() 108 | 109 | # %% [markdown] tags=[] 110 | # These results are slightly better than the numba-compiled version (notebook `07`), which is surprising. In the future, it would be interesting to disable threading here to get accurate profiling results to debug this issue. 
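# %% [markdown] tags=[]
# (Added sketch, untested) one way to follow this up: pin all parallelism before re-profiling, so cProfile's per-function times are not spread across worker threads.

# %% tags=[]
# n_jobs is ccc's own parameter; set_num_threads caps numba's thread pool when JIT is enabled
# from numba import set_num_threads
# set_num_threads(1)
# ccc(data, internal_n_clusters=list(range(2, 5 + 1)), n_jobs=1)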
111 | 112 | # %% tags=[] 113 | -------------------------------------------------------------------------------- /tests/test_pytorch_core.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | 4 | from ccc.pytorch.core import unravel_index_2d 5 | 6 | 7 | def test_unravel_index_2d_square_simple(): 8 | shape = (2, 2) 9 | assert unravel_index_2d(0, shape) == (0, 0) 10 | assert unravel_index_2d(1, shape) == (0, 1) 11 | assert unravel_index_2d(2, shape) == (1, 0) 12 | assert unravel_index_2d(3, shape) == (1, 1) 13 | 14 | 15 | def test_unravel_index_2d_rect_simple(): 16 | shape = (2, 3) 17 | assert unravel_index_2d(0, shape) == (0, 0) 18 | assert unravel_index_2d(1, shape) == (0, 1) 19 | assert unravel_index_2d(2, shape) == (0, 2) 20 | assert unravel_index_2d(3, shape) == (1, 0) 21 | assert unravel_index_2d(4, shape) == (1, 1) 22 | assert unravel_index_2d(5, shape) == (1, 2) 23 | 24 | shape = (1, 4) 25 | assert unravel_index_2d(0, shape) == (0, 0) 26 | assert unravel_index_2d(1, shape) == (0, 1) 27 | assert unravel_index_2d(2, shape) == (0, 2) 28 | assert unravel_index_2d(3, shape) == (0, 3) 29 | 30 | shape = (4, 1) 31 | assert unravel_index_2d(0, shape) == (0, 0) 32 | assert unravel_index_2d(1, shape) == (1, 0) 33 | assert unravel_index_2d(2, shape) == (2, 0) 34 | assert unravel_index_2d(3, shape) == (3, 0) 35 | 36 | 37 | def test_unravel_index_2d_square0(): 38 | x = np.array([[0, 7], [-5, 6.999]]) 39 | x_max_idx = np.argmax(x, axis=None) 40 | assert x_max_idx == 1 41 | 42 | expected_idx = np.unravel_index(x_max_idx, x.shape) 43 | observed_idx = unravel_index_2d(x_max_idx, x.shape) 44 | 45 | assert expected_idx == observed_idx == (0, 1) 46 | 47 | 48 | def test_unravel_index_2d_square1(): 49 | x = np.array([[0, 7], [-5, 7.01]]) 50 | x_max_idx = np.argmax(x, axis=None) 51 | assert x_max_idx == 3 52 | 53 | expected_idx = np.unravel_index(x_max_idx, x.shape) 54 | observed_idx = unravel_index_2d(x_max_idx, x.shape) 55 | 56 | assert expected_idx == observed_idx == (1, 1) 57 | 58 | 59 | def test_unravel_index_2d_square_all_equal(): 60 | x = np.array([[7.0, 7.0], [7.0, 7.0]]) 61 | x_max_idx = np.argmax(x, axis=None) 62 | assert x_max_idx == 0 63 | 64 | expected_idx = np.unravel_index(x_max_idx, x.shape) 65 | observed_idx = unravel_index_2d(x_max_idx, x.shape) 66 | 67 | assert expected_idx == observed_idx == (0, 0) 68 | 69 | 70 | def test_unravel_index_2d_rect(): 71 | x = np.array([[0, 7, -5.6], [8.1, 6.999, 0]]) 72 | x_max_idx = np.argmax(x, axis=None) 73 | assert x_max_idx == 3 74 | 75 | expected_idx = np.unravel_index(x_max_idx, x.shape) 76 | observed_idx = unravel_index_2d(x_max_idx, x.shape) 77 | 78 | assert expected_idx == observed_idx == (1, 0) 79 | 80 | 81 | def test_unravel_index_index_out_of_bounds(): 82 | with pytest.raises(ValueError): 83 | unravel_index_2d(6, (2, 3)) 84 | 85 | 86 | def test_unravel_index_non_2d(): 87 | with pytest.raises(ValueError): 88 | unravel_index_2d(0, (2, 3, 4)) 89 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/12_cm_optimized/py/11-many_samples.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # notebook_metadata_filter: -jupytext.text_representation.jupytext_version 6 | # text_representation: 7 | # extension: .py 8 | # format_name: percent 9 | # format_version: '1.3' 10 | # 
kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] tags=[] 20 | # Similar as `06` but it computes across gene pairs instead of data matrix. 21 | 22 | # %% [markdown] tags=[] 23 | # # Remove pycache dir 24 | 25 | # %% tags=[] 26 | # !echo ${CODE_DIR} 27 | 28 | # %% tags=[] 29 | # !find ${CODE_DIR}/libs -regex '^.*\(__pycache__\)$' -print 30 | 31 | # %% tags=[] 32 | # !find ${CODE_DIR}/libs -regex '^.*\(__pycache__\)$' -prune -exec rm -rf {} \; 33 | 34 | # %% tags=[] 35 | # !find ${CODE_DIR}/libs -regex '^.*\(__pycache__\)$' -print 36 | 37 | # %% [markdown] tags=[] 38 | # # Modules 39 | 40 | # %% tags=[] 41 | import numpy as np 42 | 43 | from ccc.coef import ccc 44 | 45 | # %% tags=[] 46 | # let numba compile all the code before profiling 47 | ccc(np.random.rand(10), np.random.rand(10)) 48 | 49 | # %% [markdown] tags=[] 50 | # # Data 51 | 52 | # %% tags=[] 53 | n_genes, n_samples = 10, 30000 54 | 55 | # %% tags=[] 56 | np.random.seed(0) 57 | 58 | # %% tags=[] 59 | data = np.random.rand(n_genes, n_samples) 60 | 61 | # %% tags=[] 62 | data.shape 63 | 64 | 65 | # %% [markdown] tags=[] 66 | # # With default `internal_n_clusters` 67 | 68 | # %% tags=[] 69 | def func(): 70 | res = np.full(int((data.shape[0] * (data.shape[0] - 1)) / 2), np.nan) 71 | 72 | n_clust = list(range(2, 10 + 1)) 73 | idx = 0 74 | for i in range(data.shape[0] - 1): 75 | for j in range(i + 1, data.shape[0]): 76 | res[idx] = ccc(data[i], data[j], internal_n_clusters=n_clust) 77 | idx += 1 78 | 79 | 80 | # %% tags=[] 81 | # %%timeit func() 82 | func() 83 | 84 | # %% tags=[] 85 | # %%prun -s cumulative -l 50 -T 11-cm_many_samples-default_internal_n_clusters.txt 86 | func() 87 | 88 | 89 | # %% [markdown] tags=[] 90 | # # With reduced `internal_n_clusters` 91 | 92 | # %% tags=[] 93 | def func(): 94 | res = np.full(int((data.shape[0] * (data.shape[0] - 1)) / 2), np.nan) 95 | 96 | n_clust = list(range(2, 5 + 1)) 97 | idx = 0 98 | for i in range(data.shape[0] - 1): 99 | for j in range(i + 1, data.shape[0]): 100 | res[idx] = ccc(data[i], data[j], internal_n_clusters=n_clust) 101 | idx += 1 102 | 103 | 104 | # %% tags=[] 105 | # %%timeit func() 106 | func() 107 | 108 | # %% tags=[] 109 | # %%prun -s cumulative -l 50 -T 11-cm_many_samples-less_internal_n_clusters.txt 110 | func() 111 | 112 | # %% tags=[] 113 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/12_cm_optimized/py/09-many_samples.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # notebook_metadata_filter: -jupytext.text_representation.jupytext_version 6 | # text_representation: 7 | # extension: .py 8 | # format_name: percent 9 | # format_version: '1.3' 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] tags=[] 20 | # Similar as `07` but with numba disabled to compare with a pure Python implementation. 
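# %% [markdown] tags=[]
# (Added sketch) `NUMBA_DISABLE_JIT=1` makes every `@njit` function run as plain Python, so cProfile can see inside it; the per-function equivalent, used by the `10_cm_optimized` notebooks, is the `.py_func` attribute of a jitted function:

# %% tags=[]
from numba import njit


@njit
def _double(v):
    return v * 2


_double(21)  # compiled path (when JIT is enabled)
assert _double.py_func(21) == 42  # original Python body, visible to profilers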
21 | 22 | # %% [markdown] tags=[] 23 | # # Disable numba 24 | 25 | # %% tags=[] 26 | # %env NUMBA_DISABLE_JIT=1 27 | 28 | # %% [markdown] tags=[] 29 | # # Remove pycache dir 30 | 31 | # %% tags=[] 32 | # !echo ${CODE_DIR} 33 | 34 | # %% tags=[] 35 | # !find ${CODE_DIR}/libs -regex '^.*\(__pycache__\)$' -print 36 | 37 | # %% tags=[] 38 | # !find ${CODE_DIR}/libs -regex '^.*\(__pycache__\)$' -prune -exec rm -rf {} \; 39 | 40 | # %% tags=[] 41 | # !find ${CODE_DIR}/libs -regex '^.*\(__pycache__\)$' -print 42 | 43 | # %% [markdown] tags=[] 44 | # # Modules 45 | 46 | # %% tags=[] 47 | import numpy as np 48 | 49 | from ccc.coef import ccc 50 | 51 | # %% tags=[] 52 | # let numba compile all the code before profiling 53 | ccc(np.random.rand(10), np.random.rand(10)) 54 | 55 | # %% [markdown] tags=[] 56 | # # Data 57 | 58 | # %% tags=[] 59 | n_genes, n_samples = 10, 30000 60 | 61 | # %% tags=[] 62 | np.random.seed(0) 63 | 64 | # %% tags=[] 65 | data = np.random.rand(n_genes, n_samples) 66 | 67 | # %% tags=[] 68 | data.shape 69 | 70 | 71 | # %% [markdown] tags=[] 72 | # # With default `internal_n_clusters` 73 | 74 | # %% tags=[] 75 | def func(): 76 | n_clust = list(range(2, 10 + 1)) 77 | return ccc(data, internal_n_clusters=n_clust) 78 | 79 | 80 | # %% tags=[] 81 | # %%timeit func() 82 | func() 83 | 84 | # %% tags=[] 85 | # %%prun -s cumulative -l 50 -T 09-cm_many_samples-default_internal_n_clusters.txt 86 | func() 87 | 88 | 89 | # %% [markdown] tags=[] 90 | # These results are just slightly worse than the numba-compiled version (notebook `07`). 91 | 92 | # %% [markdown] tags=[] 93 | # # With reduced `internal_n_clusters` 94 | 95 | # %% tags=[] 96 | def func(): 97 | n_clust = list(range(2, 5 + 1)) 98 | return ccc(data, internal_n_clusters=n_clust) 99 | 100 | 101 | # %% tags=[] 102 | # %%timeit func() 103 | func() 104 | 105 | # %% tags=[] 106 | # %%prun -s cumulative -l 50 -T 09-cm_many_samples-less_internal_n_clusters.txt 107 | func() 108 | 109 | # %% [markdown] tags=[] 110 | # These results are slightly better than the numba-compiled version (notebook `07`), which is surprising. In the future, it would be interesting to disable threading here to get accurate profiling results to debug this issue. 
111 | 112 | # %% tags=[] 113 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/py/01-cdist_parts_v00.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # text_representation: 6 | # extension: .py 7 | # format_name: percent 8 | # format_version: '1.3' 9 | # jupytext_version: 1.11.5 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] 20 | # UPDATE: 21 | # 22 | # list changes here 23 | 24 | # %% [markdown] 25 | # ![image.png](attachment:3ca43189-f499-4016-a6b7-e0b476fcac1b.png) 26 | 27 | # %% [markdown] tags=[] 28 | # # Remove pycache dir 29 | 30 | # %% 31 | # !echo ${CODE_DIR} 32 | 33 | # %% 34 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -print 35 | 36 | # %% 37 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -exec rm -rf {} \; 38 | 39 | # %% 40 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -print 41 | 42 | # %% [markdown] tags=[] 43 | # # Modules 44 | 45 | # %% tags=[] 46 | import numpy as np 47 | 48 | from ccc.coef import _cm 49 | 50 | # %% [markdown] tags=[] 51 | # # Settings 52 | 53 | # %% 54 | N_REPS = 10 55 | 56 | # %% tags=[] 57 | np.random.seed(0) 58 | 59 | # %% [markdown] tags=[] 60 | # # Setup 61 | 62 | # %% 63 | # let numba compile all the code before profiling 64 | _cm.py_func(np.random.rand(10), np.random.rand(10)) 65 | 66 | # %% [markdown] tags=[] 67 | # # Run with `n_samples` small 68 | 69 | # %% 70 | N_SAMPLES = 100 71 | 72 | # %% 73 | x = np.random.rand(N_SAMPLES) 74 | y = np.random.rand(N_SAMPLES) 75 | 76 | 77 | # %% tags=[] 78 | def func(): 79 | for i in range(N_REPS): 80 | # py_func accesses the original python function, not the numba-optimized one 81 | # this is needed to be able to profile the function 82 | _cm.py_func(x, y) 83 | 84 | 85 | # %% tags=[] 86 | # %%timeit -n1 -r1 func() 87 | func() 88 | 89 | # %% tags=[] 90 | # %%prun -s cumulative -l 20 -T 01-n_samples_small.txt 91 | func() 92 | 93 | # %% [markdown] tags=[] 94 | # **No improvement** for this case. 95 | 96 | # %% [markdown] tags=[] 97 | # # Run with `n_samples` large 98 | 99 | # %% 100 | N_SAMPLES = 100000 101 | 102 | # %% 103 | x = np.random.rand(N_SAMPLES) 104 | y = np.random.rand(N_SAMPLES) 105 | 106 | 107 | # %% tags=[] 108 | def func(): 109 | for i in range(N_REPS): 110 | # py_func accesses the original python function, not the numba-optimized one 111 | # this is needed to be able to profile the function 112 | _cm.py_func(x, y) 113 | 114 | 115 | # %% tags=[] 116 | # %%timeit -n1 -r1 func() 117 | func() 118 | 119 | # %% tags=[] 120 | # %%prun -s cumulative -l 20 -T 01-n_samples_large.txt 121 | func() 122 | 123 | # %% [markdown] tags=[] 124 | # **Important improvement** for this case. `cdist_parts` takes now 0.370 percall instead of 0.824 (from reference). 
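# %% [markdown] tags=[]
# (Added sketch) for context, `cdist_parts` computes a cross-matrix of ARI values between two sets of partitions; the slow reference path visible in the profiles is equivalent to scipy's `cdist` with a Python callable:

# %% tags=[]
from scipy.spatial.distance import cdist
from sklearn.metrics import adjusted_rand_score

parts_x = np.random.randint(0, 3, size=(4, 100))  # 4 candidate partitions of 100 objects
parts_y = np.random.randint(0, 3, size=(4, 100))
ari_matrix = cdist(parts_x, parts_y, metric=adjusted_rand_score)
ari_matrix.max()  # ccc keeps the maximum over all partition pairs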
125 | 126 | # %% 127 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/py/00-run_reference.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # text_representation: 6 | # extension: .py 7 | # format_name: percent 8 | # format_version: '1.3' 9 | # jupytext_version: 1.11.5 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] 20 | # Creates a point of reference/comparison with non-optimized version of ccc. 21 | 22 | # %% [markdown] tags=[] 23 | # # Remove pycache dir 24 | 25 | # %% 26 | # !echo ${CODE_DIR} 27 | 28 | # %% 29 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -print 30 | 31 | # %% 32 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -exec rm -rf {} \; 33 | 34 | # %% 35 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -print 36 | 37 | # %% [markdown] tags=[] 38 | # # Modules 39 | 40 | # %% tags=[] 41 | import numpy as np 42 | 43 | from ccc.coef import _cm 44 | 45 | # %% [markdown] tags=[] 46 | # # Settings 47 | 48 | # %% 49 | N_REPS = 10 50 | 51 | # %% tags=[] 52 | np.random.seed(0) 53 | 54 | # %% [markdown] tags=[] 55 | # # Setup 56 | 57 | # %% 58 | # let numba compile all the code before profiling 59 | _cm.py_func(np.random.rand(10), np.random.rand(10)) 60 | 61 | # %% [markdown] tags=[] 62 | # # Run with `n_samples` small 63 | 64 | # %% 65 | N_SAMPLES = 100 66 | 67 | # %% 68 | x = np.random.rand(N_SAMPLES) 69 | y = np.random.rand(N_SAMPLES) 70 | 71 | 72 | # %% tags=[] 73 | def func(): 74 | for i in range(N_REPS): 75 | # py_func accesses the original python function, not the numba-optimized one 76 | # this is needed to be able to profile the function 77 | _cm.py_func(x, y) 78 | 79 | 80 | # %% tags=[] 81 | # %%timeit -n1 -r1 func() 82 | func() 83 | 84 | # %% tags=[] 85 | # %%prun -s cumulative -l 20 -T 00-n_samples_small.txt 86 | func() 87 | 88 | # %% [markdown] tags=[] 89 | # The bottleneck functions are, in order of importance: 90 | # 1. `cdist_parts` 91 | # 1. `_get_parts` 92 | 93 | # %% [markdown] tags=[] 94 | # # Run with `n_samples` large 95 | 96 | # %% 97 | N_SAMPLES = 100000 98 | 99 | # %% 100 | x = np.random.rand(N_SAMPLES) 101 | y = np.random.rand(N_SAMPLES) 102 | 103 | 104 | # %% tags=[] 105 | def func(): 106 | for i in range(N_REPS): 107 | # py_func accesses the original python function, not the numba-optimized one 108 | # this is needed to be able to profile the function 109 | _cm.py_func(x, y) 110 | 111 | 112 | # %% tags=[] 113 | # %%timeit -n1 -r1 func() 114 | func() 115 | 116 | # %% tags=[] 117 | # %%prun -s cumulative -l 20 -T 00-n_samples_large.txt 118 | func() 119 | 120 | # %% [markdown] tags=[] 121 | # The bottleneck functions now are **different**, in order of importance: 122 | # 1. `_get_parts` 123 | # 1. 
`cdist_parts` 124 | 125 | # %% 126 | -------------------------------------------------------------------------------- /nbs/99_manuscript/k_max/py/01-k_max-runs.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # notebook_metadata_filter: -jupytext.text_representation.jupytext_version 6 | # text_representation: 7 | # extension: .py 8 | # format_name: percent 9 | # format_version: '1.3' 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] tags=[] 20 | # Runs CCC with different values for parameter $k_{\mathrm{max}}$ to assess the constant baseline property empirically. 21 | 22 | # %% [markdown] tags=[] 23 | # # Modules loading 24 | 25 | # %% tags=[] 26 | import numpy as np 27 | import pandas as pd 28 | from tqdm import tqdm 29 | 30 | from ccc import conf 31 | from ccc.coef import ccc 32 | 33 | # %% [markdown] tags=[] 34 | # # Settings 35 | 36 | # %% tags=[] 37 | display(conf.GENERAL["N_JOBS"]) 38 | 39 | # %% tags=[] 40 | DATA_SIZES = [ 41 | 200, 42 | 600, 43 | 1800, 44 | ] 45 | 46 | # split data size in this many points 47 | K_MAX_N_SPLITS = 10 48 | 49 | # always include this value since it is the default we use in CCC 50 | DEFAULT_K_MAX = 10 51 | 52 | # N_REPS = 10 53 | 54 | # %% tags=[] 55 | np.random.seed(0) 56 | 57 | # %% [markdown] tags=[] 58 | # # Paths 59 | 60 | # %% tags=[] 61 | OUTPUT_DIR = conf.RESULTS_DIR / "k_max_test" 62 | OUTPUT_DIR.mkdir(parents=True, exist_ok=True) 63 | display(OUTPUT_DIR) 64 | 65 | # %% [markdown] tags=[] 66 | # # Run 67 | 68 | # %% tags=[] 69 | # initialize (i.e., compile with numba) 70 | ccc(np.random.rand(100), np.random.rand(100)) 71 | 72 | # %% tags=[] 73 | results = pd.DataFrame(columns=["data_size", "k_max", "k_max_as_n_fraction", "coef"]) 74 | 75 | idx = 0 76 | for data_size in tqdm(DATA_SIZES): 77 | # get the values for k_max to try... 78 | k_max_splits = np.linspace(2, data_size, K_MAX_N_SPLITS) 79 | # ... but always add the default k_max used by CCC 80 | k_max_splits = [int(i) for i in np.sort(np.append(k_max_splits, DEFAULT_K_MAX))] 81 | 82 | # generate random data 83 | # TODO: if I generate normal data, what happens? 
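    # (added note, editorial) the commented-out `rand` calls below are the uniform version;
    # this run answers the TODO by drawing normally distributed inputs instead, so the
    # empirical baseline check does not hinge on a uniform marginal distribution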
84 | # d1 = np.random.rand(data_size) 85 | # d2 = np.random.rand(data_size) 86 | d1 = np.random.normal(size=data_size) 87 | d2 = np.random.normal(size=data_size) 88 | 89 | for k_max in tqdm(k_max_splits): 90 | c = ccc(d1, d2, internal_n_clusters=k_max, n_jobs=conf.GENERAL["N_JOBS"]) 91 | 92 | results.loc[idx] = [data_size, k_max, k_max / data_size, c] 93 | idx += 1 94 | 95 | # save 96 | results.to_pickle(OUTPUT_DIR / "k_max-results.pkl") 97 | 98 | # %% [markdown] tags=[] 99 | # # Check 100 | 101 | # %% tags=[] 102 | results.shape 103 | 104 | # %% tags=[] 105 | assert results.shape[0] == int(len(DATA_SIZES) * (K_MAX_N_SPLITS + 1)) 106 | 107 | # %% tags=[] 108 | results.head() 109 | 110 | # %% tags=[] 111 | -------------------------------------------------------------------------------- /tests/data/README.md: -------------------------------------------------------------------------------- 1 | # Data used in unit tests 2 | 3 | ## Clustermatch data 4 | 5 | The `clustermatch-example-*.pkl` files were generated using the original clustermatch 6 | code (https://github.com/sinc-lab/clustermatch - Commit 8b66b3d7) plus the patch below: 7 | 8 | ```patch 9 | $ git diff 10 | diff --git a/clustermatch/cluster.py b/clustermatch/cluster.py 11 | index 9f7d06c..07e8192 100644 12 | --- a/clustermatch/cluster.py 13 | +++ b/clustermatch/cluster.py 14 | @@ -160,7 +160,7 @@ def _get_range_n_clusters(n_common_features, **kwargs): 15 | if internal_n_clusters is None: 16 | estimated_k = int(np.floor(np.sqrt(n_common_features))) 17 | estimated_k = np.min((estimated_k, 10)) 18 | - range_n_clusters = range(2, np.max((estimated_k, 3))) 19 | + range_n_clusters = range(2, np.max((estimated_k, 3))+1) 20 | elif isinstance(internal_n_clusters, (tuple, list, range)): 21 | range_n_clusters = internal_n_clusters 22 | elif isinstance(internal_n_clusters, int): 23 | @@ -211,7 +211,7 @@ def row_col_from_condensed_index(d,i): 24 | 25 | 26 | def _compute_ari(part1, part2): 27 | - if np.isnan(part1).any() or len(part1) == 0: 28 | + if np.isnan(part1).any() or np.isnan(part2).any() or len(part1) == 0 or len(part2) == 0: 29 | return 0.0 30 | 31 | return ari(part1, part2) 32 | ``` 33 | 34 | Then I moved to the git root directory and executed the following commands in ipython: 35 | 36 | ### Random data without NaN 37 | ```python 38 | from pathlib import Path 39 | 40 | import numpy as np 41 | import pandas as pd 42 | 43 | from clustermatch.cluster import calculate_simmatrix 44 | 45 | np.random.seed(0) 46 | random_data = pd.DataFrame(np.random.rand(20, 100)) 47 | 48 | OUTPUT_DIR = Path("/home/miltondp/projects/ccc/ccc/tests/data/") 49 | 50 | random_data.to_pickle(OUTPUT_DIR / "ccc-random_data-data.pkl") 51 | 52 | int_n_clusters = range(2, 10+1) 53 | cm_sim_matrix = calculate_simmatrix(random_data, internal_n_clusters=int_n_clusters, n_jobs=3) 54 | cm_sim_matrix.to_pickle(OUTPUT_DIR / "ccc-random_data-coef.pkl") 55 | ``` 56 | 57 | 58 | THIS IS WITH THE ORIGINAL DATA WITH NANS 59 | ### Tomato dataset used in the original clustermatch implementation (contains NaN) 60 | ```python 61 | from pathlib import Path 62 | 63 | import pandas as pd 64 | 65 | from clustermatch.cluster import calculate_simmatrix 66 | from clustermatch.utils.data import merge_sources 67 | 68 | data_files = ['experiments/tomato/data/real_sample.xlsx'] 69 | merged_sources, feature_names, sources_names = merge_sources(data_files) 70 | 71 | OUTPUT_DIR = Path("/home/miltondp/projects/ccc/ccc/tests/data/") 72 | 73 | merged_sources_final = merged_sources.apply(lambda x: 
pd.to_numeric(x, errors="coerce"), axis=1) 74 | merged_sources_final = merged_sources_final.dropna(how="all") 75 | merged_sources_final.to_pickle(OUTPUT_DIR / "ccc-example-data.pkl") 76 | 77 | int_n_clusters = range(2, 5) 78 | cm_sim_matrix = calculate_simmatrix(merged_sources_final, internal_n_clusters=int_n_clusters, n_jobs=3) 79 | cm_sim_matrix.to_pickle(OUTPUT_DIR / "ccc-example-coef.pkl") 80 | ``` 81 | -------------------------------------------------------------------------------- /tests/test_scipy_stats.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy import stats 3 | 4 | from ccc.scipy.stats import rank 5 | 6 | 7 | def test_rank_no_duplicates(): 8 | data = np.array([0, 10, 1, 5, 7, 8, -5, -2]) 9 | 10 | expected_ranks = stats.rankdata(data, "average") 11 | observed_ranks = rank(data) 12 | 13 | np.testing.assert_array_equal(observed_ranks, expected_ranks) 14 | 15 | 16 | def test_rank_one_duplicate_group(): 17 | data = np.array([0, 10, 1, 5, 7, 8, 1, -2]) 18 | 19 | expected_ranks = stats.rankdata(data, "average") 20 | observed_ranks = rank(data) 21 | 22 | np.testing.assert_array_equal(observed_ranks, expected_ranks) 23 | 24 | 25 | def test_rank_one_duplicate_group_with_more_elements(): 26 | data = np.array([0, 10, 1, 1, 7, 8, 1, -2]) 27 | 28 | expected_ranks = stats.rankdata(data, "average") 29 | observed_ranks = rank(data) 30 | 31 | np.testing.assert_array_equal(observed_ranks, expected_ranks) 32 | 33 | 34 | def test_rank_one_duplicate_group_at_beginning(): 35 | data = np.array([0, 0, 1, -10, 7, 8, 9.4, -2]) 36 | 37 | expected_ranks = stats.rankdata(data, "average") 38 | observed_ranks = rank(data) 39 | 40 | np.testing.assert_array_equal(observed_ranks, expected_ranks) 41 | 42 | 43 | def test_rank_one_duplicate_group_at_beginning_with_more_elements(): 44 | data = np.array([0.13, 0.13, 0.13, 1, -10, 7, 8, 9.4, -2]) 45 | 46 | expected_ranks = stats.rankdata(data, "average") 47 | observed_ranks = rank(data) 48 | 49 | np.testing.assert_array_equal(observed_ranks, expected_ranks) 50 | 51 | 52 | def test_rank_one_duplicate_group_at_beginning_are_smallest(): 53 | data = np.array([0, 10, 1.5, -99.5, -99.5, -99.5, 5, 7, 8, -5, -2]) 54 | 55 | expected_ranks = stats.rankdata(data, "average") 56 | observed_ranks = rank(data) 57 | 58 | np.testing.assert_array_equal(observed_ranks, expected_ranks) 59 | 60 | 61 | def test_rank_one_duplicate_group_at_end(): 62 | data = np.array([0, 1, -10, 7, 8, 9.4, -2.5, -2.5]) 63 | 64 | expected_ranks = stats.rankdata(data, "average") 65 | observed_ranks = rank(data) 66 | 67 | np.testing.assert_array_equal(observed_ranks, expected_ranks) 68 | 69 | 70 | def test_rank_one_duplicate_group_at_end_with_more_elements(): 71 | data = np.array([0, 1, -10, 7, 8, 9.4, -12.5, -12.5, -12.5]) 72 | 73 | expected_ranks = stats.rankdata(data, "average") 74 | observed_ranks = rank(data) 75 | 76 | np.testing.assert_array_equal(observed_ranks, expected_ranks) 77 | 78 | 79 | def test_rank_one_duplicate_group_at_end_is_the_largest(): 80 | data = np.array([0, 1, -10, 7, 8, 9.4, 120.5, 120.5, 120.5]) 81 | 82 | expected_ranks = stats.rankdata(data, "average") 83 | observed_ranks = rank(data) 84 | 85 | np.testing.assert_array_equal(observed_ranks, expected_ranks) 86 | 87 | 88 | def test_rank_all_are_duplicates(): 89 | data = np.array([1.5, 1.5, 1.5, 1.5]) 90 | 91 | expected_ranks = stats.rankdata(data, "average") 92 | observed_ranks = rank(data) 93 | 94 | np.testing.assert_array_equal(observed_ranks, expected_ranks) 
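

def test_rank_random_ties_match_scipy():
    # Added sketch (not part of the original suite): property-style check on random draws
    # with forced ties, assuming rank() keeps scipy's "average" tie-handling as above
    rng = np.random.RandomState(0)
    for _ in range(10):
        data = rng.randint(0, 5, size=20).astype(float)
        np.testing.assert_array_equal(rank(data), stats.rankdata(data, "average"))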
95 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/py/04-get_parts_v00.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # text_representation: 6 | # extension: .py 7 | # format_name: percent 8 | # format_version: '1.3' 9 | # jupytext_version: 1.11.5 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] 20 | # Now `cdist_parts` has been optimized with previous profiling tests. 21 | # 22 | # Here we profile function `_get_parts`. 23 | # 24 | # USING _cm not _cm.py_func 25 | 26 | # %% [markdown] 27 | # 28 | 29 | # %% [markdown] tags=[] 30 | # # Remove pycache dir 31 | 32 | # %% 33 | # !echo ${CODE_DIR} 34 | 35 | # %% 36 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -print 37 | 38 | # %% 39 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -exec rm -rf {} \; 40 | 41 | # %% 42 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -print 43 | 44 | # %% [markdown] tags=[] 45 | # # Modules 46 | 47 | # %% tags=[] 48 | import numpy as np 49 | 50 | from ccc.coef import _cm 51 | 52 | # %% [markdown] tags=[] 53 | # # Settings 54 | 55 | # %% 56 | N_REPS = 10 57 | 58 | # %% tags=[] 59 | np.random.seed(0) 60 | 61 | # %% [markdown] tags=[] 62 | # # Setup 63 | 64 | # %% 65 | # let numba compile all the code before profiling 66 | _cm.py_func(np.random.rand(10), np.random.rand(10)) 67 | 68 | # %% [markdown] tags=[] 69 | # # Run with `n_samples` small 70 | 71 | # %% 72 | N_SAMPLES = 100 73 | 74 | # %% 75 | x = np.random.rand(N_SAMPLES) 76 | y = np.random.rand(N_SAMPLES) 77 | 78 | 79 | # %% tags=[] 80 | def func(): 81 | for i in range(N_REPS): 82 | # py_func accesses the original python function, not the numba-optimized one 83 | # this is needed to be able to profile the function 84 | _cm.py_func(x, y) 85 | 86 | 87 | # %% tags=[] 88 | # %%timeit -n1 -r1 func() 89 | func() 90 | 91 | # %% tags=[] 92 | # %%prun -s cumulative -l 20 -T 04-n_samples_small.txt 93 | func() 94 | 95 | # %% [markdown] tags=[] 96 | # **No improvement** for this case with respect to reference. 97 | 98 | # %% [markdown] tags=[] 99 | # # Run with `n_samples` large 100 | 101 | # %% 102 | N_SAMPLES = 100000 103 | 104 | # %% 105 | x = np.random.rand(N_SAMPLES) 106 | y = np.random.rand(N_SAMPLES) 107 | 108 | 109 | # %% tags=[] 110 | def func(): 111 | for i in range(N_REPS): 112 | # py_func accesses the original python function, not the numba-optimized one 113 | # this is needed to be able to profile the function 114 | _cm.py_func(x, y) 115 | 116 | 117 | # %% tags=[] 118 | # %%timeit -n1 -r1 func() 119 | func() 120 | 121 | # %% tags=[] 122 | # %%prun -s cumulative -l 20 -T 04-n_samples_large.txt 123 | func() 124 | 125 | # %% [markdown] tags=[] 126 | # **No improvement** for this case. In fact, it's a bit worse compared with reference (10.568 tottime). 
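# %% [markdown] tags=[]
# (Added sketch) what `_get_parts` produces, conceptually: one quantile-based partition of the feature per candidate k (assumed behavior; the real implementation is numba-compiled):

# %% tags=[]
def get_parts_sketch(x, ks=range(2, 10 + 1)):
    # k-1 internal quantile cut points per k; np.digitize assigns labels 0..k-1
    return np.array(
        [np.digitize(x, np.quantile(x, np.linspace(0, 1, k + 1)[1:-1])) for k in ks]
    )


get_parts_sketch(x).shape  # -> (9, N_SAMPLES)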
127 | 128 | # %% 129 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/py/02-cdist_parts_v01.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # text_representation: 6 | # extension: .py 7 | # format_name: percent 8 | # format_version: '1.3' 9 | # jupytext_version: 1.11.5 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] 20 | # UPDATE: 21 | # 22 | # list changes here 23 | 24 | # %% [markdown] 25 | # ![image.png](attachment:0b015079-ce2b-4e6c-b2ea-22980d3c2f7d.png) 26 | 27 | # %% [markdown] tags=[] 28 | # # Remove pycache dir 29 | 30 | # %% 31 | # !echo ${CODE_DIR} 32 | 33 | # %% 34 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -print 35 | 36 | # %% 37 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -exec rm -rf {} \; 38 | 39 | # %% 40 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -print 41 | 42 | # %% [markdown] tags=[] 43 | # # Modules 44 | 45 | # %% tags=[] 46 | import numpy as np 47 | 48 | from ccc.coef import _cm 49 | 50 | # %% [markdown] tags=[] 51 | # # Settings 52 | 53 | # %% 54 | N_REPS = 10 55 | 56 | # %% tags=[] 57 | np.random.seed(0) 58 | 59 | # %% [markdown] tags=[] 60 | # # Setup 61 | 62 | # %% 63 | # let numba compile all the code before profiling 64 | _cm.py_func(np.random.rand(10), np.random.rand(10)) 65 | 66 | # %% [markdown] tags=[] 67 | # # Run with `n_samples` small 68 | 69 | # %% 70 | N_SAMPLES = 100 71 | 72 | # %% 73 | x = np.random.rand(N_SAMPLES) 74 | y = np.random.rand(N_SAMPLES) 75 | 76 | 77 | # %% tags=[] 78 | def func(): 79 | for i in range(N_REPS): 80 | # py_func accesses the original python function, not the numba-optimized one 81 | # this is needed to be able to profile the function 82 | _cm.py_func(x, y) 83 | 84 | 85 | # %% tags=[] 86 | # %%timeit -n1 -r1 func() 87 | func() 88 | 89 | # %% tags=[] 90 | # %%prun -s cumulative -l 20 -T 02-n_samples_small.txt 91 | func() 92 | 93 | # %% [markdown] tags=[] 94 | # **No improvement** for this case. 95 | 96 | # %% [markdown] tags=[] 97 | # # Run with `n_samples` large 98 | 99 | # %% 100 | N_SAMPLES = 100000 101 | 102 | # %% 103 | x = np.random.rand(N_SAMPLES) 104 | y = np.random.rand(N_SAMPLES) 105 | 106 | 107 | # %% tags=[] 108 | def func(): 109 | for i in range(N_REPS): 110 | # py_func accesses the original python function, not the numba-optimized one 111 | # this is needed to be able to profile the function 112 | _cm.py_func(x, y) 113 | 114 | 115 | # %% tags=[] 116 | # %%timeit -n1 -r1 func() 117 | func() 118 | 119 | # %% tags=[] 120 | # %%prun -s cumulative -l 20 -T 02-n_samples_large.txt 121 | func() 122 | 123 | # %% [markdown] tags=[] 124 | # **Important improvement** for this case. `cdist_parts` takes now 0.370 percall instead of 0.824 (from reference). 125 | # 126 | # **However**, compared with `v00` (0.370 per call), this one is slightly worse. 
127 | 128 | # %% 129 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/py/03-cdist_parts_v02.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # text_representation: 6 | # extension: .py 7 | # format_name: percent 8 | # format_version: '1.3' 9 | # jupytext_version: 1.11.5 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] 20 | # UPDATE: 21 | # 22 | # list changes here 23 | 24 | # %% [markdown] 25 | # ![image.png](attachment:bee5d958-22e0-4cd2-8667-9b29973604f7.png) 26 | 27 | # %% [markdown] tags=[] 28 | # # Remove pycache dir 29 | 30 | # %% 31 | # !echo ${CODE_DIR} 32 | 33 | # %% 34 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -print 35 | 36 | # %% 37 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -exec rm -rf {} \; 38 | 39 | # %% 40 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -print 41 | 42 | # %% [markdown] tags=[] 43 | # # Modules 44 | 45 | # %% tags=[] 46 | import numpy as np 47 | 48 | from ccc.coef import _cm 49 | 50 | # %% [markdown] tags=[] 51 | # # Settings 52 | 53 | # %% 54 | N_REPS = 10 55 | 56 | # %% tags=[] 57 | np.random.seed(0) 58 | 59 | # %% [markdown] tags=[] 60 | # # Setup 61 | 62 | # %% 63 | # let numba compile all the code before profiling 64 | _cm.py_func(np.random.rand(10), np.random.rand(10)) 65 | 66 | # %% [markdown] tags=[] 67 | # # Run with `n_samples` small 68 | 69 | # %% 70 | N_SAMPLES = 100 71 | 72 | # %% 73 | x = np.random.rand(N_SAMPLES) 74 | y = np.random.rand(N_SAMPLES) 75 | 76 | 77 | # %% tags=[] 78 | def func(): 79 | for i in range(N_REPS): 80 | # py_func accesses the original python function, not the numba-optimized one 81 | # this is needed to be able to profile the function 82 | _cm.py_func(x, y) 83 | 84 | 85 | # %% tags=[] 86 | # %%timeit -n1 -r1 func() 87 | func() 88 | 89 | # %% tags=[] 90 | # %%prun -s cumulative -l 20 -T 03-n_samples_small.txt 91 | func() 92 | 93 | # %% [markdown] tags=[] 94 | # **No improvement** for this case. 95 | 96 | # %% [markdown] tags=[] 97 | # # Run with `n_samples` large 98 | 99 | # %% 100 | N_SAMPLES = 100000 101 | 102 | # %% 103 | x = np.random.rand(N_SAMPLES) 104 | y = np.random.rand(N_SAMPLES) 105 | 106 | 107 | # %% tags=[] 108 | def func(): 109 | for i in range(N_REPS): 110 | # py_func accesses the original python function, not the numba-optimized one 111 | # this is needed to be able to profile the function 112 | _cm.py_func(x, y) 113 | 114 | 115 | # %% tags=[] 116 | # %%timeit -n1 -r1 func() 117 | func() 118 | 119 | # %% tags=[] 120 | # %%prun -s cumulative -l 20 -T 03-n_samples_large.txt 121 | func() 122 | 123 | # %% [markdown] tags=[] 124 | # **Important improvement** for this case. `cdist_parts` takes now 0.370 percall instead of 0.824 (from reference). 125 | # 126 | # **However**, compared with `v00` (0.370 per call) or `v01` (0.385), this one does not change. 
127 | 128 | # %% 129 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/py/05-get_parts_v01.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # text_representation: 6 | # extension: .py 7 | # format_name: percent 8 | # format_version: '1.3' 9 | # jupytext_version: 1.11.5 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] 20 | # Now `cdist_parts` has been optimized with previous profiling tests. 21 | # 22 | # Here we profile the `_get_parts` function. 23 | # 24 | # Here I disabled njit in `_get_parts` and `run_quantile_clustering` to be able to profile. 25 | 26 | # %% [markdown] 27 | # 28 | 29 | # %% [markdown] tags=[] 30 | # # Remove pycache dir 31 | 32 | # %% 33 | # !echo ${CODE_DIR} 34 | 35 | # %% 36 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -print 37 | 38 | # %% 39 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -exec rm -rf {} \; 40 | 41 | # %% 42 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -print 43 | 44 | # %% [markdown] tags=[] 45 | # # Modules 46 | 47 | # %% tags=[] 48 | import numpy as np 49 | 50 | from ccc.coef import _cm 51 | 52 | # %% [markdown] tags=[] 53 | # # Settings 54 | 55 | # %% 56 | N_REPS = 10 57 | 58 | # %% tags=[] 59 | np.random.seed(0) 60 | 61 | # %% [markdown] tags=[] 62 | # # Setup 63 | 64 | # %% 65 | # let numba compile all the code before profiling 66 | _cm.py_func(np.random.rand(10), np.random.rand(10)) 67 | 68 | # %% [markdown] tags=[] 69 | # # Run with `n_samples` small 70 | 71 | # %% 72 | N_SAMPLES = 100 73 | 74 | # %% 75 | x = np.random.rand(N_SAMPLES) 76 | y = np.random.rand(N_SAMPLES) 77 | 78 | 79 | # %% tags=[] 80 | def func(): 81 | for i in range(N_REPS): 82 | # py_func accesses the original python function, not the numba-optimized one 83 | # this is needed to be able to profile the function 84 | _cm.py_func(x, y) 85 | 86 | 87 | # %% tags=[] 88 | # %%timeit -n1 -r1 func() 89 | func() 90 | 91 | # %% tags=[] 92 | # %%prun -s cumulative -l 20 -T 05-n_samples_small.txt 93 | func() 94 | 95 | # %% [markdown] tags=[] 96 | # In this case (small number of samples), `cdist_parts` is still the most time-consuming function, followed by `rank` (`tottime`). 97 | 98 | # %% [markdown] tags=[] 99 | # # Run with `n_samples` large 100 | 101 | # %% 102 | N_SAMPLES = 100000 103 | 104 | # %% 105 | x = np.random.rand(N_SAMPLES) 106 | y = np.random.rand(N_SAMPLES) 107 | 108 | 109 | # %% tags=[] 110 | def func(): 111 | for i in range(N_REPS): 112 | # py_func accesses the original python function, not the numba-optimized one 113 | # this is needed to be able to profile the function 114 | _cm.py_func(x, y) 115 | 116 | 117 | # %% tags=[] 118 | # %%timeit -n1 -r1 func() 119 | func() 120 | 121 | # %% tags=[] 122 | # %%prun -s cumulative -l 20 -T 05-n_samples_large.txt 123 | func() 124 | 125 | # %% [markdown] tags=[] 126 | # `rank` is the function that needs optimization.
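# %% [markdown] tags=[]
# A sketch of the kind of replacement explored in the next notebook: a simple argsort-based `rank` (illustrative, not the repo's exact implementation) against `scipy.stats.rankdata`. With continuous random data, ties are essentially absent and both agree:

# %% tags=[]
import numpy as np
from scipy.stats import rankdata


def argsort_rank(data):
    # place rank i+1 at the position of the i-th smallest value;
    # unlike rankdata's default "average" method, this does not handle ties
    ranks = np.empty(data.shape[0], dtype=np.float64)
    ranks[np.argsort(data)] = np.arange(1, data.shape[0] + 1)
    return ranks


values = np.random.rand(100000)
assert np.allclose(argsort_rank(values), rankdata(values))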
127 | 128 | # %% 129 | -------------------------------------------------------------------------------- /nbs/25_pvalue/py/01-ccc_pvalue_dist-generate-gene_pairs.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # notebook_metadata_filter: -jupytext.text_representation.jupytext_version 6 | # text_representation: 7 | # extension: .py 8 | # format_name: percent 9 | # format_version: '1.3' 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] tags=[] 20 | # Generates a distribution of pvalues under the null hypothesis of no association. 21 | # 22 | # This notebook uses individual gene pairs as input for CCC and parallelizes permutations. 23 | 24 | # %% [markdown] tags=[] 25 | # # Modules loading 26 | 27 | # %% tags=[] 28 | import numpy as np 29 | from joblib import Parallel, delayed 30 | 31 | from ccc.coef import ccc 32 | from ccc import conf 33 | 34 | # %% [markdown] tags=[] 35 | # # Settings 36 | 37 | # %% tags=[] 38 | rs = np.random.RandomState(0) 39 | 40 | # %% tags=[] 41 | N_JOBS = 1 42 | display(N_JOBS) 43 | 44 | PVALUE_N_JOBS = conf.GENERAL["N_JOBS"] 45 | display(PVALUE_N_JOBS) 46 | 47 | # %% tags=[] 48 | DATA_N_OBJS, DATA_N_FEATURES = 100, 1000 49 | PVALUE_N_PERMS = 1000 50 | 51 | # %% [markdown] tags=[] 52 | # # Paths 53 | 54 | # %% tags=[] 55 | OUTPUT_DIR = conf.RESULTS_DIR / "ccc_null-pvalues" 56 | OUTPUT_DIR.mkdir(parents=True, exist_ok=True) 57 | 58 | # %% tags=[] 59 | OUTPUT_DIR 60 | 61 | # %% [markdown] tags=[] 62 | # # Generate random data 63 | 64 | # %% tags=[] 65 | data = rs.rand(DATA_N_OBJS, DATA_N_FEATURES) 66 | 67 | # %% tags=[] 68 | data.shape 69 | 70 | 71 | # %% [markdown] tags=[] 72 | # # Run CCC 73 | 74 | # %% tags=[] 75 | def ccc_single(x, y): 76 | return ccc(x, y, pvalue_n_perms=PVALUE_N_PERMS, n_jobs=PVALUE_N_JOBS) 77 | 78 | 79 | # %% tags=[] 80 | results = Parallel(n_jobs=N_JOBS)( 81 | delayed(ccc_single)(data[i], data[j]) 82 | for i in range(data.shape[0] - 1) 83 | for j in range(i + 1, data.shape[0]) 84 | ) 85 | 86 | # %% tags=[] 87 | assert len(results) == (DATA_N_OBJS * (DATA_N_OBJS - 1)) / 2 88 | 89 | # %% tags=[] 90 | results[0] 91 | 92 | # %% tags=[] 93 | cm_values = [x[0] for x in results] 94 | 95 | # %% tags=[] 96 | cm_pvalues = [x[1] for x in results] 97 | 98 | # %% tags=[] 99 | assert len(cm_values) == len(cm_pvalues) 100 | assert len(cm_values) == (DATA_N_OBJS * (DATA_N_OBJS - 1)) / 2 101 | 102 | # %% tags=[] 103 | cm_values = np.array(cm_values) 104 | cm_pvalues = np.array(cm_pvalues) 105 | 106 | # %% tags=[] 107 | cm_values.shape 108 | 109 | # %% tags=[] 110 | cm_values 111 | 112 | # %% tags=[] 113 | cm_pvalues.shape 114 | 115 | # %% tags=[] 116 | cm_pvalues 117 | 118 | # %% [markdown] tags=[] 119 | # # Save 120 | 121 | # %% tags=[] 122 | output_file = OUTPUT_DIR / "gene_pairs-cm_values.npy" 123 | display(output_file) 124 | 125 | np.save(output_file, cm_values) 126 | 127 | # %% tags=[] 128 | output_file = OUTPUT_DIR / "gene_pairs-cm_pvalues.npy" 129 | display(output_file) 130 | 131 | np.save(output_file, cm_pvalues) 132 | 133 | # %% tags=[] 134 | -------------------------------------------------------------------------------- /libs/ccc/corr.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functions to compute different 
correlation coefficients. 3 | 4 | All correlation functions in this module are expected to have the same input and output 5 | structure: 6 | 7 | * The input is a pandas DataFrame with genes in rows (Ensembl IDs) and samples 8 | in columns. The values are gene expression data normalized with some technique, 9 | but that should not be relevant for the correlation method. No empty values 10 | are allowed. 11 | 12 | * The output is a pandas DataFrame, a symmetric correlation matrix with genes 13 | in rows and columns (Ensembl IDs), and the values are the correlation 14 | coefficients. Diagonal values are expected to be ones. 15 | """ 16 | import pandas as pd 17 | import numpy as np 18 | from sklearn.metrics import pairwise_distances 19 | 20 | 21 | def pearson(data: pd.DataFrame) -> pd.DataFrame: 22 | """ 23 | Compute the Pearson correlation coefficient. 24 | """ 25 | corr_mat = 1 - pairwise_distances(data.to_numpy(), metric="correlation", n_jobs=1) 26 | 27 | np.fill_diagonal(corr_mat, 1.0) 28 | 29 | return pd.DataFrame( 30 | corr_mat, 31 | index=data.index.copy(), 32 | columns=data.index.copy(), 33 | ) 34 | 35 | 36 | def spearman(data: pd.DataFrame) -> pd.DataFrame: 37 | """ 38 | Compute the Spearman correlation coefficient. 39 | """ 40 | # compute ranks 41 | data = data.rank(axis=1) 42 | 43 | corr_mat = 1 - pairwise_distances(data.to_numpy(), metric="correlation", n_jobs=1) 44 | 45 | np.fill_diagonal(corr_mat, 1.0) 46 | 47 | return pd.DataFrame( 48 | corr_mat, 49 | index=data.index.copy(), 50 | columns=data.index.copy(), 51 | ) 52 | 53 | 54 | def mic(data: pd.DataFrame, estimator="mic_approx", n_jobs=None) -> pd.DataFrame: 55 | """ 56 | Compute the Maximal Information Coefficient (MIC). 57 | """ 58 | from scipy.spatial.distance import squareform 59 | from minepy import pstats 60 | from ccc.methods import mic as mic_single 61 | 62 | if n_jobs is None: 63 | corr_mat = pstats( 64 | data.to_numpy(), 65 | est=estimator, 66 | )[0] 67 | 68 | corr_mat = squareform(corr_mat) 69 | else: 70 | corr_mat = pairwise_distances(data.to_numpy(), metric=mic_single, n_jobs=n_jobs) 71 | 72 | np.fill_diagonal(corr_mat, 1.0) 73 | 74 | return pd.DataFrame( 75 | corr_mat, 76 | index=data.index.copy(), 77 | columns=data.index.copy(), 78 | ) 79 | 80 | 81 | def ccc(data: pd.DataFrame, internal_n_clusters=None, n_jobs=1) -> pd.DataFrame: 82 | """ 83 | Compute the Clustermatch Correlation Coefficient (CCC).
84 | """ 85 | from scipy.spatial.distance import squareform 86 | from ccc.coef import ccc 87 | 88 | corr_mat = ccc( 89 | data.to_numpy(), 90 | internal_n_clusters=internal_n_clusters, 91 | n_jobs=n_jobs, 92 | ) 93 | 94 | corr_mat = squareform(corr_mat) 95 | np.fill_diagonal(corr_mat, 1.0) 96 | 97 | return pd.DataFrame( 98 | corr_mat, 99 | index=data.index.copy(), 100 | columns=data.index.copy(), 101 | ) 102 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/py/07-get_parts_v03.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # text_representation: 6 | # extension: .py 7 | # format_name: percent 8 | # format_version: '1.3' 9 | # jupytext_version: 1.11.5 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] 20 | # Now `cdist_parts` has been optimized with previous profiling tests. 21 | # 22 | # Here we profile the `_get_parts` function. 23 | # 24 | # Here I try a completely new `rank` function. 25 | # I'm also trying a slightly different `run_quantile_clustering`, given the changes to `rank`. 26 | # I'm also parallelizing `_get_parts` inside `_cm`. 27 | 28 | # %% [markdown] tags=[] 29 | # # Remove pycache dir 30 | 31 | # %% 32 | # !echo ${CODE_DIR} 33 | 34 | # %% 35 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -print 36 | 37 | # %% 38 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -exec rm -rf {} \; 39 | 40 | # %% 41 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -print 42 | 43 | # %% [markdown] tags=[] 44 | # # Modules 45 | 46 | # %% tags=[] 47 | import numpy as np 48 | 49 | from ccc.coef import _cm 50 | 51 | # %% [markdown] tags=[] 52 | # # Settings 53 | 54 | # %% 55 | N_REPS = 10 56 | 57 | # %% tags=[] 58 | np.random.seed(0) 59 | 60 | # %% [markdown] tags=[] 61 | # # Setup 62 | 63 | # %% 64 | # let numba compile all the code before profiling 65 | _cm(np.random.rand(10), np.random.rand(10)) 66 | 67 | # %% [markdown] tags=[] 68 | # # Run with `n_samples` small 69 | 70 | # %% 71 | N_SAMPLES = 100 72 | 73 | # %% 74 | x = np.random.rand(N_SAMPLES) 75 | y = np.random.rand(N_SAMPLES) 76 | 77 | 78 | # %% tags=[] 79 | def func(): 80 | for i in range(N_REPS): 81 | # unlike the earlier notebooks in this series, `_cm` is called 82 | # directly here instead of through `py_func` 83 | _cm(x, y) 84 | 85 | 86 | # %% tags=[] 87 | # %%timeit -n1 -r4 func() 88 | func() 89 | 90 | # %% tags=[] 91 | # %%prun -s cumulative -l 20 -T 07-n_samples_small.txt 92 | func() 93 | 94 | # %% [markdown] tags=[] 95 | # In this case (small number of samples), `cdist_parts` is still the most time-consuming function, followed by `rank` (`tottime`).
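# %% [markdown] tags=[]
# The parallelization mentioned in the description follows numba's standard pattern: decorate with `parallel=True` and replace the outer loop's `range` with `prange`. A toy sketch of that pattern (illustrative only, not the actual `_cm`/`_get_parts` code):

# %% tags=[]
import numpy as np
from numba import njit, prange


@njit(parallel=True)
def row_stats(data):
    # each row is processed by a different thread, the same structure
    # used to compute the partitions for all features at once
    out = np.empty(data.shape[0])
    for i in prange(data.shape[0]):
        out[i] = data[i].sum()
    return out


row_stats(np.random.rand(100, 1000))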
96 | 97 | # %% [markdown] tags=[] 98 | # # Run with `n_samples` large 99 | 100 | # %% 101 | N_SAMPLES = 100000 102 | 103 | # %% 104 | x = np.random.rand(N_SAMPLES) 105 | y = np.random.rand(N_SAMPLES) 106 | 107 | 108 | # %% tags=[] 109 | def func(): 110 | for i in range(N_REPS): 111 | # unlike the earlier notebooks in this series, `_cm` is called 112 | # directly here instead of through `py_func` 113 | _cm(x, y) 114 | 115 | 116 | # %% tags=[] 117 | # %%timeit -n1 -r4 func() 118 | func() 119 | 120 | # %% tags=[] 121 | # %%prun -s cumulative -l 20 -T 07-n_samples_large.txt 122 | func() 123 | 124 | # %% [markdown] tags=[] 125 | # **Large improvement** using a new `rank` function and parallelizing the calls to `_get_parts` from `_cm`. 126 | 127 | # %% 128 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/10_cm_optimized/py/06-get_parts_v02.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # text_representation: 6 | # extension: .py 7 | # format_name: percent 8 | # format_version: '1.3' 9 | # jupytext_version: 1.11.5 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] 20 | # Now `cdist_parts` has been optimized with previous profiling tests. 21 | # 22 | # Here we profile the `_get_parts` function. 23 | # 24 | # Here I disabled njit in `_get_parts` and `run_quantile_clustering` to be able to profile. 25 | # 26 | # Here I tried `scipy.stats.rankdata` instead of the `rank` function I wrote. 27 | 28 | # %% [markdown] 29 | # 30 | 31 | # %% [markdown] tags=[] 32 | # # Remove pycache dir 33 | 34 | # %% 35 | # !echo ${CODE_DIR} 36 | 37 | # %% 38 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -print 39 | 40 | # %% 41 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -exec rm -rf {} \; 42 | 43 | # %% 44 | # !find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -print 45 | 46 | # %% [markdown] tags=[] 47 | # # Modules 48 | 49 | # %% tags=[] 50 | import numpy as np 51 | 52 | from ccc.coef import _cm 53 | 54 | # %% [markdown] tags=[] 55 | # # Settings 56 | 57 | # %% 58 | N_REPS = 10 59 | 60 | # %% tags=[] 61 | np.random.seed(0) 62 | 63 | # %% [markdown] tags=[] 64 | # # Setup 65 | 66 | # %% 67 | # let numba compile all the code before profiling 68 | _cm(np.random.rand(10), np.random.rand(10)) 69 | 70 | # %% [markdown] tags=[] 71 | # # Run with `n_samples` small 72 | 73 | # %% 74 | N_SAMPLES = 100 75 | 76 | # %% 77 | x = np.random.rand(N_SAMPLES) 78 | y = np.random.rand(N_SAMPLES) 79 | 80 | 81 | # %% tags=[] 82 | def func(): 83 | for i in range(N_REPS): 84 | # njit was disabled in `_get_parts` and `run_quantile_clustering`, so 85 | # calling `_cm` directly still lets the profiler see inside them 86 | _cm(x, y) 87 | 88 | 89 | # %% tags=[] 90 | # %%timeit -n1 -r1 func() 91 | func() 92 | 93 | # %% tags=[] 94 | # %%prun -s cumulative -l 20 -T 06-n_samples_small.txt 95 | func() 96 | 97 | # %% [markdown] tags=[] 98 | # In this case (small number of samples), `cdist_parts` is still the most time-consuming function, followed by `rank` (`tottime`).
99 | 100 | # %% [markdown] tags=[] 101 | # # Run with `n_samples` large 102 | 103 | # %% 104 | N_SAMPLES = 100000 105 | 106 | # %% 107 | x = np.random.rand(N_SAMPLES) 108 | y = np.random.rand(N_SAMPLES) 109 | 110 | 111 | # %% tags=[] 112 | def func(): 113 | for i in range(N_REPS): 114 | # njit was disabled in `_get_parts` and `run_quantile_clustering`, so 115 | # calling `_cm` directly still lets the profiler see inside them 116 | _cm(x, y) 117 | 118 | 119 | # %% tags=[] 120 | # %%timeit -n1 -r1 func() 121 | func() 122 | 123 | # %% tags=[] 124 | # %%prun -s cumulative -l 20 -T 06-n_samples_large.txt 125 | func() 126 | 127 | # %% [markdown] tags=[] 128 | # **Large improvement** using the `scipy.stats.rankdata` function. The current `rank` function needs optimization. 129 | 130 | # %% 131 | -------------------------------------------------------------------------------- /scripts/run_docker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # It runs the Docker container of this project by mounting the code and 4 | # manuscript directories inside the container. This ensures that any file created 5 | # during the execution is locally available and ready to be pushed to the repo. 6 | # Plus, the code is always run inside the same environment (including the full 7 | # operating system). 8 | # 9 | # We assume the repo code is in the current directory, so the user has to make 10 | # sure this is right. 11 | 12 | # general settings 13 | DOCKER_IMAGE_NAMESPACE="miltondp" 14 | DOCKER_IMAGE_NAME="ccc" 15 | DOCKER_TAG="${CM_DOCKER_IMAGE_TAG:-latest}" 16 | DOCKER_PUBLISH_HOST="127.0.0.1" 17 | DOCKER_CONTAINER_PORT="8893" 18 | DOCKER_HOST_PORT="8893" 19 | 20 | # project-specific environment variables 21 | ROOT_DIR="${CM_ROOT_DIR}" 22 | MANUSCRIPT_DIR="${CM_MANUSCRIPT_DIR}" 23 | N_JOBS_VARNAME="CM_N_JOBS" 24 | N_JOBS=${!N_JOBS_VARNAME} 25 | 26 | # parameters parsing 27 | # read arguments 28 | POSITIONAL_ARGS=() 29 | 30 | while [[ $# -gt 0 ]]; do 31 | case $1 in 32 | --docker-args) 33 | DOCKER_ARGS="$2" 34 | shift # past argument 35 | shift # past value 36 | ;; 37 | *) 38 | POSITIONAL_ARGS+=("$1") # save positional arg 39 | shift # past argument 40 | ;; 41 | esac 42 | done 43 | 44 | set -- "${POSITIONAL_ARGS[@]}" # restore positional parameters 45 | 46 | 47 | 48 | CODE_DIR=`pwd` 49 | 50 | # root dir 51 | if [ -z "${ROOT_DIR}" ]; then 52 | ROOT_DIR="${CODE_DIR}/base" 53 | fi 54 | 55 | # manuscript dir 56 | if [ -z "${MANUSCRIPT_DIR}" ]; then 57 | MANUSCRIPT_DIR="/tmp/${DOCKER_IMAGE_NAME}_manuscript" 58 | mkdir -p ${MANUSCRIPT_DIR} 59 | fi 60 | 61 | if [ -z "${N_JOBS}" ]; then 62 | N_JOBS=1 63 | fi 64 | 65 | echo "Configuration:" 66 | echo "  Code dir: ${CODE_DIR}" 67 | echo "  Root dir: ${ROOT_DIR}" 68 | echo "  Manuscript dir: ${MANUSCRIPT_DIR}" 69 | echo "  CPU cores: ${N_JOBS}" 70 | echo "  Docker image tag: ${DOCKER_TAG}" 71 | 72 | echo 73 | echo "Waiting 2 seconds before starting" 74 | echo 75 | sleep 2 76 | 77 | # always create data directory before running Docker 78 | mkdir -p ${ROOT_DIR} 79 | 80 | COMMAND="$@" 81 | PORT_ARG="-p ${DOCKER_PUBLISH_HOST}:${DOCKER_HOST_PORT}:${DOCKER_CONTAINER_PORT}" 82 | if [ -z "${COMMAND}" ]; then 83 | FULL_COMMAND=() 84 | else 85 | FULL_COMMAND=(/bin/bash -c "${COMMAND}") 86 | PORT_ARG="" 87 | fi 88 | 89 | echo "Full command: ${FULL_COMMAND[*]}" 90 | 91 | if [ -z "${DOCKER_ARGS}" ]; then 92 | # by default, use interactive mode (enables cancelling the run with Ctrl+C from the console) 93 | DOCKER_ARGS="-ti" 94 | fi 95 |
96 | # show commands being executed 97 | echo 98 | set -x 99 | 100 | # run 101 | docker run --rm ${PORT_ARG} ${DOCKER_ARGS} \ 102 | -e ${N_JOBS_VARNAME}=${N_JOBS} \ 103 | -e NUMBA_NUM_THREADS=${N_JOBS} \ 104 | -e MKL_NUM_THREADS=${N_JOBS} \ 105 | -e OPEN_BLAS_NUM_THREADS=${N_JOBS} \ 106 | -e NUMEXPR_NUM_THREADS=${N_JOBS} \ 107 | -e OMP_NUM_THREADS=${N_JOBS} \ 108 | -e CM_RUN_NBS_OVERRIDE=${CM_RUN_NBS_OVERRIDE:-0} \ 109 | -v "${CODE_DIR}:/opt/code" \ 110 | -v "${ROOT_DIR}:/opt/data" \ 111 | -v "${MANUSCRIPT_DIR}:/opt/manuscript" \ 112 | --user "$(id -u):$(id -g)" \ 113 | ${DOCKER_IMAGE_NAMESPACE}/${DOCKER_IMAGE_NAME}:${DOCKER_TAG} "${FULL_COMMAND[@]}" 114 | 115 | -------------------------------------------------------------------------------- /nbs/20_comparison_others/py/60-time_test-1_cpu_core.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # notebook_metadata_filter: -jupytext.text_representation.jupytext_version 6 | # text_representation: 7 | # extension: .py 8 | # format_name: percent 9 | # format_version: '1.3' 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] tags=[] 20 | # It generates random variables of varying sizes to compare the time taken by CCC and MIC. 21 | # 22 | # This notebook uses 1 CPU core. 23 | 24 | # %% [markdown] tags=[] 25 | # # Modules loading 26 | 27 | # %% [markdown] tags=[] 28 | # Make sure only one core is used everywhere. 29 | 30 | # %% tags=[] 31 | # %env CM_N_JOBS=1 32 | # %env NUMBA_NUM_THREADS=1 33 | # %env MKL_NUM_THREADS=1 34 | # %env OPEN_BLAS_NUM_THREADS=1 35 | # %env NUMEXPR_NUM_THREADS=1 36 | # %env OMP_NUM_THREADS=1 37 | 38 | # %% tags=[] 39 | from time import time 40 | 41 | import numpy as np 42 | import pandas as pd 43 | from scipy.stats import pearsonr, spearmanr 44 | 45 | from ccc import conf 46 | from ccc.coef import ccc 47 | from ccc.methods import mic 48 | 49 | # %% [markdown] tags=[] 50 | # # Settings 51 | 52 | # %% tags=[] 53 | OUTPUT_FILENAME = "time_test.pkl" 54 | 55 | # %% tags=[] 56 | DATA_SIZES = [ 57 | 100, 58 | 500, 59 | 1000, 60 | 5000, 61 | 10000, 62 | 50000, 63 | 100000, 64 | 1000000, 65 | 10000000, 66 | ] 67 | 68 | N_REPS = 10 69 | 70 | # %% tags=[] 71 | np.random.seed(0) 72 | 73 | # %% [markdown] tags=[] 74 | # # Paths 75 | 76 | # %% tags=[] 77 | OUTPUT_DIR = conf.RESULTS_DIR / "time_test" 78 | OUTPUT_DIR.mkdir(parents=True, exist_ok=True) 79 | display(OUTPUT_DIR) 80 | 81 | # %% [markdown] tags=[] 82 | # # Functions 83 | 84 | # %% tags=[] 85 | time_results = pd.DataFrame(columns=["data_size", "method", "time", "sim"]) 86 | 87 | 88 | # %% tags=[] 89 | def run_method(func, method_name, size): 90 | n_reps = N_REPS 91 | if size < 500: 92 | n_reps = 1000 93 | 94 | for r in range(n_reps): 95 | d1 = np.random.rand(size) 96 | d2 = np.random.rand(size) 97 | 98 | start_time = time() 99 | sim = func(d1, d2) 100 | end_time = time() 101 | met_time = end_time - start_time 102 | 103 | idx = time_results.shape[0] 104 | time_results.loc[idx] = [d1.shape[0], method_name, met_time, sim] 105 | 106 | 107 | # %% [markdown] tags=[] 108 | # # Run 109 | 110 | # %% tags=[] 111 | # initialize methods 112 | ccc(np.random.rand(100), np.random.rand(100)) 113 | 114 | # %% tags=[] 115 | for s in DATA_SIZES: 116 | print(f"Size: {s}") 117 | 118 | print(" p") 119 |
run_method(lambda x, y: pearsonr(x, y)[0], "p-1", s) 120 | 121 | print(" s") 122 | run_method(lambda x, y: spearmanr(x, y)[0], "s-1", s) 123 | 124 | print(" cm") 125 | run_method(lambda x, y: ccc(x, y), "cm-1", s) 126 | 127 | if s <= 50000: 128 | print(" mic_e") 129 | run_method(lambda x, y: mic(x, y, estimator="mic_e"), "mic_e-1", s) 130 | 131 | if s <= 10000: 132 | print(" mic") 133 | run_method(lambda x, y: mic(x, y), "mic-1", s) 134 | 135 | print("Saving to pickle") 136 | time_results.to_pickle(OUTPUT_DIR / OUTPUT_FILENAME) 137 | 138 | print("\n") 139 | 140 | # %% [markdown] tags=[] 141 | # # Summary of results 142 | 143 | # %% tags=[] 144 | time_results.shape 145 | 146 | # %% tags=[] 147 | time_results.head() 148 | 149 | # %% tags=[] 150 | -------------------------------------------------------------------------------- /nbs/20_comparison_others/py/61-time_test-3_cpu_cores.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # notebook_metadata_filter: -jupytext.text_representation.jupytext_version 6 | # text_representation: 7 | # extension: .py 8 | # format_name: percent 9 | # format_version: '1.3' 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] tags=[] 20 | # It generates random variables of varying sizes to compare the time taken by CCC and MIC. 21 | # 22 | # This notebook uses 3 CPU cores. 23 | 24 | # %% [markdown] tags=[] 25 | # # Modules loading 26 | 27 | # %% tags=[] 28 | # %env CM_N_JOBS=3 29 | # %env NUMBA_NUM_THREADS=3 30 | # %env MKL_NUM_THREADS=3 31 | # %env OPEN_BLAS_NUM_THREADS=3 32 | # %env NUMEXPR_NUM_THREADS=3 33 | # %env OMP_NUM_THREADS=3 34 | 35 | # %% tags=[] 36 | import os 37 | from time import time 38 | 39 | import numpy as np 40 | import pandas as pd 41 | from scipy.stats import pearsonr, spearmanr 42 | 43 | from ccc import conf 44 | from ccc.coef import ccc 45 | from ccc.methods import mic 46 | 47 | # %% [markdown] tags=[] 48 | # # Settings 49 | 50 | # %% tags=[] 51 | N_JOBS = int(os.environ["CM_N_JOBS"]) 52 | display(N_JOBS) 53 | 54 | # %% tags=[] 55 | OUTPUT_FILENAME = "time_test.pkl" 56 | 57 | # %% tags=[] 58 | DATA_SIZES = [ 59 | 100, 60 | 500, 61 | 1000, 62 | 5000, 63 | 10000, 64 | 50000, 65 | 100000, 66 | 1000000, 67 | 10000000, 68 | ] 69 | 70 | N_REPS = 10 71 | 72 | # %% tags=[] 73 | np.random.seed(0) 74 | 75 | # %% [markdown] tags=[] 76 | # # Paths 77 | 78 | # %% tags=[] 79 | OUTPUT_DIR = conf.RESULTS_DIR / "time_test" 80 | OUTPUT_DIR.mkdir(parents=True, exist_ok=True) 81 | display(OUTPUT_DIR) 82 | 83 | # %% [markdown] tags=[] 84 | # # Functions 85 | 86 | # %% tags=[] 87 | # append to previous run 88 | time_results = pd.read_pickle(OUTPUT_DIR / OUTPUT_FILENAME) 89 | 90 | # %% tags=[] 91 | time_results.shape 92 | 93 | 94 | # %% tags=[] 95 | def run_method(func, method_name, size): 96 | n_reps = N_REPS 97 | if size < 500: 98 | n_reps = 1000 99 | 100 | for r in range(n_reps): 101 | d1 = np.random.rand(size) 102 | d2 = np.random.rand(size) 103 | 104 | start_time = time() 105 | sim = func(d1, d2) 106 | end_time = time() 107 | met_time = end_time - start_time 108 | 109 | idx = time_results.shape[0] 110 | time_results.loc[idx] = [d1.shape[0], method_name, met_time, sim] 111 | 112 | 113 | # %% [markdown] tags=[] 114 | # # Run 115 | 116 | # %% tags=[] 117 | # initialize methods
118 | ccc(np.random.rand(100), np.random.rand(100)) 119 | 120 | # %% tags=[] 121 | for s in DATA_SIZES: 122 | print(f"Size: {s}") 123 | 124 | print(" p") 125 | run_method(lambda x, y: pearsonr(x, y)[0], "p-3", s) 126 | 127 | print(" s") 128 | run_method(lambda x, y: spearmanr(x, y)[0], "s-3", s) 129 | 130 | print(" cm") 131 | run_method(lambda x, y: ccc(x, y, n_jobs=N_JOBS), "cm-3", s) 132 | 133 | if s <= 50000: 134 | print(" mic_e") 135 | run_method(lambda x, y: mic(x, y, estimator="mic_e"), "mic_e-3", s) 136 | 137 | if s <= 10000: 138 | print(" mic") 139 | run_method(lambda x, y: mic(x, y), "mic-3", s) 140 | 141 | print("Saving to pickle") 142 | time_results.to_pickle(OUTPUT_DIR / OUTPUT_FILENAME) 143 | 144 | print("\n") 145 | 146 | # %% [markdown] tags=[] 147 | # # Summary of results 148 | 149 | # %% tags=[] 150 | time_results.shape 151 | 152 | # %% tags=[] 153 | time_results.head() 154 | 155 | # %% tags=[] 156 | -------------------------------------------------------------------------------- /nbs/20_comparison_others/py/62-time_test-6_cpu_cores.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # notebook_metadata_filter: -jupytext.text_representation.jupytext_version 6 | # text_representation: 7 | # extension: .py 8 | # format_name: percent 9 | # format_version: '1.3' 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 18 | 19 | # %% [markdown] tags=[] 20 | # It generates random variables of varying sizes to compare the time taken by CCC and MIC. 21 | # 22 | # This notebook uses 6 CPU cores.
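# %% [markdown] tags=[]
# The `%env` pins in the next section must be set before numba and numpy are imported. A sanity check that could be added after the imports (a sketch; it assumes the installed numba version provides `get_num_threads`):

# %% tags=[]
import os

import numba

# hypothetical check cell: confirm the thread pinning took effect
assert os.environ["NUMBA_NUM_THREADS"] == "6"
assert numba.get_num_threads() == 6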
23 | 24 | # %% [markdown] tags=[] 25 | # # Modules loading 26 | 27 | # %% tags=[] 28 | # %env CM_N_JOBS=6 29 | # %env NUMBA_NUM_THREADS=6 30 | # %env MKL_NUM_THREADS=6 31 | # %env OPEN_BLAS_NUM_THREADS=6 32 | # %env NUMEXPR_NUM_THREADS=6 33 | # %env OMP_NUM_THREADS=6 34 | 35 | # %% tags=[] 36 | import os 37 | from time import time 38 | 39 | import numpy as np 40 | import pandas as pd 41 | from scipy.stats import pearsonr, spearmanr 42 | 43 | from ccc import conf 44 | from ccc.coef import ccc 45 | from ccc.methods import mic 46 | 47 | # %% [markdown] tags=[] 48 | # # Settings 49 | 50 | # %% tags=[] 51 | N_JOBS = int(os.environ["CM_N_JOBS"]) 52 | display(N_JOBS) 53 | 54 | # %% tags=[] 55 | OUTPUT_FILENAME = "time_test.pkl" 56 | 57 | # %% tags=[] 58 | DATA_SIZES = [ 59 | 100, 60 | 500, 61 | 1000, 62 | 5000, 63 | 10000, 64 | 50000, 65 | 100000, 66 | 1000000, 67 | 10000000, 68 | ] 69 | 70 | N_REPS = 10 71 | 72 | # %% tags=[] 73 | np.random.seed(0) 74 | 75 | # %% [markdown] tags=[] 76 | # # Paths 77 | 78 | # %% tags=[] 79 | OUTPUT_DIR = conf.RESULTS_DIR / "time_test" 80 | OUTPUT_DIR.mkdir(parents=True, exist_ok=True) 81 | display(OUTPUT_DIR) 82 | 83 | # %% [markdown] tags=[] 84 | # # Functions 85 | 86 | # %% tags=[] 87 | # append to previous run 88 | time_results = pd.read_pickle(OUTPUT_DIR / OUTPUT_FILENAME) 89 | 90 | # %% tags=[] 91 | time_results.shape 92 | 93 | 94 | # %% tags=[] 95 | def run_method(func, method_name, size): 96 | n_reps = N_REPS 97 | if size < 500: 98 | n_reps = 1000 99 | 100 | for r in range(n_reps): 101 | d1 = np.random.rand(size) 102 | d2 = np.random.rand(size) 103 | 104 | start_time = time() 105 | sim = func(d1, d2) 106 | end_time = time() 107 | met_time = end_time - start_time 108 | 109 | idx = time_results.shape[0] 110 | time_results.loc[idx] = [d1.shape[0], method_name, met_time, sim] 111 | 112 | 113 | # %% [markdown] tags=[] 114 | # # Run 115 | 116 | # %% tags=[] 117 | # initialize methods 118 | ccc(np.random.rand(100), np.random.rand(100)) 119 | 120 | # %% tags=[] 121 | for s in DATA_SIZES: 122 | print(f"Size: {s}") 123 | 124 | print(" p") 125 | run_method(lambda x, y: pearsonr(x, y)[0], "p-6", s) 126 | 127 | print(" s") 128 | run_method(lambda x, y: spearmanr(x, y)[0], "s-6", s) 129 | 130 | print(" cm") 131 | run_method(lambda x, y: ccc(x, y, n_jobs=N_JOBS), "cm-6", s) 132 | 133 | if s <= 50000: 134 | print(" mic_e") 135 | run_method(lambda x, y: mic(x, y, estimator="mic_e"), "mic_e-6", s) 136 | 137 | if s <= 10000: 138 | print(" mic") 139 | run_method(lambda x, y: mic(x, y), "mic-6", s) 140 | 141 | print("Saving to pickle") 142 | time_results.to_pickle(OUTPUT_DIR / OUTPUT_FILENAME) 143 | 144 | print("\n") 145 | 146 | # %% [markdown] tags=[] 147 | # # Summary of results 148 | 149 | # %% tags=[] 150 | time_results.shape 151 | 152 | # %% tags=[] 153 | time_results.head() 154 | 155 | # %% tags=[] 156 | -------------------------------------------------------------------------------- /nbs/99_manuscript/giant/py/03_00-giant-get_gene_info.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # cell_metadata_filter: all,-execution,-papermill,-trusted 5 | # notebook_metadata_filter: -jupytext.text_representation.jupytext_version 6 | # text_representation: 7 | # extension: .py 8 | # format_name: percent 9 | # format_version: '1.3' 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # %% [markdown] tags=[] 17 | # # Description 
18 | 19 | # %% [markdown] tags=[] 20 | # It gets all the gene pairs prioritized by different correlation coefficients and writes a file with gene ID mappings (symbols and Entrez IDs). 21 | 22 | # %% [markdown] tags=[] 23 | # # Modules 24 | 25 | # %% tags=[] 26 | # %load_ext rpy2.ipython 27 | 28 | # %% tags=[] 29 | import pandas as pd 30 | 31 | from ccc import conf 32 | 33 | # %% [markdown] tags=[] 34 | # # Settings 35 | 36 | # %% tags=[] 37 | DATASET_CONFIG = conf.GTEX 38 | 39 | # %% [markdown] tags=[] 40 | # # Paths 41 | 42 | # %% tags=[] 43 | INPUT_DIR = DATASET_CONFIG["GENE_PAIR_INTERSECTIONS"] 44 | display(INPUT_DIR) 45 | 46 | assert INPUT_DIR.exists() 47 | 48 | # %% tags=[] 49 | OUTPUT_DIR = conf.GIANT["RESULTS_DIR"] / "intersection_genes" 50 | OUTPUT_DIR.mkdir(parents=True, exist_ok=True) 51 | display(OUTPUT_DIR) 52 | 53 | # %% [markdown] tags=[] 54 | # # Get gene entrez ids 55 | 56 | # %% tags=[] 57 | genes = set() 58 | 59 | # %% tags=[] 60 | data = pd.read_pickle(INPUT_DIR / "clustermatch_vs_pearson.pkl") 61 | _tmp0 = set(data.index.get_level_values(0)) 62 | _tmp1 = set(data.index.get_level_values(1)) 63 | genes.update(_tmp0.union(_tmp1)) 64 | display(len(genes)) 65 | 66 | # %% tags=[] 67 | data = pd.read_pickle(INPUT_DIR / "clustermatch_vs_pearson_spearman.pkl") 68 | _tmp0 = set(data.index.get_level_values(0)) 69 | _tmp1 = set(data.index.get_level_values(1)) 70 | genes.update(_tmp0.union(_tmp1)) 71 | display(len(genes)) 72 | 73 | # %% tags=[] 74 | data = pd.read_pickle(INPUT_DIR / "clustermatch_vs_spearman.pkl") 75 | _tmp0 = set(data.index.get_level_values(0)) 76 | _tmp1 = set(data.index.get_level_values(1)) 77 | genes.update(_tmp0.union(_tmp1)) 78 | display(len(genes)) 79 | 80 | # %% tags=[] 81 | data = pd.read_pickle(INPUT_DIR / "pearson_vs_clustermatch.pkl") 82 | _tmp0 = set(data.index.get_level_values(0)) 83 | _tmp1 = set(data.index.get_level_values(1)) 84 | genes.update(_tmp0.union(_tmp1)) 85 | display(len(genes)) 86 | 87 | # %% tags=[] 88 | data = pd.read_pickle(INPUT_DIR / "pearson_vs_clustermatch_spearman.pkl") 89 | _tmp0 = set(data.index.get_level_values(0)) 90 | _tmp1 = set(data.index.get_level_values(1)) 91 | genes.update(_tmp0.union(_tmp1)) 92 | display(len(genes)) 93 | 94 | # %% tags=[] 95 | genes = list(genes) 96 | assert not pd.Series(genes).isna().any() 97 | 98 | # %% tags=[] magic_args="-i genes -o symbol_to_entrezid" language="R" 99 | # library(org.Hs.eg.db) 100 | # hs <- org.Hs.eg.db 101 | # 102 | # symbol_to_entrezid <- select(hs, 103 | # keys = unlist(genes), 104 | # columns = c("ENTREZID", "SYMBOL"), 105 | # keytype = "SYMBOL") 106 | 107 | # %% tags=[] 108 | symbol_to_entrezid.shape 109 | 110 | # %% tags=[] 111 | assert symbol_to_entrezid.shape[0] == len(genes) 112 | 113 | # %% tags=[] 114 | symbol_to_entrezid.head() 115 | 116 | # %% tags=[] 117 | symbol_to_entrezid.isna().any().any() 118 | 119 | # %% tags=[] 120 | symbol_to_entrezid = symbol_to_entrezid.dropna() 121 | 122 | # %% tags=[] 123 | symbol_to_entrezid.shape 124 | 125 | # %% tags=[] 126 | assert symbol_to_entrezid[symbol_to_entrezid["SYMBOL"] == "IFNG"].shape[0] == 1 127 | assert symbol_to_entrezid[symbol_to_entrezid["SYMBOL"] == "RASSF2"].shape[0] == 1 128 | 129 | # %% [markdown] tags=[] 130 | # # Save 131 | 132 | # %% tags=[] 133 | symbol_to_entrezid.to_pickle(OUTPUT_DIR / "gene_map-symbol_to_entrezid.pkl") 134 | 135 | # %% tags=[] 136 | -------------------------------------------------------------------------------- /tests/test_conf.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | Tests the conf.py module. 3 | """ 4 | import os 5 | import sys 6 | import runpy 7 | from unittest import mock 8 | 9 | import pytest 10 | 11 | 12 | def test_conf_module_load(): 13 | from ccc import conf 14 | 15 | assert conf is not None 16 | assert conf.__file__ is not None 17 | 18 | 19 | @mock.patch.dict(os.environ, {}, clear=True) 20 | def test_conf_entries(): 21 | from ccc import conf 22 | import importlib 23 | 24 | importlib.reload(conf) 25 | 26 | assert conf.ROOT_DIR is not None 27 | assert conf.ROOT_DIR != "" 28 | 29 | assert conf.DATA_DIR is not None 30 | assert conf.DATA_DIR != "" 31 | 32 | assert conf.RESULTS_DIR is not None 33 | assert conf.RESULTS_DIR != "" 34 | 35 | assert conf.GENERAL is not None 36 | assert len(conf.GENERAL) > 0 37 | assert conf.GENERAL["N_JOBS"] is not None 38 | assert conf.GENERAL["N_JOBS"] > 0 39 | assert conf.GENERAL["N_JOBS_LOW"] is not None 40 | assert conf.GENERAL["N_JOBS_LOW"] > 0 41 | 42 | assert conf.MANUSCRIPT is not None 43 | assert "CONTENT_DIR" not in conf.MANUSCRIPT 44 | 45 | 46 | def test_conf_main(): 47 | t = runpy.run_module("ccc.conf", run_name="__main__") 48 | assert t is not None 49 | assert "print_vars" in t 50 | assert "CM_ROOT_DIR" in t["print_vars"] 51 | assert "CM_RESULTS_DIR" in t["print_vars"] 52 | assert "CM_GENERAL_N_JOBS" in t["print_vars"] 53 | 54 | 55 | @pytest.mark.skipif( 56 | sys.platform.startswith("win"), 57 | reason="exporting variables is only supported in non-Windows platforms", 58 | ) 59 | def test_conf_export_variables(): 60 | from pathlib import Path 61 | import subprocess 62 | from ccc import conf 63 | 64 | conf_filepath = Path(conf.__file__).resolve() 65 | assert conf_filepath is not None 66 | assert conf_filepath.exists() 67 | 68 | # check output 69 | r = subprocess.run(["python", conf_filepath], stdout=subprocess.PIPE) 70 | assert r is not None 71 | assert r.returncode == 0 72 | r_output = r.stdout.decode("utf-8") 73 | assert r_output is not None 74 | assert len(r_output) > 8, r_output 75 | assert r_output.count("export ") > 5 76 | 77 | # check variable 78 | r = subprocess.run( 79 | f"eval `python {conf_filepath}` && echo $CM_ROOT_DIR", 80 | shell=True, 81 | stdout=subprocess.PIPE, 82 | ) 83 | assert r is not None 84 | assert r.returncode == 0 85 | r_output = r.stdout.decode("utf-8").strip() 86 | assert r_output is not None 87 | assert len(r_output) > 8, r_output 88 | assert r_output.startswith("/") 89 | 90 | # check dict variable 91 | r = subprocess.run( 92 | f"eval `python {conf_filepath}` && echo $CM_GENERAL_N_JOBS", 93 | shell=True, 94 | stdout=subprocess.PIPE, 95 | ) 96 | assert r is not None 97 | assert r.returncode == 0 98 | r_output = r.stdout.decode("utf-8").strip() 99 | assert r_output is not None 100 | assert r_output.isdigit() 101 | assert int(r_output) > 0 102 | 103 | 104 | @mock.patch.dict(os.environ, {"CM_MANUSCRIPT_DIR": "/tmp/some/dir"}) 105 | def test_conf_with_manuscript_dir(): 106 | from ccc import conf 107 | import importlib 108 | 109 | importlib.reload(conf) 110 | 111 | assert conf.MANUSCRIPT is not None 112 | assert "CONTENT_DIR" in conf.MANUSCRIPT 113 | assert conf.MANUSCRIPT["CONTENT_DIR"] is not None 114 | assert conf.MANUSCRIPT["CONTENT_DIR"] != "" 115 | 116 | 117 | @mock.patch.dict(os.environ, {"CM_N_JOBS": ""}) 118 | def test_conf_cm_n_jobs_is_empty_string(): 119 | from ccc import conf 120 | import importlib 121 | 122 | importlib.reload(conf) 123 | 124 | assert conf.GENERAL is not None 125 | 
assert len(conf.GENERAL) > 0 126 | assert conf.GENERAL["N_JOBS"] is not None 127 | assert conf.GENERAL["N_JOBS"] > 0 128 | assert conf.GENERAL["N_JOBS_LOW"] is not None 129 | assert conf.GENERAL["N_JOBS_LOW"] > 0 130 | -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/07_cm_optimized/04-cm_ari_numba.txt: -------------------------------------------------------------------------------- 1 | 149 function calls (143 primitive calls) in 16.228 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 52 to 50 due to restriction <50> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 16.228 16.228 {built-in method builtins.exec} 8 | 1 0.000 0.000 16.228 16.228 :1() 9 | 1 0.000 0.000 16.228 16.228 130967321.py:1(func) 10 | 1 0.000 0.000 16.228 16.228 coef.py:276(cm) 11 | 1 16.227 16.227 16.227 16.227 coef.py:208(_cm) 12 | 9 0.000 0.000 0.001 0.000 typedlist.py:341(append) 13 | 1 0.000 0.000 0.000 0.000 typedlist.py:298(_initialise_list) 14 | 2 0.000 0.000 0.000 0.000 abstract.py:60(__call__) 15 | 2 0.000 0.000 0.000 0.000 typeof.py:25(typeof) 16 | 1 0.000 0.000 0.000 0.000 typedlist.py:270(_parse_arg) 17 | 2 0.000 0.000 0.000 0.000 functools.py:872(wrapper) 18 | 1 0.000 0.000 0.000 0.000 dispatcher.py:677(typeof_pyval) 19 | 2 0.000 0.000 0.000 0.000 abstract.py:48(_intern) 20 | 1 0.000 0.000 0.000 0.000 typeof.py:257(_typeof_nb_type) 21 | 9 0.000 0.000 0.000 0.000 typedlist.py:81(_append) 22 | 2 0.000 0.000 0.000 0.000 {method 'get' of 'dict' objects} 23 | 1 0.000 0.000 0.000 0.000 containers.py:618(__init__) 24 | 6 0.000 0.000 0.000 0.000 {built-in method builtins.isinstance} 25 | 2 0.000 0.000 0.000 0.000 functools.py:816(dispatch) 26 | 4/2 0.000 0.000 0.000 0.000 abstract.py:117(__hash__) 27 | 7/5 0.000 0.000 0.000 0.000 abstract.py:120(__eq__) 28 | 1 0.000 0.000 0.000 0.000 typeof.py:121(_typeof_int) 29 | 1 0.000 0.000 0.000 0.000 typedlist.py:228(__init__) 30 | 1 0.000 0.000 0.000 0.000 typedlist.py:202(__new__) 31 | 4 0.000 0.000 0.000 0.000 abc.py:117(__instancecheck__) 32 | 1 0.000 0.000 0.000 0.000 utils.py:294(bit_length) 33 | 10 0.000 0.000 0.000 0.000 serialize.py:140(_numba_unpickle) 34 | 20 0.000 0.000 0.000 0.000 typedlist.py:280(_numba_type_) 35 | 4/2 0.000 0.000 0.000 0.000 {built-in method builtins.hash} 36 | 1 0.000 0.000 0.000 0.000 functions.py:660(__init__) 37 | 1 0.000 0.000 0.000 0.000 {method 'format' of 'str' objects} 38 | 1 0.000 0.000 0.000 0.000 misc.py:47(unliteral) 39 | 4 0.000 0.000 0.000 0.000 {built-in method _abc._abc_instancecheck} 40 | 1 0.000 0.000 0.000 0.000 typedlist.py:50(_make_list) 41 | 2 0.000 0.000 0.000 0.000 weakref.py:415(__getitem__) 42 | 2 0.000 0.000 0.000 0.000 :1() 43 | 9 0.000 0.000 0.000 0.000 typedlist.py:286(_typed) 44 | 3 0.000 0.000 0.000 0.000 {built-in method __new__ of type object at 0x55a47df0e300} 45 | 1 0.000 0.000 0.000 0.000 {built-in method builtins.any} 46 | 6 0.000 0.000 0.000 0.000 abstract.py:95(key) 47 | 2 0.000 0.000 0.000 0.000 abstract.py:92(__init__) 48 | 1 0.000 0.000 0.000 0.000 {built-in method builtins.getattr} 49 | 3 0.000 0.000 0.000 0.000 containers.py:630(key) 50 | 2 0.000 0.000 0.000 0.000 abstract.py:114(__repr__) 51 | 3 0.000 0.000 0.000 0.000 functions.py:672(key) 52 | 2 0.000 0.000 0.000 0.000 {built-in method _abc.get_cache_token} 53 | 1 0.000 0.000 0.000 0.000 typedlist.py:244() 54 | 1 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr} 55 | 1 0.000 0.000 0.000 0.000 {built-in method 
builtins.bin} 56 | 1 0.000 0.000 0.000 0.000 {built-in method builtins.len} -------------------------------------------------------------------------------- /nbs/others/05_clustermatch_profiling/06_cm_optimized/06-cm_many_genes.txt: -------------------------------------------------------------------------------- 1 | 149 function calls (143 primitive calls) in 934.834 seconds 2 | 3 | Ordered by: cumulative time 4 | List reduced from 52 to 50 due to restriction <50> 5 | 6 | ncalls tottime percall cumtime percall filename:lineno(function) 7 | 1 0.000 0.000 934.834 934.834 {built-in method builtins.exec} 8 | 1 0.000 0.000 934.834 934.834 :1() 9 | 1 0.000 0.000 934.834 934.834 1750096170.py:1(func) 10 | 1 0.000 0.000 934.834 934.834 coef.py:272(cm) 11 | 1 934.834 934.834 934.834 934.834 coef.py:197(_cm) 12 | 9 0.000 0.000 0.000 0.000 typedlist.py:341(append) 13 | 1 0.000 0.000 0.000 0.000 typedlist.py:298(_initialise_list) 14 | 2 0.000 0.000 0.000 0.000 abstract.py:60(__call__) 15 | 1 0.000 0.000 0.000 0.000 typedlist.py:270(_parse_arg) 16 | 2 0.000 0.000 0.000 0.000 typeof.py:25(typeof) 17 | 2 0.000 0.000 0.000 0.000 functools.py:872(wrapper) 18 | 1 0.000 0.000 0.000 0.000 dispatcher.py:677(typeof_pyval) 19 | 2 0.000 0.000 0.000 0.000 abstract.py:48(_intern) 20 | 2 0.000 0.000 0.000 0.000 {method 'get' of 'dict' objects} 21 | 1 0.000 0.000 0.000 0.000 typeof.py:257(_typeof_nb_type) 22 | 9 0.000 0.000 0.000 0.000 typedlist.py:81(_append) 23 | 1 0.000 0.000 0.000 0.000 containers.py:618(__init__) 24 | 4/2 0.000 0.000 0.000 0.000 abstract.py:117(__hash__) 25 | 4/2 0.000 0.000 0.000 0.000 {built-in method builtins.hash} 26 | 6 0.000 0.000 0.000 0.000 {built-in method builtins.isinstance} 27 | 2 0.000 0.000 0.000 0.000 functools.py:816(dispatch) 28 | 1 0.000 0.000 0.000 0.000 typeof.py:121(_typeof_int) 29 | 7/5 0.000 0.000 0.000 0.000 abstract.py:120(__eq__) 30 | 4 0.000 0.000 0.000 0.000 abc.py:117(__instancecheck__) 31 | 1 0.000 0.000 0.000 0.000 typedlist.py:228(__init__) 32 | 1 0.000 0.000 0.000 0.000 typedlist.py:202(__new__) 33 | 1 0.000 0.000 0.000 0.000 utils.py:294(bit_length) 34 | 10 0.000 0.000 0.000 0.000 serialize.py:140(_numba_unpickle) 35 | 4 0.000 0.000 0.000 0.000 {built-in method _abc._abc_instancecheck} 36 | 1 0.000 0.000 0.000 0.000 typedlist.py:50(_make_list) 37 | 1 0.000 0.000 0.000 0.000 functions.py:660(__init__) 38 | 1 0.000 0.000 0.000 0.000 {method 'format' of 'str' objects} 39 | 20 0.000 0.000 0.000 0.000 typedlist.py:280(_numba_type_) 40 | 1 0.000 0.000 0.000 0.000 misc.py:47(unliteral) 41 | 2 0.000 0.000 0.000 0.000 weakref.py:415(__getitem__) 42 | 2 0.000 0.000 0.000 0.000 :1() 43 | 9 0.000 0.000 0.000 0.000 typedlist.py:286(_typed) 44 | 3 0.000 0.000 0.000 0.000 {built-in method __new__ of type object at 0x55c573e05300} 45 | 1 0.000 0.000 0.000 0.000 {built-in method builtins.any} 46 | 2 0.000 0.000 0.000 0.000 abstract.py:92(__init__) 47 | 6 0.000 0.000 0.000 0.000 abstract.py:95(key) 48 | 3 0.000 0.000 0.000 0.000 containers.py:630(key) 49 | 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects} 50 | 2 0.000 0.000 0.000 0.000 abstract.py:114(__repr__) 51 | 1 0.000 0.000 0.000 0.000 {built-in method builtins.getattr} 52 | 2 0.000 0.000 0.000 0.000 {built-in method _abc.get_cache_token} 53 | 3 0.000 0.000 0.000 0.000 functions.py:672(key) 54 | 1 0.000 0.000 0.000 0.000 typedlist.py:244() 55 | 1 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr} 56 | 1 0.000 0.000 0.000 0.000 {built-in method builtins.bin} 
--------------------------------------------------------------------------------