├── PyCoGAPS ├── __init__.py ├── requirements_analysis.txt ├── config.py ├── msigdb_gsea.py ├── subset_data.py ├── distributed_functions.py ├── run_pycogaps.py ├── parameters.py ├── pycogaps_main.py ├── helper_functions.py └── analysis_functions.py ├── run_vignette.sh ├── figures ├── umap_UMAP.png ├── pdac1kpatternumap.png └── highest_expr_genes_highestExpressedGenes.png ├── readme-figs ├── binaryA.png ├── plot_pm.png ├── res_show.png ├── anndata-result.jpeg ├── anndata-result.png ├── binaryA_cluster.png └── plot_residuals.png ├── .gitmodules ├── data ├── ModSimResult.h5ad ├── inputdata.h5ad ├── cogapsresult.h5ad ├── ModSimBases.txt └── ModSimData.txt ├── .gitattributes ├── requirements.txt ├── Dockerfile ├── .dockerignore ├── genepattern └── Dockerfile ├── tests ├── test_retina.py ├── test_pytime.py ├── test_params.py ├── test_checkpoints.py ├── test_r_time.R ├── test_subsets.py ├── test_distributed.py ├── read_pydata.py ├── test_disttime.py ├── testing.py ├── test_seed_consistency.py └── test_top_level.py ├── pdacvignette.py ├── modsimvignette.py ├── visiumvignette.py ├── .github └── workflows │ └── build-test.yml ├── analyzevisium.py ├── .gitignore ├── analyzepdac.py ├── vignette.py ├── vignette_from_args.py ├── params.yaml ├── setup.py ├── src └── bindings.cpp ├── LICENSE └── README.md /PyCoGAPS/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /run_vignette.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/sh 2 | set -e 3 | 4 | python3 ./modsimvignette.py --user 5 | -------------------------------------------------------------------------------- /figures/umap_UMAP.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FertigLab/pycogaps/HEAD/figures/umap_UMAP.png -------------------------------------------------------------------------------- /readme-figs/binaryA.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FertigLab/pycogaps/HEAD/readme-figs/binaryA.png -------------------------------------------------------------------------------- /readme-figs/plot_pm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FertigLab/pycogaps/HEAD/readme-figs/plot_pm.png -------------------------------------------------------------------------------- /readme-figs/res_show.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FertigLab/pycogaps/HEAD/readme-figs/res_show.png -------------------------------------------------------------------------------- /figures/pdac1kpatternumap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FertigLab/pycogaps/HEAD/figures/pdac1kpatternumap.png -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "src/CoGAPS"] 2 | path = src/CoGAPS 3 | url = https://github.com/FertigLab/CoGAPS.git 4 | -------------------------------------------------------------------------------- /readme-figs/anndata-result.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FertigLab/pycogaps/HEAD/readme-figs/anndata-result.jpeg 
-------------------------------------------------------------------------------- /readme-figs/anndata-result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FertigLab/pycogaps/HEAD/readme-figs/anndata-result.png -------------------------------------------------------------------------------- /readme-figs/binaryA_cluster.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FertigLab/pycogaps/HEAD/readme-figs/binaryA_cluster.png -------------------------------------------------------------------------------- /readme-figs/plot_residuals.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FertigLab/pycogaps/HEAD/readme-figs/plot_residuals.png -------------------------------------------------------------------------------- /PyCoGAPS/requirements_analysis.txt: -------------------------------------------------------------------------------- 1 | anndata 2 | matplotlib 3 | pandas 4 | scanpy 5 | scikit_learn 6 | scipy 7 | seaborn 8 | numpy 9 | -------------------------------------------------------------------------------- /figures/highest_expr_genes_highestExpressedGenes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FertigLab/pycogaps/HEAD/figures/highest_expr_genes_highestExpressedGenes.png -------------------------------------------------------------------------------- /data/ModSimResult.h5ad: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:227a89dccf5ac700058d1326d8f8ace32f01b7b8959e3323de432ed006d47015 3 | size 32808 4 | -------------------------------------------------------------------------------- /data/inputdata.h5ad: -------------------------------------------------------------------------------- 
1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:3954937ecb9d204be42cd42e0f434f84e16a75f66dae83a0a03ea33152cb2d34 3 | size 166623361 4 | -------------------------------------------------------------------------------- /data/cogapsresult.h5ad: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:c49eccfb366a4b2f82fe446a56681abbead3fa759695ce06e3bd8bfa3bcefc32 3 | size 1549402794 4 | -------------------------------------------------------------------------------- /PyCoGAPS/config.py: -------------------------------------------------------------------------------- 1 | import math 2 | import pandas as pd 3 | import numpy as np 4 | import anndata 5 | import sys 6 | import time 7 | import warnings 8 | import os 9 | import pycogaps 10 | from pycogaps import * 11 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | dat 2 | filter=lfs diff=lfs merge=lfs -text 3 | *.h5ad filter=lfs diff=lfs merge=lfs -text 4 | data/inputdata.h5ad filter=lfs diff=lfs merge=lfs -text 5 | data/cogapsresult.h5ad filter=lfs diff=lfs merge=lfs -text 6 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | anndata>=0.9 2 | boto3==1.33.5 3 | gseapy==1.0.6 4 | h5py==3.9.0 5 | matplotlib==3.7.3 6 | numpy==1.24.4 7 | pandas==2.0.3 8 | pybind11==2.6.2 9 | PyYAML==6.0.1 10 | scanpy==1.9.6 11 | scikit_learn==1.3.1 12 | scipy>=1.10 13 | seaborn>=0.12 14 | setuptools==78.1.1 15 | statsmodels==0.14.0 16 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.8 2 | 3 | COPY . 
/pycogaps 4 | WORKDIR pycogaps 5 | 6 | RUN apt-get update && apt-get install -y libboost-all-dev 7 | RUN pip install --upgrade pip 8 | RUN pip install -r requirements.txt 9 | 10 | RUN python ./setup.py install 11 | 12 | ENTRYPOINT ["python", "./vignette.py"] 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *.pyc 3 | *.pyo 4 | *.pyd 5 | .Python 6 | .env 7 | env 8 | pip-log.txt 9 | pip-delete-this-directory.txt 10 | .tox 11 | .coverage 12 | .coverage.* 13 | .cache 14 | nosetests.xml 15 | coverage.xml 16 | *.cover 17 | *.log 18 | .git 19 | .mypy_cache 20 | .pytest_cache 21 | .hypothesis 22 | 23 | #local build results 24 | build -------------------------------------------------------------------------------- /genepattern/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.8 2 | 3 | RUN useradd -ms /bin/bash user 4 | USER user 5 | WORKDIR /home/user 6 | 7 | RUN git clone --recurse-submodules https://github.com/FertigLab/pycogaps.git 8 | 9 | WORKDIR ./pycogaps/ 10 | RUN git pull 11 | 12 | RUN pip install -r requirements.txt --user 13 | 14 | RUN python3 ./setup.py install --user 15 | 16 | ENTRYPOINT [] -------------------------------------------------------------------------------- /tests/test_retina.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append(".") # Adds higher directory to python modules path. 
3 | 4 | from PyCoGAPS import * 5 | 6 | path = './src/CoGAPS/inst/extdata/retina_subset_1.h5' 7 | 8 | # if input is an hdf file, then need to create params object, and set key name when initializing 9 | params = CoParams(path, hdfKey='counts') 10 | 11 | result = CoGAPS(path, params=params) 12 | -------------------------------------------------------------------------------- /data/ModSimBases.txt: -------------------------------------------------------------------------------- 1 | 0.055545 0.67669 3.0334 5.0145 3.182 1.4675 2.2051 3.0017 2.1497 0.79272 0.16768 0.11988 0.3686 0.77882 1 0.7788 0.36788 0.1054 0.018316 0.0019305 2 | 0.011109 0.13535 0.60701 1.0097 0.7061 0.66253 1.4442 2.0003 1.4334 0.53299 0.15452 0.32585 1.1041 2.3364 3 2.3364 1.1036 0.3162 0.054947 0.0057914 3 | 0.022218 0.2707 1.214 2.0193 1.4122 1.3251 2.8883 4.0007 2.8661 1.0544 0.19915 0.019312 0.00096148 2.4577e-05 3.2254e-07 2.1733e-09 7.5181e-12 1.3353e-14 1.2176e-17 5.7007e-21 4 | -------------------------------------------------------------------------------- /PyCoGAPS/msigdb_gsea.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import scanpy as sc 3 | import pandas as pd 4 | from PyCoGAPS.analysis_functions import * 5 | 6 | with open('../data/spatialhypoxia8patterns.pkl', 'rb') as fp: 7 | hypoxiaresult8pattern = pickle.load(fp) 8 | 9 | hypoxia_hallmarks = pd.read_table("../hypoxiagenes", header=None) 10 | 11 | genes = list(hypoxia_hallmarks[0]) 12 | genes = list(set(hypoxiaresult8pattern.obs_names).intersection(genes)) 13 | 14 | 15 | 16 | 17 | 18 | 19 | cogaps_stat = calcCoGAPSStat(hypoxiaresult8pattern, genes) 20 | 21 | -------------------------------------------------------------------------------- /tests/test_pytime.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append(".") # Adds higher directory to python modules path. 
3 | 4 | from PyCoGAPS import * 5 | import time 6 | 7 | # replace with the path to your data, or use this provided example 8 | path = "data/GSE98638_HCC.TCell.S5063.count.txt" 9 | 10 | params = CoParams(path) 11 | 12 | setParams(params, {"seed": 0, 13 | "nIterations": 1000, 14 | "nPatterns": 5, 15 | "useSparseOptimization": True 16 | }) 17 | 18 | 19 | start = time.time() 20 | result = CoGAPS(path, params) 21 | end = time.time() 22 | print(end - start) 23 | # plotPatternMarkers(result) 24 | -------------------------------------------------------------------------------- /tests/test_params.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append(".") # Adds higher directory to python modules path. 3 | 4 | from PyCoGAPS import * 5 | 6 | 7 | # replace with the path to your data, or use this provided example 8 | path = "./data/GIST.csv" 9 | 10 | # run CoGAPS on your dataset 11 | result = CoGAPS(path) 12 | 13 | # create a CoParams object and set desired parameters 14 | params = CoParams(path) 15 | setParam(params, "maxThreads", 4) 16 | # and/or: 17 | setParams(params, { 18 | 'printMessages': True, 19 | 'maxThreads': 4 20 | }) 21 | 22 | # set distributed params, annotation weights, fixed patterns by calling specific methods 23 | params.setDistributedParams(nSets=5) 24 | 25 | result = CoGAPS(path, params) 26 | 27 | 28 | -------------------------------------------------------------------------------- /tests/test_checkpoints.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append(".") # Adds higher directory to python modules path. 
3 | 4 | from PyCoGAPS import * 5 | 6 | mtx_path = "./data/GIST.mtx" 7 | 8 | if isCheckpointsEnabled(): 9 | params = CoParams(mtx_path) 10 | setParams(params, {"nIterations": 100, 11 | "seed": 42}) 12 | res1 = CoGAPS(mtx_path, params, checkpointInterval=100, checkpointOutFile="test.out", messages=False) 13 | res2 = CoGAPS(mtx_path, params, checkpointInFile="test.out", messages=False) 14 | 15 | res1 = res1['GapsResult'] 16 | res2 = res2['GapsResult'] 17 | 18 | assert(np.allclose(toNumpy(res1.Amean), toNumpy(res2.Amean), rtol=0.1)) 19 | assert(np.allclose(toNumpy(res1.Pmean), toNumpy(res2.Pmean), rtol=0.1)) -------------------------------------------------------------------------------- /tests/test_r_time.R: -------------------------------------------------------------------------------- 1 | # library(CoGAPS) 2 | 3 | R.utils::sourceDirectory("/Users/ashleytsang/pycogaps/src/CoGAPS/src") 4 | R.utils::sourceDirectory("/Users/ashleytsang/pycogaps/src/CoGAPS/R") 5 | 6 | 7 | path <- "/Users/ashleytsang/pycogaps/data/GSE98638_HCC.TCell.S5063.count.txt" 8 | counts <- read.table(path, header = TRUE, stringsAsFactors = FALSE) 9 | counts <- counts[-c(1, 2)] 10 | 11 | 12 | params <- new("CogapsParams") 13 | params <- setParam(params, "nPatterns", 3) 14 | params <- setParam(params, "sparseOptimization", TRUE) 15 | params <- setParam(params, "nIterations", 10000) 16 | params <- setParam(params, "seed", 0) 17 | 18 | 19 | 20 | start_time <- Sys.time() 21 | CoGAPS(counts, params) 22 | end_time <- Sys.time() 23 | 24 | diff <- end_time - start_time 25 | diff -------------------------------------------------------------------------------- /pdacvignette.py: -------------------------------------------------------------------------------- 1 | if __name__ == "__main__": 2 | from PyCoGAPS.parameters import * 3 | from PyCoGAPS.pycogaps_main import CoGAPS 4 | import scanpy as sc 5 | 6 | path = "data/inputdata.h5ad" 7 | adata = sc.read_h5ad(path) 8 | adata.X = adata.X.todense() 9 | 
sc.pp.log1p(adata) 10 | adata = adata.T 11 | adata 12 | 13 | params = CoParams(adata=adata) 14 | 15 | setParams(params, { 16 | 'nIterations':1000, 17 | 'seed': 42, 18 | 'nPatterns': 8, 19 | 'useSparseOptimization': True, 20 | 'distributed': "genome-wide" 21 | # 'transposeData': True 22 | }) 23 | 24 | params.setDistributedParams(nSets=7) 25 | params.printParams() 26 | start = time.time() 27 | result = CoGAPS(adata, params) 28 | end = time.time() 29 | print("TIME:", end - start) 30 | 31 | result.write("data/pdacresultTEST.h5ad") -------------------------------------------------------------------------------- /modsimvignette.py: -------------------------------------------------------------------------------- 1 | from PyCoGAPS.parameters import * 2 | from PyCoGAPS.pycogaps_main import CoGAPS 3 | import scanpy as sc 4 | 5 | modsimpath = "data/ModSimData.txt" 6 | modsimbasespath = "data/ModSimBases.txt" 7 | 8 | modsim = sc.read_text(modsimpath) 9 | modsimbases = sc.read_text(modsimbasespath) 10 | 11 | params = CoParams(path=modsimpath) 12 | params.printParams() 13 | 14 | setParams(params, { 15 | 'nIterations': 50000, 16 | 'seed': 42, 17 | 'nPatterns': 3 18 | }) 19 | 20 | # many people find it helpful to time cogaps runs 21 | start = time.time() 22 | # command that calls CoGAPS 23 | # TIMING: on ModSim data, this only should take about 3 sec 24 | modsimresult = CoGAPS(modsim, params) 25 | end = time.time() 26 | print("TIME:", end - start) 27 | 28 | # always write cogaps result to disc before doing anything else! 29 | modsimresult.write("data/ModSimPyCoGAPSResult.h5ad") 30 | 31 | 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /tests/test_subsets.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append(".") # Adds higher directory to python modules path. 
3 | 4 | from PyCoGAPS import * 5 | import subset_data 6 | 7 | path = "./data/GIST.csv" 8 | params = CoParams(path) 9 | 10 | adata = toAnndata(path) 11 | 12 | print('\n### Test Explicit Subsets ###') 13 | setParam(params, "explicitSets", ['IM00', 'IM02']) 14 | params.setDistributedParams(nSets=2) 15 | sets = subset_data.createSets(adata, params) 16 | print(sets) 17 | 18 | print('\n### Test Sample Uniformly Subsets ###') 19 | setParam(params, "explicitSets", None) 20 | sets = subset_data.createSets(adata, params) 21 | print(sets) 22 | 23 | print('\n### Test Annotation Weights Subsets ###') 24 | names = ['IM00', 'IM02'] 25 | wt = [2, 0.5] 26 | weight = dict(zip(names, wt)) 27 | annotation = ['IM00', 'IM02', 'IM00'] 28 | params.setAnnotationWeights(annotation, weight) 29 | sets = subset_data.createSets(adata, params) 30 | print(sets) -------------------------------------------------------------------------------- /visiumvignette.py: -------------------------------------------------------------------------------- 1 | if __name__ == "__main__": 2 | import scanpy as sc 3 | 4 | from PyCoGAPS.parameters import * 5 | from PyCoGAPS.pycogaps_main import CoGAPS 6 | 7 | path = "data/VI_116_4" 8 | adata = sc.read_visium(path) 9 | adata.X = adata.X.todense() 10 | sc.pp.log1p(adata) 11 | adata = adata.T 12 | adata 13 | 14 | params = CoParams(adata=adata) 15 | 16 | setParams( 17 | params, 18 | { 19 | "nIterations": 100, 20 | "seed": 42, 21 | "nPatterns": 8, 22 | "useSparseOptimization": True, 23 | "distributed": "genome-wide" 24 | # 'transposeData': True 25 | }, 26 | ) 27 | 28 | params.setDistributedParams(nSets=4) 29 | params.printParams() 30 | start = time.time() 31 | result = CoGAPS(adata, params) 32 | end = time.time() 33 | print("TIME:", end - start) 34 | 35 | result.write("data/visiumresult50k.h5ad") 36 | # %% 37 | -------------------------------------------------------------------------------- /.github/workflows/build-test.yml: 
-------------------------------------------------------------------------------- 1 | # Based on the standard github action template for python 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | name: build-test-py 5 | 6 | on: 7 | workflow_dispatch: 8 | pull_request: 9 | branches: 10 | - master 11 | 12 | permissions: 13 | contents: read 14 | 15 | jobs: 16 | build: 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | runs-on: [ubuntu-20.04] 21 | python: ['3.8'] 22 | name: "${{ matrix.runs-on }} - ${{ matrix.python }}" 23 | runs-on: ${{ matrix.runs-on }} 24 | 25 | steps: 26 | - name: Checkout 27 | uses: actions/checkout@v3 28 | with: 29 | submodules: 'true' 30 | 31 | - name: Set up Python 32 | uses: actions/setup-python@v3 33 | with: 34 | python-version: ${{ matrix.python }} 35 | 36 | - name: Boost Linux 37 | if: runner.os == 'Linux' 38 | run: sudo apt-get install libboost-all-dev 39 | 40 | - name: Install deps 41 | run: pip install -r requirements.txt 42 | 43 | - name: Install the app 44 | run: python setup.py install 45 | 46 | - name: Test 47 | run: python -m unittest 48 | -------------------------------------------------------------------------------- /analyzevisium.py: -------------------------------------------------------------------------------- 1 | # %% 2 | import anndata 3 | import pandas as pd 4 | import scanpy as sc 5 | 6 | from PyCoGAPS.analysis_functions import * 7 | from PyCoGAPS.parameters import * 8 | from PyCoGAPS.pycogaps_main import CoGAPS 9 | 10 | # %% 11 | 12 | # %% 13 | cogapsresult = anndata.read_h5ad("data/visiumresult50k.h5ad") 14 | adata = cogapsresult 15 | 16 | # %% 17 | import numpy as np 18 | 19 | path = "data/VI_116_4" 20 | adata = sc.read_visium(path) 21 | 22 | adata.obs["Pattern1"] = cogapsresult.var["Pattern1"] 23 | adata.obs["Pattern2"] = cogapsresult.var["Pattern2"] 24 | adata.obs["Pattern3"] = cogapsresult.var["Pattern3"] 25 | adata.obs["Pattern4"] = 
cogapsresult.var["Pattern4"] 26 | adata.obs["Pattern5"] = cogapsresult.var["Pattern5"] 27 | 28 | sc.pp.normalize_total(adata, inplace=True) 29 | sc.pp.log1p(adata) 30 | sc.pp.highly_variable_genes(adata, flavor="seurat") 31 | 32 | sc.pp.pca(adata) 33 | sc.pl.pca(adata) 34 | sc.pl.pca_variance_ratio(adata, log=True) 35 | 36 | sc.pp.neighbors(adata) 37 | sc.tl.umap(adata) 38 | sc.tl.leiden(adata, key_added="clusters", resolution=0.5) 39 | 40 | import matplotlib.pyplot as plt 41 | 42 | plt.rcParams["figure.figsize"] = (8, 8) 43 | sc.pl.spatial( 44 | adata, 45 | color=["Pattern1", "Pattern2", "Pattern3", "Pattern4", "Pattern5"], 46 | ) 47 | -------------------------------------------------------------------------------- /tests/test_distributed.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append(".") # Adds higher directory to python modules path. 3 | 4 | from PyCoGAPS import * 5 | 6 | path = "./data/GIST.csv" 7 | params = CoParams(path) 8 | adata = toAnndata(path) 9 | 10 | singlethreadres = CoGAPS(path, params) 11 | 12 | if __name__ == '__main__': 13 | params.setDistributedParams(nSets=2, minNS=1, cut=3) 14 | params.coparams['subsetIndices'] = subset_data.createSets(adata, params) 15 | result = distributedCoGAPS(path, params, None) 16 | print(result) 17 | print("length: ", len(result)) 18 | # print("Parallel chisqhistory:", result[0]["GapsResult"].chisqHistory, "\n") 19 | # print("Single-thread chisqhistory:", singlethreadres["GapsResult"].chisqHistory, "\n") 20 | # assert(singlethreadres["GapsResult"].chisqHistory == result[0]["GapsResult"].chisqHistory) 21 | # assert (singlethreadres["anndata"].shape == result[0]["anndata"].shape) 22 | 23 | 24 | 25 | # test pickling / unpickling python objects so they can be handled by spawned processes 26 | # leaving this commented out because it's a pain to test, but it's here if we need it 27 | # import pickle 28 | # # params.gaps.print() 29 | # print("Now 
attempting to pickle GapsParameters object.....") 30 | # pickle.dump(params.gaps, open("./data/save.p", "wb"))d 31 | # print("Done. Now attempting to unpickle...") 32 | # d 33 | # unpickled.print() 34 | -------------------------------------------------------------------------------- /tests/read_pydata.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append(".") # Adds higher directory to python modules path. 3 | 4 | import anndata 5 | import pycogaps 6 | import scipy.io 7 | import scipy.sparse 8 | import numpy as np 9 | from PyCoGAPS import * 10 | 11 | # placeholder until we have anndata samples 12 | # maybe also read files into an anndata object? 13 | path = './data/GIST.csv' 14 | prm = pycogaps.GapsParameters(path) 15 | 16 | adata = anndata.read_csv(path) 17 | adataX = adata.X 18 | 19 | if scipy.sparse.issparse(adataX): 20 | adataX = adataX.toarray() 21 | 22 | # create Matrix object from anndata X 23 | matrix = pycogaps.Matrix(adataX) 24 | 25 | result = pycogaps.runCogapsFromMatrix(matrix, prm) 26 | 27 | # convert Amean and Pmean results to numpy arrays 28 | Amean = toNumpy(result.Amean) 29 | Pmean = toNumpy(result.Pmean) 30 | 31 | # anndata labels 32 | print('obs names: ', adata.obs_names) 33 | print('var names: ', adata.var_names) 34 | pattern_labels = ["Pattern" + str(i) for i in range(1, prm.nPatterns+1)] 35 | 36 | # load adata obs and var from Amean and Pmean results 37 | A_mat = pd.DataFrame(data=Amean, index=adata.obs_names, columns=pattern_labels) 38 | adata.obs = A_mat 39 | 40 | P_mat = pd.DataFrame(data=Pmean, index=adata.var_names, columns=pattern_labels) 41 | adata.var = P_mat 42 | 43 | print("~~~~~~~~~~~~~ testing CoGAPS Pattern Markers ~~~~~~~~~~~~~~") 44 | print(patternMarkers(adata)) -------------------------------------------------------------------------------- /tests/test_disttime.py: -------------------------------------------------------------------------------- 1 | import sys 2 
| sys.path.append(".") # Adds higher directory to python modules path. 3 | 4 | from PyCoGAPS import * 5 | import time 6 | import numpy as np 7 | import h5py 8 | # # 9 | path = "data/GIST.csv" 10 | # np.savetxt("src/CoGAPS/inst/extdata/retina_subset_1.csv", h5py.File(path)['counts'], '%g', ',') 11 | # replace with the path to your data, or use this provided example 12 | # path = "./data/GIST.csv" 13 | # df = pd.read_hdf(path) 14 | # df.to_csv("src/CoGAPS/inst/extdata/retina_subset_1.csv", index=False) 15 | # csvpath = "src/CoGAPS/inst/extdata/retina_subset_1.csv" 16 | params = CoParams(path) 17 | 18 | setParams(params, {"seed": 0, 19 | "nIterations": 10000, 20 | "nPatterns": 10, 21 | "useSparseOptimization": True, 22 | "hdfKey": "counts", 23 | "hdfColKey": "geneNames", 24 | "hdfRowKey": "cellNames" 25 | }) 26 | 27 | 28 | start = time.time() 29 | if __name__ == '__main__': 30 | params.setDistributedParams() 31 | result = distributedCoGAPS(path, params, None) 32 | end = time.time() 33 | print("TIME:", end - start) 34 | # plotResiduals(result) 35 | # binaryA(result, threshold=3) 36 | # plotPatternMarkers(result, colDendrogram=True) 37 | # plotPatternMarkers(result, rowDendrogram=True) 38 | # print(result) 39 | # CoGAPS(path, params, transposeData=True) 40 | # Subset data 41 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | *.o 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, 
so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *.cover 48 | .hypothesis/ 49 | .pytest_cache/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | db.sqlite3 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # Jupyter Notebook 74 | .ipynb_checkpoints 75 | 76 | # pyenv 77 | .python-version 78 | 79 | # celery beat schedule file 80 | celerybeat-schedule 81 | 82 | # SageMath parsed files 83 | *.sage.py 84 | 85 | # Environments 86 | .env 87 | .venv 88 | env/ 89 | venv/ 90 | ENV/ 91 | env.bak/ 92 | venv.bak/ 93 | 94 | # Spyder project settings 95 | .spyderproject 96 | .spyproject 97 | 98 | # Rope project settings 99 | .ropeproject 100 | 101 | # mkdocs documentation 102 | /site 103 | 104 | # mypy 105 | .mypy_cache/ 106 | -------------------------------------------------------------------------------- /analyzepdac.py: -------------------------------------------------------------------------------- 1 | import anndata 2 | import pandas as pd 3 | import scanpy as sc 4 | # import pandas as pd 5 | 6 | cogapsresult = anndata.read_h5ad("data/cogapsresult.h5ad") 7 | # pdac = anndata.read_h5ad("data/PDAC.h5ad") 8 | # pdac_peng_epi = anndata.read_h5ad("data/PDAC_Peng_Epi.h5ad") 9 | 10 | # coldata = pd.read_csv("data/PDACcoldata.csv") 11 | # Index(['Unnamed: 0', 'barcode_raw', 'celltype', 'sample_ID', 12 | # 'sample_ID_celltype', 'TN', 'TN_manuscript', 'manuscript', 'nCount_RNA', 13 | # 'nFeature_RNA', 'percent.mt', 'Size_Factor', 'TN_cluster_resolution_5', 14 | # 'TN_assigned_cell_type', 'TN_assigned_cell_type_immune', 15 | # 
'TN_assigned_cell_type_immune_specific', 16 | # 'TN_assigned_cell_type_immune_broad', 'cc', 'ccstage', 17 | # 'Classifier_T_duct', 'Classifier_T_Fibroblast_only', 18 | # 'Classifier_T_Fibroblast_Stellate'], 19 | # dtype='object') 20 | # 21 | # adata = pycogapsresult 22 | # # get readable gene names from original object 23 | # adata_original = sc.read_h5ad("data/PDAC_Peng_Epi.h5ad").T 24 | 25 | # adata = pycogapsresult.T 26 | 27 | # adata.obs["cell type"] = list(coldata["TN_assigned_cell_type_immune_specific"]) 28 | adata = cogapsresult.T 29 | from PyCoGAPS.analysis_functions import * 30 | plotPatternUMAP(adata) 31 | 32 | # pm = patternMarkers(adata, threshold="cut") 33 | # add cell type annotations 34 | adata.var["cell_type"] = adata_original.var["TN_assigned_cell_type_immune_broad"] 35 | 36 | # from PyCoGAPS.parameters import * 37 | # from PyCoGAPS.pycogaps_main import CoGAPS 38 | pm = patternMarkers(adata, threshold="all") 39 | # trying to get hallmark results 40 | markers = pm["PatternMarkers"] 41 | # colnames = list(markers) 42 | # pattern_names = {sub for sub in colnames if sub.startswith('Pattern')} 43 | p1_markers = list(markers["Pattern1"]) -------------------------------------------------------------------------------- /tests/testing.py: -------------------------------------------------------------------------------- 1 | ''' 2 | a script for testing the python interface to cogaps 3 | jeanette johnson 6/14/21 4 | before running: 5 | navigate to your pycogaps directory and run 'pip install .' 6 | ''' 7 | import sys 8 | sys.path.append(".") # Adds higher directory to python modules path. 
9 | 10 | from PyCoGAPS import * # gonna try to only use things from this module 11 | 12 | path = "./data/GIST.csv" 13 | 14 | print("-- Passing params object into runCogaps function --\n") 15 | prm = CoParams(path) 16 | prm.printAllParams() # c++ method to display all parameter values 17 | CoGAPS(path, prm) 18 | cogapsrunresult = CoGAPS(path) 19 | result = cogapsrunresult['GapsResult'] 20 | anndata = cogapsrunresult['anndata'] 21 | plotResiduals(cogapsrunresult) 22 | plotPatternMarkers(cogapsrunresult, legend_pos=None) 23 | plotPatternMarkers(cogapsrunresult, samplePalette=["green", "teal", "red", "violet", "crimson", "antiquewhite", "lightblue", "hotpink", "orange"], patternPalette=["pink", "teal", "gold"], 24 | legend_pos=None) 25 | binaryA(cogapsrunresult, threshold=3) 26 | binaryA(cogapsrunresult, threshold=3, cluster=True) 27 | print("AMEAN: ", result.Amean) 28 | print("chisqHistory: ", result.chisqHistory) 29 | print(getBuildReport()) 30 | print(isCheckpointsEnabled()) 31 | print(isCompiledWithOpenMPSupport()) 32 | print(getFileInfo(path)) 33 | 34 | print("--Testing CogapsResult Object\n") 35 | print("calling show(result)\n") 36 | show(anndata) 37 | print("calling plot(result)\n") 38 | plot(cogapsrunresult) 39 | 40 | calcZP = calcZ(anndata, "sampleFactors") 41 | print(calcZP) 42 | calcZA = calcZ(anndata, "featureLoadings") 43 | print(calcZA) 44 | getVersion() 45 | 46 | print("~~~~~~~~~~~~~ testing CoGAPS Stat Functions ~~~~~~~~~~~~~~") 47 | 48 | dict = calcCoGAPSStat(cogapsrunresult, sets=['Hs.101174', 'Hs.1012']) 49 | print(dict) 50 | 51 | outStats = calcGeneGSStat(cogapsrunresult, GStoGenes=['Hs.101174', 'Hs.1012'], numPerm=1000) 52 | print(outStats) 53 | 54 | finalStats = computeGeneGSProb(cogapsrunresult, GStoGenes=['Hs.101174', 'Hs.1012']) 55 | print(finalStats) -------------------------------------------------------------------------------- /tests/test_seed_consistency.py: -------------------------------------------------------------------------------- 1 
def results_equal(res1, res2):
    """Assert that two CoGAPS runs produced matching results.

    Both arguments are the dictionaries returned by ``CoGAPS``; only their
    ``'GapsResult'`` entries are compared.

    The mean / standard-deviation matrices (``Amean``, ``Asd``, ``Pmean``,
    ``Psd``) must agree within a relative tolerance of 0.1, while the atom
    histories must be exactly equal.

    Raises:
        AssertionError: on the first field that differs.
    """
    first = res1['GapsResult']
    second = res2['GapsResult']

    # Floating point matrices: compared approximately.
    for field in ("Amean", "Asd", "Pmean", "Psd"):
        lhs = toNumpy(getattr(first, field))
        rhs = toNumpy(getattr(second, field))
        assert np.allclose(lhs, rhs, rtol=0.1)

    # Atom count histories: compared exactly.
    for field in ("atomHistoryA", "atomHistoryP"):
        assert getattr(first, field) == getattr(second, field)
{"nIterations": 100, 53 | "seed": 42, 54 | "useSparseOptimization": True}) 55 | res1 = CoGAPS(mtx_path, params, outputFrequency=10, messages=False, nThreads=1) 56 | res2 = CoGAPS(mtx_path, params, outputFrequency=10, messages=False, nThreads =3) 57 | res3 = CoGAPS(mtx_path, params, outputFrequency=10, messages=False, nThreads =6) 58 | results_equal(res1, res2) 59 | results_equal(res1, res3) 60 | results_equal(res2, res3) -------------------------------------------------------------------------------- /vignette.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This file runs PyCoGAPS using user-specified parameter inputs from params.yml 3 | This is intended for running development version of PyCoGAPS (from github) 4 | ''' 5 | 6 | if __name__ == '__main__': 7 | from PyCoGAPS.parameters import * 8 | from PyCoGAPS.pycogaps_main import CoGAPS 9 | 10 | import yaml 11 | import pickle 12 | print("This vignette was built using pycogaps version", getVersion()) 13 | 14 | # read parameter file 15 | with open("params.yaml", "r") as file: 16 | prm = yaml.safe_load(file) 17 | 18 | # if using AWS bucket server 19 | aws_prm = prm['aws_params'] 20 | if aws_prm['useAWS']: 21 | import boto3 22 | s3 = boto3.client('s3') 23 | with open(prm['path'], 'wb') as f: 24 | s3.download_fileobj(aws_prm['downloadBucket'], aws_prm['downloadKey'], f) 25 | 26 | # create CoParams object 27 | params = CoParams(path=prm['path'], transposeData=prm['run_params']['transposeData'], 28 | hdfKey=prm['additional_params']['hdfKey'], hdfRowKey=prm['additional_params']['hdfRowKey'], 29 | hdfColKey=prm['additional_params']['hdfColKey']) 30 | 31 | # set all standard, sparsity, additional parameters 32 | setParams(params, prm['standard_params']) 33 | setParams(params, prm['sparsity_params']) 34 | setParams(params, prm['additional_params']) 35 | 36 | # set fixed patterns from additional params 37 | if prm['additional_params']['fixedPatterns'] is not None: 38 | 
if __name__ == '__main__':
    # Imported explicitly: this block uses os/sys directly and should not rely
    # on the star imports below happening to re-export them.
    import os
    import sys

    from PyCoGAPS.config import *
    from PyCoGAPS.parameters import *
    from PyCoGAPS.pycogaps_main import CoGAPS

    import yaml
    import pickle
    print("This vignette was built using pycogaps version", getVersion())

    # get parameter file from command line input
    if len(sys.argv) < 2:
        sys.exit("usage: python vignette_from_args.py <path to params.yaml>")
    params_file = sys.argv[1]

    # Directory containing the parameter file (leveraging the $PWD tag in the
    # run line). os.path.dirname gives '' for a bare file name, so everything
    # then resolves against the current directory rather than the filesystem
    # root, which is what the previous string splitting produced.
    PWD = os.path.dirname(params_file)
    outdir = os.path.join(PWD, 'output')
    os.makedirs(outdir, exist_ok=True)

    # read parameter file
    with open(params_file, "r") as file:
        prm = yaml.safe_load(file)

    # Note: the path supplied in params.yaml must be relative to the directory
    # that holds the parameter file.
    data_path = os.path.join(PWD, prm['path'])
    # The result is written below the output directory; the same path is
    # reused for the optional upload so both always refer to the same file.
    result_path = os.path.join(outdir, prm['result_file'])

    # if using AWS bucket server
    aws_prm = prm['aws_params']
    if aws_prm['useAWS']:
        import boto3
        s3 = boto3.client('s3')
        # Download to the exact location CoGAPS reads from below.
        with open(data_path, 'wb') as f:
            s3.download_fileobj(aws_prm['downloadBucket'], aws_prm['downloadKey'], f)

    # create CoParams object
    params = CoParams(path=data_path, transposeData=prm['run_params']['transposeData'],
                      hdfKey=prm['additional_params']['hdfKey'], hdfRowKey=prm['additional_params']['hdfRowKey'],
                      hdfColKey=prm['additional_params']['hdfColKey'])

    # set all standard, run, sparsity, additional parameters
    setParams(params, prm['standard_params'])
    setParams(params, prm['run_params'])
    setParams(params, prm['sparsity_params'])
    setParams(params, prm['additional_params'])

    # set fixed patterns from additional params
    if prm['additional_params']['fixedPatterns'] is not None:
        params.setFixedPatterns(fixedPatterns=prm['additional_params']['fixedPatterns'], whichMatrixFixed=prm['additional_params']['whichMatrixFixed'])

    # set distributed parameters
    dist_prm = prm['distributed_params']
    setParam(params, 'distributed', dist_prm['distributed'])
    if dist_prm['distributed'] is not None:
        params.setAnnotationWeights(annotation=dist_prm['samplingAnnotation'], weight=dist_prm['samplingWeight'])
        params.setDistributedParams(nSets=dist_prm['nSets'], cut=dist_prm['cut'], minNS=dist_prm['minNS'], maxNS=dist_prm['maxNS'])

    # run CoGAPS
    result = CoGAPS(data_path, params)

    # save CoGAPS result; the context manager guarantees the pickle is flushed
    # and closed before it is (optionally) re-opened for upload.
    print("Pickling...", end='\r')
    with open(result_path, "wb") as result_handle:
        pickle.dump(result, result_handle)
    print("Pickling complete!")

    if aws_prm['useAWS']:
        # uploadKey is defined under aws_params in params.yaml, not at top level.
        with open(result_path, 'rb') as data:
            s3.upload_fileobj(data, aws_prm['uploadBucket'], aws_prm['uploadKey'])
0.067184 0.8185 3.67 6.0857 4.0611 2.8996 5.7244 7.8973 5.6579 2.0922 0.49826 0.64341 2.1146 4.4726 5.7429 4.4725 2.1127 0.60529 0.10518 0.011086 0.078557 0.95707 4.2911 7.1127 4.7144 3.2089 6.1998 8.5453 6.1219 2.2625 0.52632 0.62245 2.0306 4.2946 5.5143 4.2945 2.0286 0.5812 0.101 0.010645 0.089931 1.0956 4.9122 8.1396 5.3676 3.5182 6.6752 9.1932 6.5859 2.4328 0.55438 0.60148 1.9467 4.1166 5.2857 4.1165 1.9445 0.55711 0.096811 0.010204 0.1013 1.2342 5.5333 9.1665 6.0208 3.8274 7.1506 9.8412 7.05 2.6031 0.58243 0.58052 1.8628 3.9386 5.0571 3.9385 1.8604 0.53302 0.092625 0.0097626 0.11268 1.3728 6.1545 10.193 6.674 4.1367 7.626 10.489 7.514 2.7733 0.61049 0.55955 1.7789 3.7606 4.8286 3.7605 1.7763 0.50893 0.088438 0.0093213 0.12405 1.5113 6.7756 11.22 7.3272 4.446 8.1014 11.137 7.978 2.9436 0.63855 0.53859 1.6949 3.5826 4.6 3.5825 1.6922 0.48484 0.084252 0.0088801 0.13542 1.6499 7.3967 12.247 7.9804 4.7552 8.5768 11.785 8.442 3.1139 0.6666 0.51762 1.611 3.4045 4.3714 3.4045 1.6082 0.46075 0.080066 0.0084388 0.1468 1.7884 8.0179 13.274 8.6337 5.0645 9.0523 12.433 8.906 3.2841 0.69466 0.49666 1.5271 3.2265 4.1429 3.2265 1.5241 0.43665 0.075879 0.0079976 0.15817 1.927 8.639 14.301 9.2869 5.3738 9.5277 13.081 9.37 3.4544 0.72272 0.47569 1.4431 3.0485 3.9143 3.0484 1.44 0.41256 0.071693 0.0075563 0.16955 2.0656 9.2601 15.328 9.9401 5.683 10.003 13.729 9.8341 3.6247 0.75077 0.45473 1.3592 2.8705 3.6857 2.8704 1.3559 0.38847 0.067506 0.0071151 0.18092 2.2041 9.8812 16.355 10.593 5.9923 10.478 14.377 10.298 3.7949 0.77883 0.43376 1.2753 2.6925 3.4571 2.6924 1.2718 0.36438 0.06332 0.0066739 0.19229 2.3427 10.502 17.382 11.247 6.3016 10.954 15.025 10.762 3.9652 0.80689 0.4128 1.1913 2.5145 3.2286 2.5144 1.1877 0.34029 0.059133 0.0062326 0.20367 2.4812 11.123 18.409 11.9 6.6108 11.429 15.673 11.226 4.1355 0.83494 0.39184 1.1074 2.3365 3 2.3364 1.1036 0.3162 0.054947 0.0057914 -------------------------------------------------------------------------------- /params.yaml: 
-------------------------------------------------------------------------------- 1 | ## This file holds all parameters to be passed into PyCoGAPS. 2 | ## To modify default parameters, simply replace parameter values below with user-specified values, and save file. 3 | 4 | # RELATIVE path to data -- make sure to move your data into the created data/ folder 5 | path: data/ModSimData.txt 6 | 7 | # result output file name (output saved as a .h5ad file) 8 | result_file: ModSimResult.h5ad 9 | 10 | standard_params: 11 | # number of patterns CoGAPS will learn 12 | nPatterns: 3 13 | # number of iterations for each phase of the algorithm 14 | nIterations: 1000 15 | # random number generator seed 16 | seed: 0 17 | # speeds up performance with sparse data (roughly >80% of data is zero), note this can only be used with the default uncertainty 18 | useSparseOptimization: False 19 | 20 | run_params: 21 | # maximum number of threads to run on 22 | nThreads: 1 23 | # T/F for displaying output 24 | messages: True 25 | # number of iterations between each output (set to 0 to disable status updates) 26 | outputFrequency: 500 27 | # uncertainty matrix - either a matrix or a supported file type 28 | uncertainty: null 29 | # name of the checkpoint file to create 30 | checkpointOutFile: gaps_checkpoint.out 31 | # number of iterations between each checkpoint (set to 0 to disable checkpoints) 32 | checkpointInterval: 250 33 | # if this is provided, CoGAPS runs from the checkpoint contained in this file 34 | checkpointInFile: null 35 | # T/F for transposing data while reading it in - useful for data that is stored as samples x genes since CoGAPS requires data to be genes x samples 36 | transposeData: False 37 | # if calling CoGAPS in parallel the worker ID can be specified 38 | workerID: 1 39 | # enable asynchronous updating which allows for multi-threaded runs 40 | asynchronousUpdates: True 41 | # how many snapshots to take in each phase, setting this to 0 disables snapshots 42 | nSnapshots: 
0 43 | # which phase to take snapshots in e.g. "equilibration", "sampling", "all" 44 | snapshotPhase: sampling 45 | 46 | sparsity_params: 47 | # sparsity parameter for feature matrix 48 | alphaA: 0.01 49 | # sparsity parameter for sample matrix 50 | alphaP: 0.01 51 | # atomic mass restriction for feature matrix 52 | maxGibbsMassA: 100 53 | # atomic mass restriction for sample matrix 54 | maxGibbsMassP: 100 55 | 56 | distributed_params: 57 | # either null, genome-wide, or single-cell 58 | distributed: null 59 | # number of sets to break data into 60 | nSets: 4 61 | # number of branches at which to cut dendrogram used in pattern matching 62 | # default: nPatterns 63 | cut: null 64 | # minimum of individual set contributions a cluster must contain 65 | # default: math.ceil(cut / 2) 66 | minNS: null 67 | # maximum of individual set contributions a cluster can contain 68 | # default: minNS + nSets 69 | maxNS: null 70 | # specify subsets by index or name 71 | explicitSets: null 72 | # specify categories along the rows (cols) to use for weighted sampling 73 | samplingAnnotation: null 74 | # weights associated with samplingAnnotation 75 | samplingWeight: null 76 | 77 | additional_params: 78 | # set of indices to use from the data 79 | subsetIndices: null 80 | # which dimension (0=rows, 1=cols) to subset 81 | subsetDim: 0 82 | # vector of names of genes in data 83 | geneNames: null 84 | # vector of names of samples in data 85 | sampleNames: null 86 | # fix either 'A' or 'P' matrix to these values, in the context of distributed CoGAPS, the first phase is skipped and `fixedPatterns` 87 | # is used for all sets allowing manual pattern matching, as well as fixed runs of standard CoGAPS 88 | fixedPatterns: null 89 | # either 'A' or 'P', indicating which matrix is fixed 90 | whichMatrixFixed: null 91 | # whether or not to take PUMP samples 92 | takePumpSamples: False 93 | # for reading .h5 files 94 | hdfKey: null 95 | # for reading .h5 files 96 | hdfRowKey: null 97 | # for reading .h5 files 98
# explicitSets either list of indices or names
def sampleWithExplicitSets(allParams, data):
    """ Sample with user provided explicit sets

    Three forms of ``allParams.coparams['explicitSets']`` are accepted:

    * every item is a ``numpy.ndarray`` of indices: returned unchanged;
    * every item is a ``str`` name: each name is translated to its (first)
      position in ``geneNames`` (genome-wide runs) or ``sampleNames``
      (otherwise);
    * every item is a ``list`` / ``tuple`` / ``range`` of integer indices:
      each item is converted to a one-dimensional integer array.

    Args:
        allParams (CoParams): a CoParams object
        data (anndata): anndata object of data

    Raises:
        Exception: If some named genes in explicitSets not found, or if
            explicitSets is not one of the supported forms (previously this
            case silently returned None).

    Returns:
        list: list of subsets
    """
    explicit_sets = allParams.coparams['explicitSets']

    if all(isinstance(item, np.ndarray) for item in explicit_sets):
        print("using provided indexed subsets")
        return explicit_sets

    if all(isinstance(item, str) for item in explicit_sets):
        print("using provided named subsets")
        getDimNames(data, allParams)
        if allParams.coparams['distributed'] == "genome-wide":
            allNames = allParams.coparams['geneNames']
        else:
            allNames = allParams.coparams['sampleNames']

        # Build the name -> first position lookup once instead of scanning
        # the full name list for every requested name.
        firstIndex = {}
        for position, name in enumerate(allNames):
            firstIndex.setdefault(name, position)

        if any(item not in firstIndex for item in explicit_sets):
            raise Exception("some named genes in explicitSets not found")

        return [firstIndex[item] for item in explicit_sets]

    # Plain Python sequences of indices (e.g. as loaded from a YAML file).
    if all(isinstance(item, (list, tuple, range)) for item in explicit_sets):
        index_sets = [np.asarray(item) for item in explicit_sets]
        if all(s.ndim == 1 and s.dtype.kind in "iu" for s in index_sets):
            print("using provided indexed subsets")
            return index_sets

    raise Exception("explicitSets must contain only index collections or only names")
def createSets(data, allParams):
    """ either genes or samples are partitioned depending on the type
    of distributed CoGAPS (i.e. genome-wide or single-cell)

    Rows are partitioned when exactly one of the following holds: the data
    is transposed (``allParams.gaps.transposeData``) or the run is
    genome-wide (``allParams.coparams['distributed'] == "genome-wide"``).
    Otherwise columns are partitioned.

    The subsets come from, in order of precedence: the user supplied
    ``explicitSets``, weighted sampling by ``samplingAnnotation``, or a
    uniform random partition.

    Args:
        data (anndata): anndata object of data
        allParams (CoParams): CoParams object

    Raises:
        Exception: If nSets does not match number of explicit sets given

    Returns:
        list: list of sets
    """
    # The two conditions must be combined with an exclusive or. Writing this
    # as `transposeData != distributed == "genome-wide"` is a chained
    # comparison, i.e. `(transposeData != distributed) and
    # (distributed == "genome-wide")`, which ignores transposeData entirely.
    genomeWide = allParams.coparams['distributed'] == "genome-wide"
    subsetRows = bool(allParams.gaps.transposeData) != genomeWide
    if subsetRows:
        total = nrowHelper(data)
    else:
        total = ncolHelper(data)
    setSize = math.floor(total / allParams.coparams['nSets'])

    print('Creating subsets...')

    if allParams.coparams['explicitSets'] is not None:
        if len(allParams.coparams['explicitSets']) != allParams.coparams['nSets']:
            raise Exception('nSets does not match number of explicit sets given')
        sets = sampleWithExplicitSets(allParams, data)

    elif allParams.coparams['samplingAnnotation'] is not None:
        print('sampling with annotation weights')
        sets = sampleWithAnnotationWeights(allParams, setSize)

    else:
        sets = sampleUniformly(allParams, total, setSize)

    return sets
def no_na_in_result(result):
    """Return True when a CoGAPS result contains no NaN values.

    ``result`` is the dictionary returned by ``CoGAPS``; the ``Amean``,
    ``Asd``, ``Pmean`` and ``Psd`` matrices of its ``'GapsResult'`` entry
    are all inspected.
    """
    # One flag per matrix; every matrix is converted and checked before the
    # flags are combined.
    nan_flags = [
        np.isnan(toNumpy(getattr(result['GapsResult'], field))).any()
        for field in ("Amean", "Asd", "Pmean", "Psd")
    ]
    return not any(nan_flags)
assert(toNumpy(res[0]['GapsResult'].Amean).shape[1] == 3) 59 | assert(toNumpy(res[0]['GapsResult'].Pmean).shape[0] == 1363) 60 | assert(toNumpy(res[0]['GapsResult'].Pmean).shape[1] == 3) 61 | 62 | # multiple threads 63 | print('\n ### Testing Multiple Threads Run ###\n') 64 | res = [None] * 3 65 | res[0] = CoGAPS(csv_path, outputFrequency=50, messages=False, nThreads=2) 66 | res[1] = CoGAPS(csv_path, outputFrequency=50, messages=False, nThreads=6) 67 | res[2] = CoGAPS(csv_path, outputFrequency=50, messages=False, nThreads=12) 68 | for r in res: 69 | assert(no_na_in_result(r) == True) 70 | 71 | 72 | 73 | # test running with fixed matrix 74 | print('\n ### Testing Fixed Matrix Run ###\n') 75 | nPat = 3 76 | fixedA = np.random.uniform(1, 10, (adata.X.shape[0], nPat)) 77 | fixedP = np.random.uniform(1, 10, (adata.X.shape[1], nPat)) 78 | params = CoParams(csv_path) 79 | params.setFixedPatterns(fixedA, "A") 80 | setParams(params, {'nIterations': 100, 81 | 'seed': 42, 82 | 'nPatterns': nPat}) 83 | res = CoGAPS(csv_path, params, outputFrequency=100, messages=False) 84 | 85 | assert(toNumpy(res['GapsResult'].Amean).shape == fixedA.shape) 86 | for i in range(fixedA.shape[1]): 87 | fixedA[:,i] = fixedA[:,i] * (toNumpy(res['GapsResult'].Amean)[0,i] / fixedA[0,i]) 88 | 89 | assert(np.allclose(fixedA, toNumpy(res['GapsResult'].Amean), rtol=1e-3)) 90 | 91 | 92 | params = CoParams(csv_path) 93 | params.setFixedPatterns(fixedP, "P") 94 | setParams(params, {'nIterations': 100, 95 | 'seed': 42, 96 | 'nPatterns': nPat}) 97 | res = CoGAPS(csv_path, params, outputFrequency=100, messages=False) 98 | 99 | assert(toNumpy(res['GapsResult'].Pmean).shape == fixedP.shape) 100 | for i in range(fixedP.shape[1]): 101 | fixedP[:,i] = fixedP[:,i] * (toNumpy(res['GapsResult'].Pmean)[0,i] / fixedP[0,i]) 102 | 103 | assert(np.allclose(fixedP, toNumpy(res['GapsResult'].Pmean), rtol=1e-3)) 104 | 105 | 106 | # testing that None gets converted to NULL for distributed 107 | print('\n ### Testing 
class get_pybind_include:
    """Lazy placeholder for the pybind11 include directory.

    An instance can be listed in ``include_dirs``; pybind11 is imported and
    queried only when the instance is converted to a string, so that the
    import is postponed until pybind11 is actually installed.
    """

    def __init__(self, user=False):
        # Passed through unchanged to pybind11.get_include() in __str__.
        self.user = user

    def __str__(self):
        # Deferred import: resolved at string-conversion time, not at
        # class-definition or construction time.
        from pybind11 import get_include
        return get_include(self.user)
class BuildExt(build_ext):
    """A custom build extension for adding compiler-specific options.

    ``c_opts`` holds the base flags per compiler type; ``build_extensions``
    extends a private copy of them with include paths, the version macro,
    the C++ standard flag and (where supported) hidden symbol visibility,
    then assigns the result to every extension before delegating to the
    parent implementation.
    """
    c_opts = {
        'msvc': ['/EHsc'],
        'unix': [],
    }

    if sys.platform == 'darwin':
        c_opts['unix'] += ['-stdlib=libc++', '-mmacosx-version-min=10.13']

    def build_extensions(self):
        ct = self.compiler.compiler_type
        # Work on a copy: the lists in c_opts are class attributes, so
        # appending to them directly would leak flags into every later call
        # and every other instance.
        opts = list(self.c_opts.get(ct, []))
        # NOTE(review): these are absolute paths and the '*' is not a valid
        # include directory as far as I can tell -- confirm they are still
        # needed alongside the relative include_dirs of the extension.
        opts.append("-I/src/CoGAPS/src/include/")
        opts.append("-I/src/CoGAPS/src/*")
        opts.append("-I/src/CoGAPS/src/data_structures/")
        if ct == 'unix':
            opts.append('-DVERSION_INFO="%s"' % self.distribution.get_version())
            opts.append(cpp_flag(self.compiler))
            if has_flag(self.compiler, '-fvisibility=hidden'):
                opts.append('-fvisibility=hidden')
        elif ct == 'msvc':
            opts.append('/DVERSION_INFO=\\"%s\\"' % self.distribution.get_version())
        for ext in self.extensions:
            # Each extension gets its own list so later per-extension edits
            # cannot affect the others.
            ext.extra_compile_args = list(opts)
        build_ext.build_extensions(self)
url='https://github.com/FertigLab/pycogaps', 135 | description='Python interface to the Non-Negative Matrix Factorization Algorithm CoGAPS', 136 | long_description='', 137 | ext_modules=ext_modules, 138 | install_requires=['pybind11>=2.2'], 139 | cmdclass={'build_ext': BuildExt}, 140 | zip_safe=False, 141 | language="c++" 142 | ) 143 | -------------------------------------------------------------------------------- /PyCoGAPS/distributed_functions.py: -------------------------------------------------------------------------------- 1 | from PyCoGAPS.config import * 2 | from PyCoGAPS.helper_functions import * 3 | from PyCoGAPS.subset_data import * 4 | 5 | import itertools 6 | from sklearn.cluster import AgglomerativeClustering 7 | from scipy.stats.stats import pearsonr 8 | 9 | def findConsensusMatrix(unmatched, params): 10 | # print("FINDING CONSENSUS MATRIX") 11 | # allpatterns = pd.DataFrame(np.hstack(unmatched)) 12 | allpatterns = pd.DataFrame(unmatched) 13 | comb = expandgrid(range(params.coparams["nSets"]), range(params.gaps.nPatterns)) 14 | comb = list(comb.values()) 15 | comb = pd.DataFrame(comb) 16 | comb = pd.DataFrame.transpose(comb) 17 | comb = comb.to_numpy() 18 | names = [] 19 | # print("COMB", comb) 20 | for i in range(comb.shape[0]): 21 | names.append(str(comb[i, 0] + 1) + "." 
+ str(comb[i, 1] + 1)) 22 | allpatterns.columns = names 23 | # print("NAMES", names) 24 | return patternMatch(allpatterns, params) 25 | 26 | 27 | def expandgrid(*itrs): 28 | product = list(itertools.product(*itrs)) 29 | return {'Var{}'.format(i + 1): [x[i] for x in product] for i in range(len(itrs))} 30 | 31 | 32 | def patternMatch(allpatterns, params): 33 | # print("IN PATTERNMATCH") 34 | clusters = corcut(allpatterns, params.coparams["cut"], params.coparams["minNS"]) 35 | maxNS = params.coparams["maxNS"] 36 | # print("MAXNS", maxNS) 37 | 38 | def splitcluster(allpatterns, index, minNS): 39 | # print("IN SPLIT CLUSTER") 40 | # print("LIST", allpatterns) 41 | # print("INDEX", index) 42 | for i in np.arange(len(allpatterns)): 43 | if len(allpatterns[i].columns.intersection(index.columns)) == allpatterns[i].shape[1]: 44 | subindex= i 45 | split = corcut(allpatterns[subindex], 2, minNS) 46 | # print("Length of split", len(split)) 47 | allpatterns[subindex] = split[0] 48 | if len(split) > 1: 49 | allpatterns.append(split[1]) 50 | return allpatterns 51 | 52 | def toolarge(x): 53 | if x is None: 54 | return False 55 | return x.shape[1] > maxNS 56 | 57 | indx = [c for c in clusters if toolarge(c)] 58 | 59 | while len(indx) > 0: 60 | clusters = splitcluster(clusters, indx[0], params.coparams["minNS"]) 61 | indx = [c for c in clusters if toolarge(c)] 62 | # print("SHAPE OF INDX:", len(indx)) 63 | 64 | # print("AFTER SPlITTING--CLUSTERS\n", clusters) 65 | # create matrix of mean patterns - weighted by correlation to mean pattern 66 | 67 | meanpatterns = np.empty((len(clusters[0]), len(clusters))) 68 | for i in range(len(clusters)): 69 | cluster = clusters[i] 70 | if cluster is not None: 71 | cr = np.array(corrToMeanPattern(cluster))**3 72 | meanpat = np.empty((cluster.shape[0])) 73 | for row in range(cluster.shape[0]): 74 | meanpat[row] = np.average(cluster.iloc[row], weights=cr) 75 | meanpatterns[:,i] = meanpat 76 | meanpatterns = np.divide(meanpatterns, 
np.max(meanpatterns, axis=0)) 77 | meanpatterns = pd.DataFrame(data=meanpatterns) 78 | 79 | # returned patterns after scaling max to 1 80 | result = { 81 | "clustered": clusters, 82 | 'consensus': meanpatterns 83 | } 84 | return result 85 | 86 | 87 | def corrToMeanPattern(cluster): 88 | # print("IN CORR TO MEAN PATTERN") 89 | # print("cluster:", cluster) 90 | meanpat = cluster.mean(axis=1, skipna=True) 91 | corrmat = [] 92 | for column in cluster: 93 | corrmat.append(pearsonr(cluster[column], meanpat)[0]) 94 | return corrmat 95 | 96 | 97 | 98 | def stitchTogether(finalresult, result, params, sets, adata): 99 | """ 100 | concatenate final results across subsets 101 | @param result: list of CogapsResult objects 102 | @param params: CoParams object (params used to generate these results) 103 | @param sets: sets used to break apart data 104 | @return final GapsResult object 105 | """ 106 | 107 | print("Stitching results together...") 108 | if params.coparams["distributed"] == "genome-wide": 109 | Amean = finalresult[0].obs 110 | Asd = finalresult[0].uns['asd'] 111 | for r in finalresult[1:]: 112 | Amean = pd.concat([Amean, r.obs]) 113 | Asd = pd.concat([Asd, r.uns["asd"]]) 114 | Amean.reindex(adata.obs_names_make_unique()) 115 | Asd.reindex(adata.obs_names_make_unique()) 116 | Pmean = finalresult[0].var.reindex(adata.var_names) 117 | Psd = finalresult[0].uns['psd'].reindex(adata.var_names) 118 | 119 | else: 120 | Pmean = finalresult[0].var 121 | Psd = finalresult[0].uns['psd'] 122 | for r in finalresult[1:]: 123 | Pmean = pd.concat([Pmean, r.var]) 124 | Psd = pd.concat([Psd,r.uns["psd"]]) 125 | Pmean.reindex_like(adata.var) 126 | Psd.reindex_like(adata.var) 127 | Amean = finalresult[0].obs.reindex(adata.obs_names) 128 | Asd = finalresult[0].uns['asd'].reindex(adata.obs_names) 129 | 130 | # Pmean = np.array(finalresult[0].var) 131 | # Psd = np.array(finalresult[0].uns['psd']) 132 | # for r in finalresult[1:]: 133 | # df1 = np.array(r.var) 134 | # df2 = 
np.array(r.uns['psd']) 135 | # Pmean = np.append(Pmean, df1, axis=0) 136 | # Psd = np.append(Psd, df2, axis=0) 137 | # Pmean = np.array(finalresult[0].obs) 138 | # Psd = np.array(finalresult[0].uns['psd']) 139 | 140 | reslist = { 141 | "Amean": Amean, 142 | "Asd": Asd, 143 | "Pmean": Pmean, 144 | "Psd": Psd 145 | } 146 | return reslist 147 | 148 | 149 | def corcut(allpatterns, cut, minNS): 150 | dist = allpatterns.corr() 151 | dist = 1 - dist 152 | if dist.isnull().values.any(): 153 | warnings.warn("NaN values in correlation of patterns... Aborting") 154 | return 155 | clusters = AgglomerativeClustering(affinity="precomputed", linkage="average", n_clusters=cut).fit(dist) 156 | clustid = [] 157 | for id in set(clusters.labels_): 158 | if np.count_nonzero(clusters.labels_ == id) >= minNS: 159 | indices = [a for a, x in enumerate(clusters.labels_) if x == id] 160 | thislist = allpatterns.iloc[:, indices] 161 | clustid.append(thislist) 162 | else: 163 | warnings.warn("cluster did not meet minNS threshold and will be excluded") 164 | # print("CORCUT cluster IDs:", clustid) 165 | return clustid 166 | 167 | -------------------------------------------------------------------------------- /PyCoGAPS/run_pycogaps.py: -------------------------------------------------------------------------------- 1 | ''' 2 | this script reads parameters from the command line to run CoGAPS 3 | supports integration with genepattern notebook 4 | ''' 5 | 6 | if __name__ == '__main__': 7 | from PyCoGAPS.parameters import * 8 | from PyCoGAPS.pycogaps_main import CoGAPS 9 | import pickle 10 | import argparse 11 | 12 | print("This vignette was built using pycogaps version", getVersion()) 13 | 14 | ''' 15 | command line args which are all parameters to CoGAPS 16 | - only --path arg is required 17 | - all other args are optional, have default values 18 | ''' 19 | parser = argparse.ArgumentParser() 20 | 21 | ## initial params ## 22 | # path to data 23 | parser.add_argument('--path', type=str, 
required=True) 24 | # result output file name (output saved as a .pkl file) 25 | parser.add_argument('--resultFile', type=str, default='result.pkl') 26 | 27 | ## standard params ## 28 | # number of patterns CoGAPS will learn 29 | parser.add_argument('--nPatterns', type=int, default=3) 30 | # number of iterations for each phase of the algorithm 31 | parser.add_argument('--nIterations', type=int, default=1000) 32 | # random number generator seed 33 | parser.add_argument('--seed', type=int, default=0) 34 | # speeds up performance with sparse data (roughly >80% of data is zero), note this can only be used with the default uncertainty 35 | parser.add_argument('--useSparseOptimization', type=bool, default=False) 36 | 37 | ## run params ## 38 | # maximum number of threads to run on 39 | parser.add_argument('--nThreads', type=bool, default=1) 40 | # T/F for displaying output 41 | parser.add_argument('--messages', type=bool, default=True) 42 | # number of iterations between each output (set to 0 to disable status updates) 43 | parser.add_argument('--outputFrequency', type=int, default=500) 44 | # uncertainty matrix - either a matrix or a supported file type 45 | parser.add_argument('--uncertainty', type=str, default=None) 46 | # name of the checkpoint file to create 47 | parser.add_argument('--checkpointOutFile', type=str, default='gaps_checkpoint.out') 48 | # if this is provided, CoGAPS runs from the checkpoint contained in this file 49 | parser.add_argument('--checkpointInFile', type=str, default="") 50 | # T/F for transposing data while reading it in - useful for data that is stored as samples x genes since CoGAPS requires data to be genes x samples 51 | parser.add_argument('--transposeData', type=bool, default=False) 52 | # if calling CoGAPS in parallel the worker ID can be specified 53 | parser.add_argument('--workerID', type=int, default=1) 54 | # enable asynchronous updating which allows for multi-threaded runs 55 | parser.add_argument('--asynchronousUpdates', 
type=bool, default=False) 56 | # how many snapshots to take in each phase, setting this to 0 disables snapshots 57 | parser.add_argument('--nSnapshots', type=int, default=0) 58 | # which phase to take snapsjots in e.g. "equilibration", "sampling", "all" 59 | parser.add_argument('--snapshotPhase', type=str, default='sampling', choices=['sampling', 'equilibration', 'all']) 60 | 61 | ## sparsity params ## 62 | # sparsity parameter for feature matrix 63 | parser.add_argument('--alphaA', type=float, default=0.01) 64 | # sparsity parameter for sample matrix 65 | parser.add_argument('--alphaP', type=float, default=0.01) 66 | # atomic mass restriction for feature matrix 67 | parser.add_argument('--maxGibbsMassA', type=float, default=100) 68 | # atomic mass restriction for sample matrix 69 | parser.add_argument('--maxGibbsMassP', type=float, default=100) 70 | 71 | ## distributed params ## 72 | # either null or genome-wide 73 | parser.add_argument('--distributed', type=str, default=None) 74 | # number of sets to break data into 75 | parser.add_argument('--nSets', type=int, default=4) 76 | # number of branches at which to cut dendrogram used in pattern matching 77 | # default: nPatterns 78 | parser.add_argument('--cut', type=int, default=None) 79 | # minimum of individual set contributions a cluster must contain 80 | # default: math.ceil(cut / 2) 81 | parser.add_argument('--minNS', type=int, default=None) 82 | # maximum of individual set contributions a cluster can contain 83 | # default: minNS + nSets 84 | parser.add_argument('--maxNS', type=int, default=None) 85 | # specify subsets by index or name 86 | parser.add_argument('--explicitSets', type=list, default=None) 87 | # specify categories along the rows (cols) to use for weighted sampling 88 | parser.add_argument('--samplingAnnotation', type=list, default=None) 89 | # weights associated with samplingAnnotation 90 | parser.add_argument('--samplingWeight', type=list, default=None) 91 | 92 | ## additional params ## 93 | # 
set of indices to use from the data 94 | parser.add_argument('--subsetIndices', type=set, default=None) 95 | # which dimension (0=rows, 1=cols) to subset 96 | parser.add_argument('--subsetDim', type=int, default=0, choices=[0,1]) 97 | # vector of names of genes in data 98 | parser.add_argument('--geneNames', type=list, default=None) 99 | # vector of names of samples in data 100 | parser.add_argument('--sampleNames', type=list, default=None) 101 | # fix either 'A' or 'P' matrix to these values, in the context of distributed CoGAPS, the first phase is skipped and `fixedPatterns: 102 | # is used for all sets allowing manual pattern matching, as well as fixed runs of standard CoGAPS 103 | parser.add_argument('--fixedPatterns', default=None) 104 | # either 'A' or 'P', indicating which matrix is fixed 105 | parser.add_argument('--whichMatrixFixed', type=str, default=None, choices=['A', 'P']) 106 | # whether or not to take PUMP samples 107 | parser.add_argument('--takePumpSamples', type=bool, default=False) 108 | # for reading .h5 files 109 | parser.add_argument('--hdfKey', type=str, default=None) 110 | # for reading .h5 files 111 | parser.add_argument('--hdfRowKey', type=str, default=None) 112 | # for reading .h5 files 113 | parser.add_argument('--hdfColKey', type=str, default=None) 114 | 115 | initial_params = ["path", "resultFile"] 116 | 117 | standard_params = ["nPatterns", "nIterations", "seed", "useSparseOptimization"] 118 | 119 | run_params = ["nThreads", "messages", "outputFrequency", "uncertainty", "checkpointOutFile", "checkpointInterval", 120 | "checkpointInFile", "transposeData", "workerID", "asynchronousUpdates", 121 | "nSnapshots", "snapshotPhase"] 122 | 123 | sparsity_params = ["alphaA", "alphaP", "maxGibbsMassA", "maxGibbsMassP"] 124 | 125 | distributed_params = ["distributed", "nSets", "cut", "minNS", "maxNS", 126 | "explicitSets", "samplingAnnotation", "samplingWeight"] 127 | 128 | additional_params = ["subsetIndices", "subsetDim", "geneNames", 
"sampleNames", 129 | "fixedPatterns", "whichMatrixFixed", "takePumpSamples", 130 | "hdfKey", "hdfRowKey", "hdfColKey"] 131 | 132 | ''' 133 | parse all args and set as parameters for CoGAPS 134 | ''' 135 | args = parser.parse_args() 136 | 137 | data_path = args.path 138 | 139 | params = CoParams(path=data_path, transposeData=args.transposeData, hdfKey=args.hdfKey, hdfRowKey=args.hdfRowKey, 140 | hdfColKey=args.hdfColKey) 141 | 142 | prm_dict = vars(args) 143 | 144 | for k,v in prm_dict.items(): 145 | if ((k not in initial_params) and (k not in distributed_params) and (k not in ("fixedPatterns", "uncertainty"))): 146 | setParam(params, k, v) 147 | 148 | # set fixed patterns from additional params 149 | if args.fixedPatterns is not None: 150 | params.setFixedPatterns(fixedPatterns=args.fixedPatterns, whichMatrixFixed=args.whichMatrixFixed) 151 | 152 | # set distributed parameters 153 | setParam(params, 'distributed', args.distributed) 154 | if args.distributed is not None: 155 | params.setAnnotationWeights(annotation=args.samplingAnnotation, weight=args.samplingWeight) 156 | params.setDistributedParams(nSets=args.nSets, cut=args.cut, minNS=args.minNS, maxNS=args.maxNS) 157 | 158 | ''' 159 | run CoGAPS, save result 160 | ''' 161 | result = CoGAPS(data_path, params, uncertainty=args.uncertainty) 162 | 163 | # save CoGAPS result 164 | print("Pickling...", end='\r') 165 | pickle.dump(result, open(args.resultFile, "wb")) 166 | print("Pickling complete!") 167 | -------------------------------------------------------------------------------- /PyCoGAPS/parameters.py: -------------------------------------------------------------------------------- 1 | from PyCoGAPS.config import * 2 | from PyCoGAPS.helper_functions import * 3 | from PyCoGAPS.subset_data import * 4 | 5 | class CoParams: 6 | """ Encapsulates all parameters for PyCoGAPS. 
7 | 8 | """ 9 | 10 | def __init__(self, adata=None, path=None, matrix=None, transposeData=False, hdfKey=None, hdfRowKey=None, 11 | hdfColKey=None): 12 | """ Initializes CoParams object. 13 | self.gaps : GapsParameters object 14 | self.cogaps : dictionary of additional parameters (not in GapsParameters) 15 | 16 | Args: 17 | path (str, optional): Path to data. Defaults to None. 18 | matrix (anndata, optional): AnnData object containing supplied data matrix. Defaults to None. 19 | transposeData (bool, optional): Expects genes x samples. Defaults to False. 20 | hdfKey (str, optional): For reading .h5 files. Defaults to None. 21 | hdfRowKey (str, optional): For reading .h5 files. Defaults to None. 22 | hdfColKey (str, optional): For reading .h5 files. Defaults to None. 23 | 24 | Raises: 25 | Exception: If path or params not passed as an argument. 26 | """ 27 | 28 | if adata is not None: 29 | matrix = pycogaps.Matrix(adata.X) 30 | self.gaps = GapsParameters(matrix) 31 | elif matrix is not None: 32 | self.gaps = GapsParameters(pycogaps.Matrix(matrix.X)) 33 | adata = matrix 34 | elif path is not None: 35 | if isinstance(path, str): 36 | if path.lower().endswith(".h5"): 37 | adata = toAnndata(path, hdfKey, hdfRowKey, hdfColKey, transposeData=transposeData) 38 | elif path.lower().endswith(".txt"): 39 | table = pd.read_table(path, header=None) 40 | adata = anndata.AnnData(table) 41 | # adata.obs_names = table["symbol"] 42 | # we cannot assume this will be in the text file and will trip an error 43 | else: 44 | adata = toAnndata(path, transposeData=transposeData) 45 | matrix = pycogaps.Matrix(adata.X) 46 | self.gaps = GapsParameters(matrix) 47 | else: 48 | raise Exception('initialize with path= or params=') 49 | 50 | self.coparams = {'cut': self.gaps.nPatterns, 51 | 'nSets': 4, 52 | 'minNS': None, 53 | 'maxNS': None, 54 | 'explicitSets': None, 55 | 'samplingAnnotation': None, 56 | 'samplingWeight': None, 57 | 'subsetIndices': None, 58 | 'subsetDim': 0, 59 | 'geneNames': 
adata.obs_names, 60 | 'sampleNames': adata.var_names, 61 | 'fixedPatterns': None, 62 | 'distributed': None, 63 | 'hdfKey': hdfKey, 64 | 'hdfRowKey': hdfRowKey, 65 | 'hdfColKey': hdfColKey, 66 | 'useSparseOptimization': None, 67 | 'transposeData': transposeData, 68 | } 69 | self.coparams['minNS'] = math.ceil(self.coparams['cut'] / 2) 70 | self.coparams['maxNS'] = self.coparams['minNS'] + self.coparams['nSets'] 71 | 72 | def setDistributedParams(self, nSets=None, cut=None, minNS=None, maxNS=None): 73 | """ Sets parameters for running distributed CoGAPS. 74 | 75 | Args: 76 | nSets (int, optional): Number of sets to break data into. Defaults to None. 77 | cut (int, optional): Number of branches at which to cut dendrogram used in pattern matching. Defaults to None. 78 | minNS (int, optional): [description]. Minimum of individual set contributions a cluster must contain. Defaults to None. 79 | maxNS (int, optional): [description]. Maximum of individual set contributions a cluster can contain. Defaults to None. 
80 | """ 81 | 82 | print("setting distributed parameters - call this again if you change nPatterns") 83 | if self.coparams['distributed'] != "genome-wide": 84 | print("if you wish to perform genome-wide distributed cogaps, please run setParams(params, " 85 | "\"distributed\", ""\"genome-wide\")") 86 | if nSets is None: 87 | self.coparams['nSets'] = self.coparams['nSets'] 88 | else: 89 | self.coparams['nSets'] = nSets 90 | if cut is None: 91 | self.coparams['cut'] = self.gaps.nPatterns 92 | else: 93 | self.coparams['cut'] = cut 94 | if minNS is None: 95 | self.coparams['minNS'] = math.ceil(self.coparams['cut'] / 2) 96 | else: 97 | self.coparams['minNS'] = minNS 98 | if maxNS is None: 99 | self.coparams['maxNS'] = self.coparams['minNS'] + self.coparams['nSets'] 100 | else: 101 | self.coparams['maxNS'] = maxNS 102 | 103 | # samplingWeight is a dictionary 104 | # can use: dict(zip(names, weights)) 105 | def setAnnotationWeights(self, annotation, weight): 106 | """ Set annotation weights for distributed CoGAPS. 107 | 108 | Args: 109 | annotation (str list): Specify categories along the rows (cols) to use for weighted sampling. 110 | weight (int list): Weights associated with samplingAnnotation 111 | """ 112 | 113 | self.coparams['samplingAnnotation'] = annotation 114 | self.coparams['samplingWeight'] = weight 115 | 116 | def setFixedPatterns(self, fixedPatterns, whichMatrixFixed): 117 | """ Fix either 'A' or 'P' matrix to given values. 118 | 119 | Args: 120 | fixedPatterns (arr): Fix either 'A' or 'P' matrix to these values, 121 | in the context of distributed CoGAPS, the first phase is skipped and 122 | fixedPatterns is used for all sets - allowing manual pattern matching, 123 | as well as fixed runs of standard CoGAPS. 
124 | whichMatrixFixed (str): Either 'A' or 'P', indicating which matrix is fixed 125 | """ 126 | 127 | self.coparams['fixedPatterns'] = fixedPatterns 128 | self.coparams['whichMatrixFixed'] = whichMatrixFixed 129 | self.gaps.useFixedPatterns = True 130 | self.gaps.fixedPatterns = pycogaps.Matrix(fixedPatterns) 131 | self.gaps.whichMatrixFixed = whichMatrixFixed 132 | 133 | def printParams(self): 134 | """ Print standard and sparsity parameters, and distributed if set. 135 | """ 136 | print('\n-- Standard Parameters --') 137 | print('nPatterns: ', self.gaps.nPatterns) 138 | print('nIterations: ', self.gaps.nIterations) 139 | print('seed: ', self.gaps.seed) 140 | print('sparseOptimization: ', self.gaps.useSparseOptimization) 141 | print('\n') 142 | print('-- Sparsity Parameters --') 143 | print('alpha: {:0.2f}'.format(self.gaps.alphaA)) 144 | print('maxGibbsMass: ', self.gaps.maxGibbsMassA) 145 | print('\n') 146 | if self.gaps.runningDistributed: 147 | print('-- Distributed Parameters --') 148 | print('cut: ', self.coparams['cut']) 149 | print('nSets: ', self.coparams['nSets']) 150 | print('minNS: ', self.coparams['minNS']) 151 | print('maxNS: ', self.coparams['maxNS']) 152 | print('\n') 153 | 154 | def printAllParams(self): 155 | """ Print all GapsParameters. 156 | """ 157 | self.gaps.print() 158 | 159 | def setParams(paramobj: CoParams, list): 160 | """ Set CoParams from a list. 161 | 162 | Args: 163 | paramobj (CoParams): CoParams object. 164 | list (dict): Dictionary of parameter, value pairings for each parameter you wish to set. 165 | """ 166 | 167 | for (k, v) in list.items(): 168 | setParam(paramobj, k, v) 169 | 170 | 171 | # class CogapsParams 172 | # constructor has default values for each parameter 173 | def setParam(paramobj: CoParams, whichParam, value): 174 | """ Sets CoParams parameters. 
175 | 176 | Args: 177 | paramobj (CoParams): a CoParams object 178 | whichParam (str): the name of the parameter you wish to change 179 | value ([type]): the value to set whichParam as 180 | 181 | Returns: 182 | CoParams: the modified CoParams object. 183 | """ 184 | 185 | coparam_params = ['hdfKey', 'hdfRowKey', 'hdfColKey', 'explicitSets', 'subsetDim', 'geneNames', 'sampleNames', 'subsetIndices'] 186 | if whichParam == "alpha": 187 | paramobj.gaps.alphaA = value 188 | paramobj.gaps.alphaP = value 189 | elif whichParam == "maxGibbsMass": 190 | paramobj.gaps.maxGibbsMassA = value 191 | paramobj.gaps.maxGibbsMassP = value 192 | elif whichParam in coparam_params: 193 | if value is not None: 194 | paramobj.coparams[whichParam] = value 195 | elif whichParam == "distributed": 196 | if value == "genome-wide": 197 | print("running genome-wide. if you wish to perform single-cell distributed cogaps, please run setParams(params, " 198 | "\"distributed\", ""\"single-cell\")") 199 | paramobj.gaps.runningDistributed = True 200 | paramobj.coparams['distributed'] = value 201 | elif value == "single-cell": 202 | print("running single-cell. 
if you wish to perform genome-wide distributed cogaps, please run setParams(params, " 203 | "\"distributed\", ""\"genome-wide\")") 204 | paramobj.coparams['distributed'] = 'single-cell' 205 | paramobj.gaps.runningDistributed = True 206 | else: 207 | paramobj.gaps.runningDistributed = False 208 | paramobj.coparams['distributed'] = None 209 | elif whichParam in ("nSets", "cut", "minNS", "maxNS"): 210 | paramobj.gaps.runningDistributed = True 211 | print("please set \'", whichParam, "\' with setDistributedParams") 212 | return 213 | elif whichParam in ("samplingAnnotation", "samplingWeight"): 214 | print("please set \'", whichParam, "\' with setAnnotationWeights") 215 | return 216 | elif whichParam in ("fixedPatterns", "whichMatrixFixed"): 217 | # print("please set \'", whichParam, "\' with setFixedPatterns") 218 | return 219 | elif whichParam == "singleCell": 220 | print(whichParam, " has been deprecated, this parameter will be ignored") 221 | return 222 | elif whichParam == "nThreads": 223 | whichParam = "maxThreads" 224 | setattr(paramobj.gaps, whichParam, value) 225 | elif whichParam == "messages": 226 | whichParam = "printMessages" 227 | setattr(paramobj.gaps, whichParam, value) 228 | elif whichParam == "nSnapshots": 229 | whichParam = "snapshotFrequency" 230 | setattr(paramobj.gaps, whichParam, value) 231 | elif whichParam == "checkpointInFile": 232 | whichParam = "checkpointFile" 233 | setattr(paramobj.gaps, whichParam, value) 234 | elif whichParam == "snapshotPhase": 235 | if value == "sampling": 236 | value = pycogaps.GAPS_SAMPLING_PHASE 237 | elif value == "equilibration": 238 | value = pycogaps.GAPS_EQUILIBRATION_PHASE 239 | elif value == "all": 240 | value = pycogaps.GAPS_ALL_PHASES 241 | # else: 242 | # print("The snapshot phase you indicated is not recognized.") 243 | # print("Please choose one of: sampling, equilibration, all") 244 | # return 245 | setattr(paramobj.gaps, whichParam, value) 246 | else: 247 | setattr(paramobj.gaps, whichParam, value) 248 
| return paramobj 249 | 250 | 251 | def getParam(paramobj, whichParam): 252 | """ Get parameter info and values. 253 | 254 | Args: 255 | paramobj (CoParams): a CoParams object. 256 | whichParam (str): which parameter to get the info of. 257 | 258 | Returns: 259 | [type]: the value of the parameter. 260 | """ 261 | return getattr(paramobj, whichParam) 262 | -------------------------------------------------------------------------------- /PyCoGAPS/pycogaps_main.py: -------------------------------------------------------------------------------- 1 | from PyCoGAPS.config import * 2 | from PyCoGAPS.helper_functions import * 3 | from PyCoGAPS.subset_data import * 4 | from PyCoGAPS.parameters import * 5 | from PyCoGAPS.distributed_functions import * 6 | 7 | import multiprocessing 8 | 9 | 10 | print("""\ 11 | 12 | ______ _____ _____ ___ ______ _____ 13 | | ___ \ / __ \ | __ \ / _ \ | ___ \/ ___| 14 | | |_/ / _| / \/ ___ | | \// /_\ \| |_/ /\ `--. 15 | | __/ | | | | / _ \| | __ | _ || __/ `--. | 16 | | | | |_| | \__/\ (_) | |_\ \| | | || | /\__/ / 17 | \_| \__, |\____/\___/ \____/\_| |_/\_| \____/ 18 | __/ | 19 | |___/ 20 | 21 | """) 22 | 23 | 24 | def CoGAPS(path, params=None, nThreads=1, messages=True, 25 | outputFrequency=1000, uncertainty=None, checkpointOutFile="", 26 | checkpointInterval=0, checkpointInFile="", transposeData=False, 27 | BPPARAM=None, workerID=1, asynchronousUpdates=None, nSnapshots=0, 28 | snapshotPhase='sampling'): 29 | """ Python wrapper to run either standardCoGAPS or distributedCoGAPS. 30 | 31 | Args: 32 | See standardCoGAPS Args. 33 | 34 | Returns: 35 | CogapsResult: A CogapsResult object. 
36 | """ 37 | 38 | 39 | if params.coparams['distributed'] is not None: 40 | result = distributedCoGAPS(path, params, uncertainty=None) 41 | else: 42 | result = standardCoGAPS(path, params=params, nThreads=nThreads, messages=messages, 43 | outputFrequency=outputFrequency, uncertainty=uncertainty, checkpointOutFile=checkpointOutFile, 44 | checkpointInterval=checkpointInterval, checkpointInFile=checkpointInFile, transposeData=transposeData, 45 | BPPARAM=BPPARAM, workerID=workerID, asynchronousUpdates=asynchronousUpdates, nSnapshots=nSnapshots, 46 | snapshotPhase=snapshotPhase) 47 | 48 | return result 49 | 50 | 51 | def standardCoGAPS(path, params=None, nThreads=1, messages=True, 52 | outputFrequency=1000, uncertainty=None, checkpointOutFile="", 53 | checkpointInterval=0, checkpointInFile="", transposeData=False, 54 | BPPARAM=None, workerID=1, asynchronousUpdates=None, nSnapshots=0, 55 | snapshotPhase='sampling'): 56 | """ Python wrapper to run CoGAPS via bindings 57 | 58 | Args: 59 | path (str): Path to data. 60 | params (CoParams, optional): CoParams object of parameters. Defaults to None. 61 | nThreads (int, optional): Number of threads to use. Defaults to 1. 62 | messages (bool, optional): Whether to print messages. Defaults to True. 63 | outputFrequency (int, optional): How often to output messages. Defaults to 1000. 64 | uncertainty (arr, optional): Optional uncertainty matrix. Defaults to None. 65 | checkpointOutFile (str, optional): Path to where checkpoint info should be written. Defaults to "". 66 | checkpointInterval (int, optional): How often to make a checkpoint. Defaults to 0. 67 | checkpointInFile (str, optional): Path to existing checkpoint file to run CoGAPS from. Defaults to "". 68 | transposeData (bool, optional): Expects genes x samples. Defaults to False. 69 | BPPARAM ([type], optional): BiocParallel backend . Defaults to None. 
70 | workerID (int, optional): If calling CoGAPS in parallel the worker ID can be specified, 71 | only worker 1 prints output and each worker outputs when it finishes, this 72 | is not neccesary when using the default parallel methods (i.e. distributed 73 | CoGAPS) but only when the user is manually calling CoGAPS in parallel. Defaults to 1. 74 | asynchronousUpdates (bool, optional): Enable asynchronous updating which allows for multi-threaded runs. Defaults to None. 75 | nSnapshots (int, optional): How many snapshots to take in each phase, setting this to 0 disables snapshots. Defaults to 0. 76 | snapshotPhase (str, optional): One of "sampling", "equilibration", "all". Defaults to 'sampling'. 77 | 78 | Raises: 79 | Exception: If transposeData=True is not passed as an argument to both CoParams and CoGAPS. 80 | 81 | Returns: 82 | CogapsResult: A CogapsResult object. 83 | """ 84 | 85 | # check OpenMP support 86 | if isCompiledWithOpenMPSupport() is False: 87 | if asynchronousUpdates is not None and nThreads > 1: 88 | print("requesting multi-threaded version of CoGAPS but compiler did not support OpenMP") 89 | asynchronousUpdates = False 90 | nThreads = 1 91 | # # convert sampling phase to enum 92 | # if snapshotPhase == "sampling": 93 | # snapshotPhase = pycogaps.GAPS_SAMPLING_PHASE 94 | # elif snapshotPhase == "equilibration": 95 | # snapshotPhase = pycogaps.GAPS_EQUILIBRATION_PHASE 96 | # elif snapshotPhase == "all": 97 | # snapshotPhase = pycogaps.GAPS_ALL_PHASES 98 | # else: 99 | # print("The snapshot phase you indicated is not recognized.") 100 | # print("Please choose one of: sampling, equilibration, all") 101 | # return 102 | 103 | gapsresultobj = None 104 | 105 | # convert data to anndata and matrix obj 106 | if isinstance(path, str): 107 | 108 | if params is not None: 109 | adata = toAnndata(path, params.coparams['hdfKey'], params.coparams['hdfRowKey'], 110 | params.coparams['hdfColKey'], transposeData=transposeData) 111 | else: 112 | adata = toAnndata(path, 
transposeData=transposeData) 113 | else: 114 | adata = path 115 | 116 | matrix = pycogaps.Matrix(adata.X) 117 | 118 | if params is None: 119 | prm = CoParams(matrix=adata, transposeData=transposeData) 120 | else: 121 | prm = params 122 | 123 | opts = { 124 | 'maxThreads': nThreads, 125 | 'printMessages': messages, 126 | 'outputFrequency': outputFrequency, 127 | 'checkpointOutFile': checkpointOutFile, 128 | 'checkpointInterval': checkpointInterval, 129 | 'checkpointFile': checkpointInFile, 130 | 'transposeData': transposeData, 131 | 'workerID': workerID, 132 | 'asynchronousUpdates': asynchronousUpdates, 133 | 'snapshotFrequency': nSnapshots, 134 | 'snapshotPhase': snapshotPhase, 135 | } 136 | setParams(prm, opts) 137 | 138 | ''' 139 | make sure uncertainty matrix processed the same way as adata input 140 | ''' 141 | if uncertainty is not None: 142 | unc = toAnndata(uncertainty) 143 | unc = pycogaps.Matrix(unc.X) 144 | else: 145 | unc = pycogaps.Matrix() 146 | 147 | if prm.coparams["subsetIndices"] is None: 148 | prm = getDimNames(adata, prm) 149 | 150 | # check data input 151 | checkData(adata, prm.gaps, uncertainty) 152 | checkInputs(uncertainty, prm) 153 | 154 | startupMessage(prm, path) 155 | gapsresultobj = pycogaps.runCogapsFromMatrix(matrix, prm.gaps, unc) 156 | prm.gaps.transposeData = transposeData 157 | 158 | if prm.gaps.transposeData != prm.coparams["transposeData"]: 159 | raise Exception("make sure to pass transposeData=True argument in both CoParams() and CoGAPS()") 160 | # no longer returning the legacy formatted object 161 | result = GapsResultToAnnData(gapsresultobj, adata, prm) 162 | show(result) 163 | return result 164 | 165 | 166 | def distributedCoGAPS(path, params, uncertainty=None): 167 | if isinstance(path, str): 168 | data = toAnndata(path, hdf_counts_key=params.coparams["hdfKey"], 169 | hdf_dim1_key=params.coparams["hdfRowKey"], 170 | hdf_dim2_key=params.coparams["hdfColKey"], 171 | transposeData=params.coparams["transposeData"]) 172 | else: 
173 | data = path 174 | sets = createSets(data, params) 175 | if min(map(len, sets)) < params.gaps.nPatterns: 176 | warnings.warn("Data subset dimension less than nPatterns. Aborting.") 177 | return 1 178 | 179 | # setParams(params, {'checkpointOutFile': ""}) 180 | 181 | if params.coparams["fixedPatterns"] is None: 182 | print("Running Across Subsets...\n\n") 183 | with multiprocessing.get_context("spawn").Pool(processes=len(sets)) as pool: 184 | 185 | # make a list of parameters for each function call so they can easily be mapped to processes 186 | paramlst = [] 187 | for i in range(len(sets)): 188 | paramlst.append([data, params, i, sets[i], uncertainty]) 189 | result = pool.imap_unordered(callInternalCoGAPS, paramlst) 190 | pool.close() 191 | pool.join() 192 | result = list(result) 193 | # print("POOL IS NOW CLOSED") 194 | if params.coparams['distributed'] == "genome-wide": 195 | unmatched = np.array(result[0].var) 196 | for i in range(1, len(result)): 197 | arr = np.array(result[i].var) 198 | unmatched = np.append(unmatched, arr, axis=1) 199 | else: 200 | unmatched = np.array(result[0].obs) 201 | for i in range(1, len(result)): 202 | arr = np.array(result[i].obs) 203 | unmatched = np.append(unmatched, arr, axis=1) 204 | print("Matching patterns across subsets...\n") 205 | matched = findConsensusMatrix(unmatched, params) 206 | else: 207 | matched = params.gaps.fixedPatterns 208 | 209 | params.gaps.nPatterns = matched["consensus"].shape[1] 210 | params.gaps.useFixedPatterns = True 211 | params.gaps.fixedPatterns = pycogaps.Matrix(matched["consensus"]) 212 | 213 | # print('=== DEBUG MATRIX FP ===') 214 | # print('np FP: ', matched["consensus"]) 215 | # print('Matrix FP: ', toNumpy(params.gaps.fixedPatterns)) 216 | # print('=== END DEBUG ===') 217 | 218 | # print("FIXED PATTERNS\n", matched["consensus"]) 219 | if params.coparams["distributed"] == "genome-wide": 220 | params.gaps.whichMatrixFixed = "P" 221 | else: 222 | params.gaps.whichMatrixFixed = "A" 223 | 224 | 
print("Running final stage...") 225 | with multiprocessing.get_context("spawn").Pool(processes=len(sets)) as pool: 226 | paramlst = [] 227 | for i in range(len(sets)): 228 | paramlst.append([data, params, i, sets[i], uncertainty]) 229 | finalresult = pool.imap_unordered(callInternalCoGAPS, paramlst) 230 | pool.close() 231 | pool.join() 232 | finalresult = list(finalresult) 233 | 234 | stitched = stitchTogether(finalresult, result, params, sets, data) 235 | 236 | 237 | adata = data 238 | adata.obs = stitched["Amean"] 239 | adata.var = stitched["Pmean"] 240 | adata.uns["asd"] = stitched["Asd"] 241 | adata.uns["psd"] = stitched["Psd"] 242 | adata.uns["atomhistoryA"] = finalresult[0].uns["atomhistoryA"] 243 | adata.uns["atomhistoryP"] = finalresult[0].uns["atomhistoryP"] 244 | adata.uns["averageQueueLengthA"] = finalresult[0].uns["averageQueueLengthA"] 245 | adata.uns["averageQueueLengthP"] = finalresult[0].uns["averageQueueLengthP"] 246 | adata.uns["chisqHistory"] = finalresult[0].uns["chisqHistory"] 247 | adata.uns["equilibrationSnapshotsA"] = finalresult[0].uns["equilibrationSnapshotsA"] 248 | adata.uns["equilibrationSnapshotsP"] = finalresult[0].uns["equilibrationSnapshotsP"] 249 | adata.uns["meanChiSq"] = finalresult[0].uns["meanChiSq"] 250 | adata.uns["meanPatternAssignment"] = finalresult[0].uns["meanPatternAssignment"] 251 | adata.uns["pumpMatrix"] = finalresult[0].uns["pumpMatrix"] 252 | adata.uns["samplingSnapshotsA"] = finalresult[0].uns["samplingSnapshotsA"] 253 | adata.uns["samplingSnapshotsP"] = finalresult[0].uns["samplingSnapshotsP"] 254 | adata.uns["seed"] = finalresult[0].uns["seed"] 255 | adata.uns["totalRunningTime"] = finalresult[0].uns["totalRunningTime"] 256 | adata.uns["totalUpdates"] = finalresult[0].uns["totalUpdates"] 257 | 258 | # gapsresult = pycogaps.GapsResult 259 | # if params.coparams["distributed"] == "genome-wide": 260 | # gapsresult.Amean = pycogaps.Matrix((stitched["Amean"])) 261 | # gapsresult.Asd = 
pycogaps.Matrix((stitched["Asd"])) 262 | # gapsresult.Pmean = pycogaps.Matrix((stitched["Pmean"])) 263 | # gapsresult.Psd = pycogaps.Matrix((stitched["Psd"])) 264 | # 265 | # else: 266 | # gapsresult.Amean = pycogaps.Matrix((stitched["Amean"])) 267 | # gapsresult.Asd = pycogaps.Matrix((stitched["Asd"])) 268 | # gapsresult.Pmean = pycogaps.Matrix((stitched["Pmean"])) 269 | # gapsresult.Psd = pycogaps.Matrix((stitched["Psd"])) 270 | 271 | return adata 272 | 273 | 274 | 275 | def callInternalCoGAPS(paramlst): 276 | # take out parameters passed as a list to the worker process 277 | # print("IN CALL INTERNAL COGAPS") 278 | path = paramlst[0] 279 | params = paramlst[1] 280 | workerID = paramlst[2] 281 | subsetIndices = paramlst[3] 282 | uncertainty = paramlst[4] 283 | if isinstance(path, str): 284 | adata = toAnndata(path) 285 | else: 286 | adata = path 287 | if subsetIndices is None: 288 | print("No subset indices provided; generating random sets...") 289 | subsetIndices = createSets(adata, params) 290 | if params.coparams['distributed'] == "genome-wide": 291 | genes = np.array(params.coparams['geneNames']) 292 | genesubset = np.take(genes, subsetIndices) 293 | params.coparams['geneNames'] = set(genesubset) 294 | adata = adata[subsetIndices, :] 295 | params.coparams['subsetDim'] = 1 296 | else: 297 | samples = np.array(params.coparams['sampleNames']) 298 | samplesubset = np.take(samples, subsetIndices) 299 | params.coparams['sampleNames'] = set(samplesubset) 300 | adata = adata[:, subsetIndices] 301 | params.coparams['subsetDim'] = 2 302 | 303 | params.coparams['subsetIndices'] = subsetIndices 304 | params.gaps.workerID = workerID 305 | params.gaps.asynchronousUpdates = False 306 | # params.gaps.maxThreads = 1 307 | print("Calling internal CoGAPS...\n") 308 | gapsresult = standardCoGAPS(adata, params, uncertainty, transposeData=params.coparams["transposeData"]) 309 | 310 | return gapsresult 311 | 312 | 
-------------------------------------------------------------------------------- /src/bindings.cpp: -------------------------------------------------------------------------------- 1 | #include "CoGAPS/src/GapsRunner.h" 2 | #include "CoGAPS/src/utils/GlobalConfig.h" 3 | #include "CoGAPS/src/GapsParameters.h" 4 | #include "CoGAPS/src/GapsResult.h" 5 | #include "CoGAPS/src/math/Random.h" 6 | #include "CoGAPS/src/cpp_tests/catch.h" 7 | #include "CoGAPS/src/file_parser/FileParser.h" 8 | #include "CoGAPS/src/GapsStatistics.h" 9 | #include "CoGAPS/src/data_structures/Matrix.h" 10 | #include "CoGAPS/src/data_structures/Vector.h" 11 | /* NOTE(review): the header names of the bare include lines below, and the template argument lists further down, appear to have been stripped from this copy; restore them from upstream before compiling. */ 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | #include 19 | 20 | #include 21 | #include 22 | #include 23 | 24 | #define STRINGIFY(x) #x 25 | #define MACRO_STRINGIFY(x) STRINGIFY(x) 26 | namespace py = pybind11; 27 | 28 | 29 | // overload, with given params 30 | /* Runs CoGAPS on the data file at path: seeds a GapsRandomState from params.seed and returns the result of gaps::run, passing an empty std::string as its third argument. */ GapsResult runCogaps(const std::string &path, GapsParameters params) 31 | { 32 | GapsRandomState randState(params.seed); 33 | GapsResult result(gaps::run(path, params, std::string(), &randState)); 34 | return result; 35 | } 36 | /* Same as runCogaps, but the data (mat) and the third gaps::run argument (unc) are in-memory Matrix objects passed by value. */ 37 | GapsResult runCogapsFromMatrix(Matrix mat, GapsParameters params, Matrix unc) 38 | { 39 | GapsRandomState randState(params.seed); 40 | GapsResult result(gaps::run(mat, params, unc, &randState)); 41 | return result; 42 | } 43 | /* Thin wrapper that returns buildReport() so it can be bound for Python. */ 44 | std::string getBuildReport() 45 | { 46 | return buildReport(); 47 | } 48 | /* Returns false when GAPS_DISABLE_CHECKPOINTS was defined at compile time, true otherwise. */ 49 | bool isCheckpointsEnabled() 50 | { 51 | #ifdef GAPS_DISABLE_CHECKPOINTS 52 | return false; 53 | #else 54 | return true; 55 | #endif 56 | } 57 | /* Returns true when __GAPS_OPENMP__ was defined at compile time, false otherwise. */ 58 | bool isCompiledWithOpenMPSupport() 59 | { 60 | #ifdef __GAPS_OPENMP__ 61 | return true; 62 | #else 63 | return false; 64 | #endif 65 | } 66 | /* Opens the file at path with FileParser and returns one string holding its dimensions, row names and column names (names joined with single spaces). */ 67 | std::string getFileInfo(const std::string &path) 68 | { 69 | FileParser fp(path); 70 | return "dimensions: " + std::to_string(fp.nRow()) + ", " + std::to_string(fp.nCol()) 71 | + "\nrowNames: " +
boost::algorithm::join(fp.rowNames(), " ") + "\ncolNames: " + boost::algorithm::join(fp.colNames(), " "); 72 | /* NOTE(review): unreachable, the function has already returned on the statement above. */ return 0; 73 | } 74 | /* NOTE(review): only writes a message to std::cout; no test runner is invoked here. */ 75 | void runCPPTests() 76 | { 77 | std::cout << "running CPPTests"; 78 | } 79 | /* Returns v[i] as a float; the Vector is taken by value. */ 80 | float getElement(Vector v, unsigned i) { 81 | return v[i]; 82 | } 83 | 84 | /* Defines the pycogaps extension module: the free functions above, two enums, and the GapsParameters, GapsResult, Vector and Matrix classes. */ 85 | PYBIND11_MODULE(pycogaps, m) 86 | { 87 | m.doc() = "CoGAPS Python Package"; 88 | m.def("runCogaps", &runCogaps, "Run CoGAPS Algorithm"); 89 | m.def("runCogapsFromMatrix", &runCogapsFromMatrix, "Run CoGAPS Algorithm"); 90 | m.def("runCPPTests", &runCPPTests, "Run CoGAPS C++ Tests"); 91 | m.def("getElement", &getElement, "Get an element of a Vector"); 92 | py::enum_(m, "GapsAlgorithmPhase") 93 | .value("GAPS_EQUILIBRATION_PHASE", GAPS_EQUILIBRATION_PHASE) 94 | .value("GAPS_SAMPLING_PHASE", GAPS_SAMPLING_PHASE) 95 | .value("GAPS_ALL_PHASES", GAPS_ALL_PHASES) 96 | .export_values(); 97 | py::enum_(m, "PumpThreshold") 98 | .value("PUMP_UNIQUE", PUMP_UNIQUE) 99 | .value("PUMP_CUT", PUMP_CUT) 100 | .export_values(); /* GapsParameters: every field below is exposed read-write under its C++ name. */ 101 | py::class_(m, "GapsParameters") 102 | .def(py::init<>()) 103 | .def(py::init()) 104 | .def(py::init()) 105 | .def("print", &GapsParameters::print) 106 | .def_readwrite("checkpointOutFile", &GapsParameters::checkpointOutFile) 107 | .def_readwrite("checkpointFile", &GapsParameters::checkpointFile) 108 | .def_readwrite("seed", &GapsParameters::seed) 109 | .def_readwrite("nGenes", &GapsParameters::nGenes) 110 | .def_readwrite("nSamples", &GapsParameters::nSamples) 111 | .def_readwrite("nPatterns", &GapsParameters::nPatterns) 112 | .def_readwrite("nIterations", &GapsParameters::nIterations) 113 | .def_readwrite("maxThreads", &GapsParameters::maxThreads) 114 | .def_readwrite("outputFrequency", &GapsParameters::outputFrequency) 115 | .def_readwrite("checkpointInterval", &GapsParameters::checkpointInterval) 116 | .def_readwrite("snapshotFrequency", &GapsParameters::snapshotFrequency) 117 | .def_readwrite("alphaA", &GapsParameters::alphaA) 118 | .def_readwrite("alphaP",
&GapsParameters::alphaP) 119 | .def_readwrite("maxGibbsMassA", &GapsParameters::maxGibbsMassA) 120 | .def_readwrite("maxGibbsMassP", &GapsParameters::maxGibbsMassP) 121 | .def_readwrite("pumpThreshold", &GapsParameters::pumpThreshold) 122 | .def_readwrite("snapshotPhase", &GapsParameters::snapshotPhase) 123 | .def_readwrite("useFixedPatterns", &GapsParameters::useFixedPatterns) 124 | .def_readwrite("subsetData", &GapsParameters::subsetData) 125 | .def_readwrite("useCheckPoint", &GapsParameters::useCheckPoint) 126 | .def_readwrite("transposeData", &GapsParameters::transposeData) 127 | .def_readwrite("printMessages", &GapsParameters::printMessages) 128 | .def_readwrite("subsetGenes", &GapsParameters::subsetGenes) 129 | .def_readwrite("printThreadUsage", &GapsParameters::printThreadUsage) 130 | .def_readwrite("useSparseOptimization", &GapsParameters::useSparseOptimization) 131 | .def_readwrite("takePumpSamples", &GapsParameters::takePumpSamples) 132 | .def_readwrite("asynchronousUpdates", &GapsParameters::asynchronousUpdates) 133 | .def_readwrite("whichMatrixFixed", &GapsParameters::whichMatrixFixed) 134 | .def_readwrite("workerID", &GapsParameters::workerID) 135 | .def_readwrite("runningDistributed", &GapsParameters::runningDistributed) 136 | .def_readwrite("dataIndicesSubset", &GapsParameters::dataIndicesSubset) 137 | .def_readwrite("fixedPatterns", &GapsParameters::fixedPatterns) /* Pickle support: the first lambda packs the fields into one flat tuple, the second reads them back by position, so the two orders must stay in sync. */ 138 | .def(py::pickle( 139 | [](const GapsParameters &prm) { 140 | return py::make_tuple(prm.checkpointOutFile, prm.checkpointFile, prm.seed, prm.nGenes, 141 | prm.nSamples, prm.nPatterns, prm.nIterations, prm.maxThreads, prm.outputFrequency, 142 | prm.checkpointInterval, prm.snapshotFrequency, prm.alphaA, prm.alphaP, prm.maxGibbsMassA, 143 | prm.maxGibbsMassP, prm.pumpThreshold, prm.snapshotPhase, prm.useFixedPatterns, 144 | prm.subsetData, prm.useCheckPoint, prm.transposeData, prm.printMessages, prm.subsetGenes, 145 | prm.printThreadUsage, prm.useSparseOptimization, prm.takePumpSamples,
prm.asynchronousUpdates, 146 | prm.whichMatrixFixed, 147 | prm.workerID, 148 | prm.runningDistributed, 149 | prm.dataIndicesSubset, prm.fixedPatterns); 150 | }, 151 | [](py::tuple t) { /* 32 is the number of fields packed by the lambda above. */ 152 | if (t.size() != 32){ 153 | throw std::runtime_error("Invalid state!"); 154 | } 155 | GapsParameters prm; 156 | prm.checkpointOutFile = t[0].cast(); 157 | prm.checkpointFile = t[1].cast(); 158 | prm.seed = t[2].cast(); 159 | prm.nGenes = t[3].cast(); 160 | prm.nSamples = t[4].cast(); 161 | prm.nPatterns = t[5].cast(); 162 | prm.nIterations = t[6].cast(); 163 | prm.maxThreads = t[7].cast(); 164 | prm.outputFrequency = t[8].cast(); 165 | prm.checkpointInterval = t[9].cast(); 166 | prm.snapshotFrequency = t[10].cast(); 167 | prm.alphaA = t[11].cast(); 168 | prm.alphaP = t[12].cast(); 169 | prm.maxGibbsMassA = t[13].cast(); 170 | prm.maxGibbsMassP = t[14].cast(); 171 | prm.pumpThreshold = t[15].cast(); 172 | prm.snapshotPhase = t[16].cast(); 173 | prm.useFixedPatterns = t[17].cast(); 174 | prm.subsetData = t[18].cast(); 175 | prm.useCheckPoint = t[19].cast(); 176 | prm.transposeData = t[20].cast(); 177 | prm.printMessages = t[21].cast(); 178 | prm.subsetGenes = t[22].cast(); 179 | prm.printThreadUsage = t[23].cast(); 180 | prm.useSparseOptimization = t[24].cast(); 181 | prm.takePumpSamples = t[25].cast(); 182 | prm.asynchronousUpdates = t[26].cast(); 183 | prm.whichMatrixFixed = t[27].cast(); 184 | prm.workerID = t[28].cast(); 185 | prm.runningDistributed = t[29].cast(); 186 | prm.dataIndicesSubset = t[30].cast>(); 187 | prm.fixedPatterns = t[31].cast(); 188 | return prm; 189 | } 190 | )); 191 | m.def("getBuildReport", &getBuildReport, "Return build report."); 192 | m.def("isCheckpointsEnabled", &isCheckpointsEnabled, "Return whether checkpoints enabled."); 193 | m.def("isCompiledWithOpenMPSupport", &isCompiledWithOpenMPSupport, "Return whether compiled with Open MP Support."); 194 | m.def("getFileInfo", &getFileInfo, "Get info of inputted file."); 195 | /* GapsResult: result fields are exposed read-write, plus writeToFile and pickle support. */ 196 | py::class_(m,
"GapsResult") 197 | .def(py::init<>()) 198 | .def(py::init()) 199 | .def("writeToFile", &GapsResult::writeToFile) 200 | .def_readwrite("Amean", &GapsResult::Amean) 201 | .def_readwrite("Asd", &GapsResult::Asd) 202 | .def_readwrite("Pmean", &GapsResult::Pmean) 203 | .def_readwrite("Psd", &GapsResult::Psd) 204 | .def_readwrite("pumpMatrix", &GapsResult::pumpMatrix) 205 | .def_readwrite("meanPatternAssignment", &GapsResult::meanPatternAssignment) // Matrix 206 | .def_readwrite("equilibrationSnapshotsA", &GapsResult::equilibrationSnapshotsA) 207 | .def_readwrite("equilibrationSnapshotsP", &GapsResult::equilibrationSnapshotsP) 208 | .def_readwrite("samplingSnapshotsA", &GapsResult::samplingSnapshotsA) 209 | .def_readwrite("samplingSnapshotsP", &GapsResult::samplingSnapshotsP) // std::vector 210 | .def_readwrite("chisqHistory", &GapsResult::chisqHistory) 211 | .def_readwrite("atomHistoryA", &GapsResult::atomHistoryA) 212 | .def_readwrite("atomHistoryP", &GapsResult::atomHistoryP) 213 | .def_readwrite("totalUpdates", &GapsResult::totalUpdates) 214 | .def_readwrite("seed", &GapsResult::seed) 215 | .def_readwrite("totalRunningTime", &GapsResult::totalRunningTime) 216 | .def_readwrite("meanChiSq", &GapsResult::meanChiSq) 217 | .def_readwrite("averageQueueLengthA", &GapsResult::averageQueueLengthA) 218 | .def_readwrite("averageQueueLengthP", &GapsResult::averageQueueLengthP) 219 | .def(py::pickle( 220 | [](const GapsResult &r) { // __getstate__ 221 | /* Return a tuple that fully encodes the state of the object */ 222 | return py::make_tuple(r.Amean, r.Asd, r.Pmean, r.Psd, r.pumpMatrix, 223 | r.meanPatternAssignment, r.equilibrationSnapshotsA, r.equilibrationSnapshotsP, 224 | r.samplingSnapshotsA, r.samplingSnapshotsP, r.chisqHistory, r.atomHistoryA, 225 | r.atomHistoryP, r.totalUpdates, r.seed, r.totalRunningTime, r.meanChiSq, 226 | r.averageQueueLengthA, r.averageQueueLengthP); 227 | }, 228 | [](py::tuple t) { // __setstate__ /* NOTE(review): the size check below is commented out, so a tuple of the wrong length is not rejected before it is indexed; 19 fields are packed above. */ 229 | // if (t.size() != 2) 230 | // throw
std::runtime_error("Invalid state!"); 231 | 232 | /* Create a new C++ instance */ 233 | GapsResult r; 234 | r.Amean = t[0].cast(); 235 | r.Asd = t[1].cast(); 236 | r.Pmean = t[2].cast(); 237 | r.Psd = t[3].cast(); 238 | r.pumpMatrix = t[4].cast(); 239 | r.meanPatternAssignment = t[5].cast(); 240 | r.equilibrationSnapshotsA = t[6].cast>(); 241 | r.equilibrationSnapshotsP = t[7].cast>(); 242 | r.samplingSnapshotsA = t[8].cast>(); 243 | r.samplingSnapshotsP = t[9].cast>(); 244 | r.chisqHistory = t[10].cast>(); 245 | r.atomHistoryA = t[11].cast>(); 246 | r.atomHistoryP = t[12].cast>(); 247 | r.totalUpdates = t[13].cast(); 248 | r.seed = t[14].cast(); 249 | r.totalRunningTime = t[15].cast(); 250 | r.meanChiSq = t[16].cast(); 251 | r.averageQueueLengthA = t[17].cast(); 252 | r.averageQueueLengthP = t[18].cast(); 253 | return r; 254 | } 255 | )); 256 | /* Vector: only a constructor and size() are exposed; elements are read through the module-level getElement. */ 257 | py::class_(m, "Vector") 258 | .def(py::init()) 259 | .def("size", &Vector::size); 260 | 261 | /* Matrix: constructors, nCol, nRow, getCol, buffer-protocol export and pickle support. */ 262 | py::class_(m, "Matrix", py::buffer_protocol()) 263 | .def(py::init<>()) 264 | .def(py::init()) 265 | .def(py::init &>()) 267 | .def(py::init &>()) 269 | // Matrix constructed from numpy array 270 | /* Copies a 2-D array element by element, reading the buffer as densely packed row-major floats. NOTE(review): the strides reported by the buffer are not consulted, so this assumes a C-contiguous input; confirm. */ .def(py::init([](py::array_t b) { 271 | py::buffer_info info = b.request(); 272 | 273 | if (info.ndim != 2) 274 | { 275 | throw std::runtime_error("Incompatible buffer dimension"); 276 | } 277 | 278 | Matrix mat = Matrix(info.shape[0], info.shape[1]); 279 | float *ptr = static_cast(info.ptr); 280 | 281 | for(int i = 0; i < info.shape[0]; i++) 282 | { 283 | for (int j = 0; j < info.shape[1]; j++) 284 | { 285 | mat.operator()(i,j) = ptr[i*info.shape[1] + j]; 286 | } 287 | } 288 | 289 | return mat.getMatrix(); 290 | })) 291 | 292 | .def("nCol", &Matrix::nCol) 293 | .def("nRow", &Matrix::nRow) 294 | .def("getCol", static_cast(&Matrix::getCol), "Get a column of the matrix") 295 | /* Exports the matrix through the buffer protocol starting at element (0,0) with row-major float strides. NOTE(review): this presumes the Matrix storage really is contiguous row-major; verify against Matrix.h. */ 296 | .def_buffer([](Matrix &m) -> py::buffer_info { 297 | return py::buffer_info( 298 | &(m.getMatrix().operator()(0,0)), 299 | sizeof(float), 300 |
py::format_descriptor::format(), 301 | 2, 302 | {m.nRow(), m.nCol()}, 303 | {sizeof(float) * m.nCol(), sizeof(float)} 304 | ); 305 | }) /* Pickle state is the tuple (nCol, nRow, nested per-row value lists); the reader below rebuilds the matrix element by element. */ 306 | .def(py::pickle( 307 | [](const Matrix &m) { // __getstate__ 308 | std::vector> a(m.nRow(), std::vector(m.nCol())); 309 | for (int i = 0; i < m.nRow(); i++) { 310 | for (int j = 0; j < m.nCol(); j++) { 311 | a[i][j] = m.operator()(i,j); 312 | } 313 | } 314 | return py::make_tuple(m.nCol(), m.nRow(), a); 315 | }, 316 | [](py::tuple t) { // __setstate__ 317 | if (t.size() != 3) 318 | throw std::runtime_error("Invalid state!"); 319 | 320 | /* Create a new C++ instance */ 321 | unsigned ncol = t[0].cast(); 322 | unsigned nrow = t[1].cast(); 323 | Matrix m(nrow, ncol); 324 | std::vector> ptr = t[2].cast>>(); 325 | for(int i = 0; i < (int)nrow; i++) 326 | { 327 | for (int j = 0; j < (int)ncol; j++) 328 | { 329 | m.operator()(i,j) = ptr[i][j]; 330 | } 331 | } 332 | return m; 333 | } 334 | )); 335 | } 336 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it.
(Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. 
We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. 
You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 
113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. 
You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 
165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. 
If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. 
If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 
292 | 293 | 294 | Copyright (C) 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 
331 | 332 | , 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 340 | -------------------------------------------------------------------------------- /PyCoGAPS/helper_functions.py: -------------------------------------------------------------------------------- 1 | from PyCoGAPS.config import * 2 | 3 | import h5py 4 | import scipy.io 5 | import pkg_resources # part of setuptools 6 | from pycogaps import getElement 7 | 8 | def supported(file): 9 | """ Checks whether file is supported type 10 | 11 | Args: 12 | file (str): path to data 13 | 14 | Returns: 15 | bool: true/false if file is supported 16 | """ 17 | return file.lower().endswith((".tsv", ".csv", ".mtx", ".h5ad", ".h5", ".gct", ".txt")) 18 | 19 | 20 | def checkData(adata, params, uncertainty=None): 21 | """ Check validity of inputted data 22 | 23 | Args: 24 | adata (anndata): data as anndata object 25 | params (CoParams): CoParams object 26 | uncertainty (arr, optional): optional uncertainty matrix. Defaults to None. 
27 | 28 | Raises: 29 | Exception: If NA values are present in data 30 | Exception: If data is not numeric 31 | Exception: If negative values are in data matrix 32 | Exception: If negative values in uncertainty matrix 33 | Warning: If small values in uncertainty matrix 34 | Exception: If nPatterns is greater than dimensions of data 35 | """ 36 | data = adata.X 37 | 38 | if np.isnan(data).any(): 39 | raise Exception('NA values in data') 40 | if not np.issubdtype(data.dtype, np.number): 41 | raise Exception('data is not numeric') 42 | if np.any((data < 0)): 43 | raise Exception('negative values in data matrix') 44 | if uncertainty != None: 45 | if np.any((uncertainty < 0)): 46 | raise Exception('negative values in uncertainty matrix') 47 | if np.any(uncertainty < 1e-5): 48 | raise Warning('small values in uncertainty matrix detected') 49 | if data.shape[0] <= params.nPatterns | data.shape[1] <= params.nPatterns: 50 | raise Exception('nPatterns must be less than dimensions of data') 51 | 52 | 53 | def toAnndata(file, hdf_counts_key=None, hdf_dim1_key=None, hdf_dim2_key=None, transposeData=False): 54 | """ Converts file to anndata object 55 | 56 | Args: 57 | file (str): path to data 58 | hdf_counts_key (str, optional): if .h5 data inputted. Defaults to None. 59 | hdf_dim1_key (str, optional): if .h5 data inputted. Defaults to None. 60 | hdf_dim2_key (str, optional): if .h5 data inputted. Defaults to None. 61 | transposeData (bool, optional): if data should be transposed. Defaults to False. 
62 | 63 | Raises: 64 | Exception: if unsupported data type 65 | Exception: if dataset name from hdf file is not provided to CoParams 66 | 67 | Returns: 68 | anndata: anndata object 69 | """ 70 | 71 | ''' 72 | TODO: CHANGE TO READ AS NUMPY ARRAY FIRST, THEN MAKE DATAFRAME, THEN MAKE ANNDATA 73 | ''' 74 | if not supported(file): 75 | raise Exception("unsupported data type") 76 | if file.lower().endswith(".csv"): 77 | adata = anndata.read_csv(file) 78 | elif file.lower().endswith(".txt"): 79 | # table = pd.read_table(file) 80 | # adata = anndata.AnnData(table.iloc[:, 2:]) 81 | # adata.obs_names = table["symbol"] 82 | pd_table = pd.read_table(file, header=None) 83 | table = pd.DataFrame(data=pd_table.values, index=pd_table.index, columns=pd_table.columns) 84 | adata = anndata.AnnData(table) 85 | elif file.lower().endswith(".tsv"): 86 | csv_table = pd.read_table(file,sep='\t') 87 | csv_table.to_csv('file.csv', index=False) 88 | adata = anndata.read_csv('{}.csv'.format(os.path.splitext(file)[0])) 89 | elif file.lower().endswith(".mtx"): 90 | adata = anndata.read_mtx(file) 91 | elif file.lower().endswith(".h5ad"): 92 | adata = anndata.read_h5ad(file) 93 | elif file.lower().endswith(".h5"): 94 | if hdf_counts_key is None: 95 | raise Exception("set dataset name from hdf file to use with params = CoParams(path=filename, hdfKey=key") 96 | adata = anndata.read_hdf(file, hdf_counts_key) # user supplied keydata 97 | if transposeData: 98 | if hdf_dim1_key is not None: 99 | adata.obs_names = h5py.File(file, 'r')[hdf_dim1_key] 100 | if hdf_dim2_key is not None: 101 | adata.var_names = h5py.File(file, 'r')[hdf_dim2_key] 102 | else: 103 | if hdf_dim1_key is not None: 104 | adata.var_names = h5py.File(file, 'r')[hdf_dim1_key] 105 | if hdf_dim2_key is not None: 106 | adata.obs_names = h5py.File(file, 'r')[hdf_dim2_key] 107 | elif file.lower().endswith(".gct"): 108 | csv_table = pd.read_csv(file, sep='\t', skiprows=2) 109 | csv_table.to_csv('file.csv', index=False) 110 | adata = 
anndata.read_csv('{}.csv'.format(os.path.splitext(file)[0])) 111 | 112 | if scipy.sparse.issparse(adata.X): 113 | adata.X = (adata.X).toarray() 114 | 115 | if transposeData: 116 | adata = adata.transpose() 117 | 118 | return adata 119 | 120 | 121 | def checkInputs(uncertainty, allParams): 122 | """ Check validity of inputs to CoGAPS. 123 | 124 | Args: 125 | uncertainty (arr): uncertainty matrix 126 | allParams (CoParams): CoParams object 127 | 128 | Raises: 129 | Exception: If unsupported file extension provided for uncertainty matrix. 130 | Exception: If default uncertainty not used with useSparseOptimization=True 131 | Exception: If CoGAPS was built with checkpoints disabled 132 | Exception: If checkpoints not supported for distributed CoGAPS 133 | """ 134 | if uncertainty is not None and not supported(uncertainty): 135 | raise Exception("unsupported file extension for uncertainty") 136 | if uncertainty is not None and allParams.coparams["useSparseOptimization"] is True: 137 | raise Exception("must use default uncertainty when enabling useSparseOptimization") 138 | if allParams.gaps.checkpointFile is not None and not isCheckpointsEnabled(): 139 | raise Exception("CoGAPS was built with checkpoints disabled") 140 | if allParams.gaps.snapshotFrequency > 0: 141 | warnings.warn("snapshots slow down computatioin and shouldo nly be used for testing") 142 | 143 | if allParams.coparams["distributed"] is not None: 144 | if allParams.gaps.maxThreads > 1: 145 | warnings.warn("can't run multi-threaded and distributed CoGAPS at the same time, ignoring nThreads") 146 | if allParams.gaps.checkpointFile != "": 147 | raise Exception("checkpoints not supported for distributed CoGAPS") 148 | 149 | 150 | def nrowHelper(data): 151 | return data.shape[0] 152 | 153 | 154 | def ncolHelper(data): 155 | return data.shape[1] 156 | 157 | 158 | def getGeneNames(data, transpose): 159 | """ Return gene names 160 | 161 | Args: 162 | data (anndata): data as matrix 163 | transpose (bool): if data 
was transposed 164 | 165 | Returns: 166 | list: list of names 167 | """ 168 | if transpose: 169 | return getSampleNames(data, False) 170 | names = data.obs_names 171 | 172 | if names.all() == None or len(names) == 0: 173 | return ["Gene" + str(i) for i in range(1, nrowHelper(data))] 174 | return names 175 | 176 | 177 | def getSampleNames(data, transpose): 178 | """ Return sample names 179 | 180 | Args: 181 | data (anndata): data as matrix 182 | transpose (bool): if data was transposed 183 | 184 | Returns: 185 | list: list of names 186 | """ 187 | if transpose: 188 | return getGeneNames(data, False) 189 | names = data.var_names 190 | 191 | if names.all() == None or len(names) == 0: 192 | return ["Sample" + str(i) for i in range(1, ncolHelper(data))] 193 | return names 194 | 195 | 196 | 197 | def getDimNames(data, allParams): 198 | # support both path and anndata object as data input 199 | """ Get dimension names 200 | 201 | Args: 202 | data (arr): data as matrix 203 | allParams (CoParams): CoParams object 204 | 205 | Returns: 206 | CoParams: updated CoParams object 207 | """ 208 | if isinstance(data, str): 209 | if not supported(data): 210 | raise Exception("unsupported data type") 211 | else: 212 | data = toAnndata(data).X 213 | 214 | geneNames = getGeneNames(data, allParams.gaps.transposeData) 215 | sampleNames = getSampleNames(data, allParams.gaps.transposeData) 216 | 217 | if allParams.gaps.transposeData: 218 | nGenes = ncolHelper(data) 219 | nSamples = nrowHelper(data) 220 | else: 221 | nGenes = nrowHelper(data) 222 | nSamples = ncolHelper(data) 223 | 224 | if allParams.coparams['subsetDim'] == 1: 225 | nGenes = len(allParams.coparams['subsetIndices']) 226 | geneNames = np.take(geneNames, allParams.coparams['subsetIndices']) 227 | elif allParams.coparams['subsetDim'] == 2: 228 | nSamples = len(allParams.coparams['subsetIndices']) 229 | sampleNames = np.take(sampleNames, allParams.coparams['subsetIndices']) 230 | 231 | if len(geneNames) != nGenes: 232 | raise 
Exception(len(geneNames), " != ", nGenes, " incorrect number of gene names given") 233 | if len(sampleNames) != nSamples: 234 | raise Exception(len(sampleNames), " != ", nSamples, " incorrect number of sample names given") 235 | 236 | # store processed gene/sample names directly in allParams list 237 | # this is an important distinction - allParams@gaps contains the 238 | # gene/sample names originally passed by the user, allParams contains 239 | # the procseed gene/sample names to be used when labeling the result 240 | 241 | allParams.coparams['geneNames'] = geneNames 242 | allParams.coparams['sampleNames'] = sampleNames 243 | return (allParams) 244 | 245 | def startupMessage(params, path): 246 | """ Message to display at startup 247 | 248 | Args: 249 | params (CoParams): CoParams object 250 | path (str): path to data 251 | """ 252 | print("\nThis is ", end='') 253 | getVersion() 254 | 255 | dist_message = "Standard" 256 | if params.coparams["distributed"] is not None and params.coparams["distributed"] is not False: 257 | dist_message = params.coparams["distributed"] 258 | if isinstance(path, str): 259 | data_name = os.path.basename(path) 260 | else: 261 | data_name = "provided data object" 262 | 263 | print("Running", dist_message, "CoGAPS on", data_name, "(", len(params.coparams['geneNames']), "genes and", len(params.coparams['sampleNames']),"samples)", 264 | "with parameters: ") 265 | params.printParams() 266 | 267 | 268 | def show(obj: anndata): 269 | """ Concluding message after CoGAPS completes run 270 | 271 | Args: 272 | obj (anndata): anndata object 273 | """ 274 | nfeatures = obj.n_obs 275 | nsamples = obj.n_vars 276 | npatterns = len(obj.obs_keys()) 277 | print("\nGapsResult result object with", nfeatures, "features and", nsamples, "samples") 278 | print(npatterns, "patterns were learned\n") 279 | return 280 | 281 | 282 | 283 | def getFeatureLoadings(object: anndata): 284 | """ Get feature loadings matrix 285 | 286 | Args: 287 | object (anndata): anndata 
object 288 | 289 | Returns: 290 | arr: array of matrix 291 | """ 292 | return object.obs 293 | 294 | 295 | def getAmplitudeMatrix(object): 296 | """ Get amplitude matrix 297 | 298 | Args: 299 | object (anndata): anndata object 300 | 301 | Returns: 302 | arr: array of matrix 303 | """ 304 | return object.obs 305 | 306 | 307 | def getSampleFactors(object): 308 | """ Get sample factors matrix 309 | 310 | Args: 311 | object (anndata): anndata object 312 | 313 | Returns: 314 | arr: array of matrix 315 | """ 316 | return object.var 317 | 318 | 319 | def getPatternMatrix(object): 320 | """ Get pattern matrix 321 | 322 | Args: 323 | object (anndata): anndata object 324 | 325 | Returns: 326 | arr: array of matrix 327 | """ 328 | return object.var 329 | 330 | def getMeanChiSq(object): 331 | """ get mean chi sq 332 | 333 | Args: 334 | object (CogapsResult): CogapsResult object 335 | 336 | Returns: 337 | [type]: mean chi sq value 338 | """ 339 | object = object["GapsResult"] 340 | return object.meanChiSq 341 | 342 | 343 | def getVersion(): 344 | """ Prints version of PyCoGAPS package 345 | 346 | Returns: 347 | str: version number 348 | """ 349 | version = pkg_resources.require("pycogaps")[0].version 350 | print("pycogaps version ", version) 351 | return version 352 | 353 | ''' 354 | # TODO need to access original params through anndata 355 | def getOriginalParameters(object: GapsResult): 356 | print("Not yet implemented") 357 | return 358 | 359 | 360 | def getUnmatchedPatterns(object): 361 | print("Not yet implemented") 362 | return 363 | 364 | 365 | def getClusteredPatterns(object): 366 | print("Not yet implemented") 367 | return 368 | 369 | 370 | def getCorrelationToMeanPattern(object): 371 | print("Not yet implemented") 372 | return 373 | 374 | 375 | def getSubsets(object): 376 | print("Not yet implemented") 377 | return 378 | ''' 379 | 380 | 381 | # convert matrix object to numpy array 382 | def toNumpy(matrix): 383 | """ Convert matrix object to numpy array 384 | 385 | 
Args: 386 | matrix (Matrix): a Matrix object 387 | 388 | Returns: 389 | arr: a numpy array 390 | """ 391 | all_vecdata = np.empty((matrix.nRow(), matrix.nCol())) 392 | for i in range(matrix.nCol()): 393 | vector = matrix.getCol(i) 394 | vecdata = [] 395 | for j in range(vector.size()): 396 | vecdata.append(getElement(vector, j)) 397 | all_vecdata[:, i] = vecdata 398 | return all_vecdata 399 | 400 | def toNumpyFromVector(vector): 401 | """ Convert vector object to numpy array 402 | 403 | Args: 404 | vector (Matrix): a vector object 405 | 406 | Returns: 407 | arr: a numpy array 408 | """ 409 | arr = np.empty(len(vector)) 410 | for j in range(len(vector)): 411 | matrix = getElement(vector, j) 412 | arr[j] = toNumpy(matrix) 413 | return arr 414 | 415 | def GapsResultToAnnData(gapsresult, adata, prm): 416 | """ Converts a CogapsResult object to anndata object. 417 | Args: 418 | gapsresult (CogapsResult): Dictionary result object. 419 | adata (anndata): Anndata object populated by CoGAPS. 420 | prm (CoParams): CoParams object. 421 | Returns: 422 | anndata: An anndata object. 423 | """ 424 | # need to subset matrices based on which dimension we're in... 
425 | if prm.coparams['subsetDim'] == 1: 426 | Amean = toNumpy(gapsresult.Amean)[prm.coparams["subsetIndices"], :] 427 | # if prm.coparams["subsetIndices"] is not None: 428 | # adata = adata[prm.coparams["subsetIndices"], :] 429 | Pmean = toNumpy(gapsresult.Pmean) 430 | Asd = toNumpy(gapsresult.Asd)[prm.coparams["subsetIndices"], :] 431 | Psd = toNumpy(gapsresult.Psd) 432 | else: 433 | Amean = toNumpy(gapsresult.Amean) 434 | Pmean = toNumpy(gapsresult.Pmean)[prm.coparams["subsetIndices"], :] 435 | # if prm.coparams["subsetIndices"] is not None: 436 | # adata = adata[:, prm.coparams["subsetIndices"]] 437 | Asd = toNumpy(gapsresult.Asd) 438 | Psd = toNumpy(gapsresult.Psd)[prm.coparams["subsetIndices"], :] 439 | pattern_labels = ["Pattern" + str(i) for i in range(1, prm.gaps.nPatterns + 1)] 440 | # load adata obs and var from Amean and Pmean results 441 | if len(Pmean.shape) > 2: 442 | Pmean = Pmean[0, :, :] 443 | Psd = Psd[0, :, :] 444 | # if prm.coparams["distributed"] == "genome-wide": 445 | adata.obs = pd.DataFrame(data=Amean, index=adata.obs_names, columns=pattern_labels) 446 | adata.var = pd.DataFrame(data=Pmean, index=adata.var_names, columns=pattern_labels) 447 | adata.uns["asd"] = pd.DataFrame(data=Asd, index=adata.obs_names, columns=pattern_labels) 448 | adata.uns["psd"] = pd.DataFrame(data=Psd, index=adata.var_names, columns=pattern_labels) 449 | # else: 450 | # adata.obs = pd.DataFrame(data=Pmean, index=adata.obs_names, columns=pattern_labels) 451 | # adata.var = pd.DataFrame(data=Amean, index=adata.var_names, columns=pattern_labels) 452 | # adata.uns["asd"] = pd.DataFrame(data=Asd, index=adata.var_names, columns=pattern_labels) 453 | # adata.uns["psd"] = pd.DataFrame(data=Psd, index=adata.obs_names, columns=pattern_labels) 454 | adata.uns["atomhistoryA"] = list(gapsresult.atomHistoryA) 455 | adata.uns["atomhistoryP"] = list(gapsresult.atomHistoryP) 456 | adata.uns["averageQueueLengthA"] = float(gapsresult.averageQueueLengthA) 457 | 
adata.uns["averageQueueLengthP"] = float(gapsresult.averageQueueLengthP) 458 | adata.uns["chisqHistory"] = list(gapsresult.chisqHistory) 459 | adata.uns["equilibrationSnapshotsA"] = toNumpyFromVector(gapsresult.equilibrationSnapshotsA) 460 | adata.uns["equilibrationSnapshotsP"] = toNumpyFromVector(gapsresult.equilibrationSnapshotsP) 461 | adata.uns["meanChiSq"] = float(gapsresult.meanChiSq) 462 | adata.uns["meanPatternAssignment"] = toNumpy(gapsresult.meanPatternAssignment) 463 | adata.uns["pumpMatrix"] = toNumpy(gapsresult.pumpMatrix) 464 | adata.uns["samplingSnapshotsA"] = toNumpyFromVector(gapsresult.samplingSnapshotsA) 465 | adata.uns["samplingSnapshotsP"] = toNumpyFromVector(gapsresult.samplingSnapshotsP) 466 | adata.uns["seed"] = int(gapsresult.seed) 467 | adata.uns["totalRunningTime"] = int(gapsresult.totalRunningTime) 468 | adata.uns["totalUpdates"] = int(gapsresult.totalUpdates) 469 | 470 | return adata 471 | 472 | 473 | 474 | def GapsParameters(path): 475 | """ Returns C++ GapsParameters object. 476 | 477 | Args: 478 | path (str): Path to data. 479 | 480 | Returns: 481 | GapsParameters: A GapsParameters object. 482 | """ 483 | return pycogaps.GapsParameters(path) 484 | 485 | 486 | def getBuildReport(): 487 | """ Returns information about how the package was compiled, i.e. which 488 | compiler/version was used, which compile time options were enabled, etc... 489 | 490 | Returns: 491 | str: String containing build report. 
492 | """ 493 | return pycogaps.getBuildReport() 494 | 495 | 496 | def isCheckpointsEnabled(): 497 | """ Check if package was built with checkpoints enabled 498 | 499 | Returns: 500 | bool: true/false if checkpoints are enabled 501 | """ 502 | return pycogaps.isCheckpointsEnabled() 503 | 504 | 505 | def isCompiledWithOpenMPSupport(): 506 | """ Check if compiler supported OpenMP 507 | 508 | Returns: 509 | bool: true/false if OpenMP was supported 510 | """ 511 | return pycogaps.isCompiledWithOpenMPSupport() 512 | 513 | 514 | def getFileInfo(path): 515 | """ Get info of inputted file. 516 | 517 | Args: 518 | path (str): Path to data. 519 | 520 | Returns: 521 | str: string of file info. 522 | """ 523 | return pycogaps.getFileInfo(path) 524 | 525 | 526 | def current_milli_time(): 527 | """ Return current time in milliseconds. 528 | 529 | Returns: 530 | int: Current time in milliseconds. 531 | """ 532 | return round(time.time() * 1000) 533 | 534 | 535 | 536 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![pycogapslogo_transparent](https://github.com/FertigLab/pycogaps/assets/25310425/6fe64453-f52d-41e1-9def-63d8894c5c18) 2 | 3 | # **PyCoGAPS** 4 | 5 | Coordinated Gene Activity in Pattern Sets (CoGAPS) implements a Bayesian MCMC matrix factorization algorithm, GAPS, and links it to gene set statistic methods to infer biological process activity. It can be used to perform sparse matrix factorization on any data, and when this data represents biomolecules, to do gene set analysis. 6 | 7 | This package, PyCoGAPS, presents a unified Python interface, with a parallel, efficient underlying implementation in C++. The R implementation of CoGAPS can be found here: https://github.com/FertigLab/CoGAPS/ 8 | 9 | ## **Table of Contents** 10 | 11 | 1. [ Using the PyCoGAPS Library ](#1-using-the-pycogaps-library) 12 | 2. 
[ Running PyCoGAPS Using Docker](#2-running-pycogaps-using-docker) 13 | 3. [ Analyzing the PyCoGAPS Result ](#3-analyzing-the-pycogaps-result) 14 | 4. [ Additional Features of PyCoGAPS ](#4-additional-features-of-pycogaps) 15 | 5. [ Citing PyCoGAPS ](#5-citing-pycogaps) 16 | 17 | 18 | 19 | # **1. Using the PyCoGAPS library** 20 | Install: 21 | ``` 22 | git clone https://github.com/FertigLab/pycogaps.git --recursive 23 | cd pycogaps 24 | pip install -r requirements.txt 25 | python3 setup.py install 26 | ``` 27 | When PyCoGAPS has installed and built correctly, you should see this message: 28 | ``` 29 | Finished processing dependencies for pycogaps==0.0.1 30 | ``` 31 | Which means it is ready to use! You may need to install some Python dependencies before everything can build, so don’t be deterred if it takes a couple of tries to install. 32 | 33 | We'll first begin with setting up your working environment and running CoGAPS on an example dataset with default parameters. 34 | 35 | To use the PyCoGAPS python package, import dependencies as follows: 36 | 37 | ``` 38 | from PyCoGAPS.parameters import * 39 | from PyCoGAPS.pycogaps_main import CoGAPS 40 | import scanpy as sc 41 | ``` 42 | NOTE: if you wish to run distributed (parallel), please wrap all subsequent code in this check to avoid thread reentry issues: 43 | ``` 44 | if __name__ == "__main__": 45 | ``` 46 | Part 1 - ModSim Toy data 47 | Load input data (acceptable formats: h5ad, h5, csv, txt, mtx, tsv, gct) 48 | ``` 49 | modsimpath = "data/ModSimData.txt" 50 | modsim = sc.read_text(modsimpath) 51 | ``` 52 | Create a params object and set simple parameters: 53 | ``` 54 | params = CoParams(modsimpath) 55 | setParams(params, { 56 | 'nIterations': 50000, 57 | 'seed': 42, 58 | 'nPatterns': 3 59 | }) 60 | ``` 61 | Run CoGAPS: 62 | ``` 63 | modsimresult = CoGAPS(modsim, params) 64 | ``` 65 | Part 2 - Single Cell Data 66 | Now that PyCoGAPS has been set up and run correctly, we can proceed to analyzing experimental single-cell
data. 67 | ``` 68 | path = "data/inputdata.h5ad" 69 | adata = sc.read_h5ad(path) 70 | ``` 71 | If data has not yet been log-normalized, please do so with the following command: 72 | ``` 73 | adata.X = adata.X.todense() 74 | sc.pp.log1p(adata) 75 | ``` 76 | Now, set run parameters by creating a CoParams object. 77 | ``` 78 | params = CoParams(path) 79 | 80 | setParams(params, { 81 | 'nIterations': 50000, 82 | 'seed': 42, 83 | 'nPatterns': 8, 84 | 'useSparseOptimization': True, 85 | 'distributed': "genome-wide", 86 | }) 87 | ``` 88 | If you are running in parallel, distributed parameters can be modified like this: 89 | ``` 90 | params.setDistributedParams(nSets=7) 91 | ``` 92 | Now, start your CoGAPS run by passing your data object and parameter object. Since CoGAPS runs can take significant time to complete, we recommend keeping track of how run times scale with increasing patterns and iterations. 93 | ``` 94 | start = time.time() 95 | result = CoGAPS(adata, params) 96 | end = time.time() 97 | 98 | print("TIME:", end - start) 99 | ``` 100 | While CoGAPS is running, you will see periodic status messages saying how many iterations have been completed, the current ChiSq value, and how much time has elapsed out of the estimated total runtime. 101 | ``` 102 | 1000 of 50000, Atoms: 5424(A), 21232(P), ChiSq: 138364000, Time: 00:03:47 / 11:13:32 103 | 2000 of 50000, Atoms: 5394(A), 20568(P), ChiSq: 133824536, Time: 00:03:46 / 11:10:34 104 | 3000 of 50000, Atoms: 5393(A), 21161(P), ChiSq: 133621048, Time: 00:03:51 / 11:25:24 105 | 4000 of 50000, Atoms: 5527(A), 22198(P), ChiSq: 137671296, Time: 00:04:00 / 11:52:06 106 | 5000 of 50000, Atoms: 5900(A), 20628(P), ChiSq: 137228688, Time: 00:03:58 / 11:46:10 107 | ``` 108 | When the run is finished, CoGAPS will print a message like this: 109 | ``` 110 | GapsResult result object with 5900 features and 20628 samples 111 | 8 patterns were learned 112 | ``` 113 | We strongly recommend saving your result object as soon as it returns. 
114 | ``` 115 | result.write("data/my_pdac_result.h5ad") 116 | ``` 117 | To save as a .csv file, use the following line: 118 | ``` 119 | result.write_csvs(dirname='./', skip_data=True, sep=',') 120 | ``` 121 | Now you have successfully generated a CoGAPS result! To continue to visualization and analysis guides, please skip to the section below titled “Analyzing the PyCoGAPS Result”. 122 | 123 | # **2. Running PyCoGAPS using Docker** 124 | 125 | The second option for running PyCoGAPS is using a Docker image, which we will pull from the Docker repository, and this contains a set of instructions to build and run PyCoGAPS. With this Docker image, there's no need to install any dependencies, import packages, etc. as the environment is already set up and directly ready to run on your computer. 126 | 127 | Please follow the steps below to run the PyCoGAPS vignette on Mac/Linux OS: 128 | 1. Install Docker at https://docs.docker.com/desktop/mac/install/ 129 | 2. Open the Docker application or paste the following in terminal: 130 | ``` 131 | docker run -d -p 80:80 docker/getting-started 132 | ``` 133 | 3. Copy the commands and paste in terminal (tested on macOS) 134 | 135 | ``` 136 | docker pull fertiglab/pycogaps 137 | mkdir PyCoGAPS 138 | cd PyCoGAPS 139 | curl -O https://raw.githubusercontent.com/FertigLab/pycogaps/master/params.yaml 140 | mkdir data 141 | cd data 142 | curl -O https://raw.githubusercontent.com/FertigLab/pycogaps/master/data/GIST.csv 143 | cd .. 144 | docker run -v $PWD:$PWD fertiglab/pycogaps $PWD/params.yaml 145 | 146 | ``` 147 | For MARCC users, we'll be building the pycogaps package and installing all dependencies in a conda environment. 148 | 149 | Please follow the steps below to run the PyCoGAPS vignette on MARCC: 150 | 1. Copy the commands and paste in terminal 151 | **Note:** If you're encountering an error with `import pycogaps`, make sure that you have the intel/18.0 core module loaded instead of gcc. 
152 | 153 | ``` 154 | git clone --recurse-submodules https://github.com/FertigLab/pycogaps.git 155 | ml anaconda 156 | conda create --name pycogaps python=3.8 157 | conda activate pycogaps 158 | cd pycogaps 159 | pip install -r requirements.txt --user 160 | python3 setup.py install 161 | python3 vignette.py 162 | 163 | ``` 164 | 165 | This produces a CoGAPS run on a simple dataset with default parameters. You should then see the following output: 166 | ``` 167 | This is pycogaps version 0.0.1 168 | Running Standard CoGAPS on GIST.csv (1363 genes and 9 samples) with parameters: 169 | 170 | -- Standard Parameters -- 171 | nPatterns: 3 172 | nIterations: 1000 173 | seed: 0 174 | sparseOptimization: False 175 | 176 | -- Sparsity Parameters -- 177 | alpha: 0.01 178 | maxGibbsMass: 100.0 179 | 180 | GapsResult result object with 1363 features and 9 samples 181 | 3 patterns were learned 182 | 183 | Pickling complete! 184 | ``` 185 | 186 | CoGAPS has successfully completed running and has saved the result file as `result.pkl` in a created `output/` folder. 187 | 188 | Your working directory is the `PyCoGAPS` folder with the following structure and files: 189 | ``` 190 | PyCoGAPS 191 | ├── data 192 | │ └── GIST.csv 193 | ├── params.yaml 194 | ├── output 195 | │ └── result.pkl 196 | ``` 197 | 198 | Now, you're ready to run CoGAPS for analysis on your own data with custom parameters. 199 | 200 | In order to analyze your desired data, we'll need to input it and modify the default parameters before running CoGAPS. All parameter values can be modified directly in the `params.yaml` file already downloaded earlier. 201 | 202 | Please follow the steps below to run PyCoGAPS with custom parameters: 203 | 1. Open `params.yaml` with any text or code editor 204 | 2. Modify the `path` parameter value by replacing the default `data/GIST.csv` with `data/your-datafile-name` 205 | **Note**: Make sure you have moved your data into the created `data/` folder 206 | 3. 
Modify any additional desired parameters and save 207 | 4. For Mac/Linux OS, run the following in terminal: 208 | ``` 209 | docker run -v $PWD:$PWD fertiglab/pycogaps $PWD/params.yaml 210 | ``` 211 | 4. For MARCC, run the following in terminal: 212 | ``` 213 | python3 vignette.py 214 | ``` 215 | ## **Example Snippet of `params.yaml`** 216 | 217 | A snippet of `params.yaml` is shown below, where we have changed some default parameter values to our own specified example values. 218 | 219 | ``` 220 | ## This file holds all parameters to be passed into PyCoGAPS. 221 | ## To modify default parameters, simply replace parameter values below with user-specified values, and save file. 222 | 223 | # RELATIVE path to data -- make sure to move your data into the created data/ folder 224 | path: data/liver_dataset.txt 225 | 226 | # result output file name 227 | result_file: liver_result.pkl 228 | 229 | standard_params: 230 | # number of patterns CoGAPS will learn 231 | nPatterns: 10 232 | # number of iterations for each phase of the algorithm 233 | nIterations: 5000 234 | # random number generator seed 235 | seed: 0 236 | # speeds up performance with sparse data (roughly >80% of data is zero), note this can only be used with the default uncertainty 237 | useSparseOptimization: True 238 | 239 | ... 240 | ``` 241 | 242 | ## **Running CoGAPS in Parallel** 243 | Non-Negative Matrix Factorization algorithms typically require long computation times and CoGAPS is no exception. In order to scale CoGAPS up to the size of data sets seen in practice we need to take advantage of modern hardware and parallelize the algorithm. 244 | 245 | ### **I. Multi-Threaded Parallelization** 246 | The simplest way to run CoGAPS in parallel is to modify the `nThreads` parameter in `params.yaml`. This allows the underlying algorithm to run on multiple threads and has no effect on the mathematics of the algorithm i.e. this is still standard CoGAPS. 
The precise number of threads to use depends on many things like hardware and data size. The best approach is to play around with different values and see how it affects the estimated time. 247 | 248 | A snippet of `params.yaml` is shown below where the `nThreads` parameter is modified. 249 | 250 | ``` 251 | ## This file holds all parameters to be passed into PyCoGAPS. 252 | ... 253 | 254 | run_params: 255 | # maximum number of threads to run on 256 | nThreads: 4 257 | ``` 258 | 259 | Note that this method relies on CoGAPS being compiled with OpenMP support; use `getBuildReport()` to check. 260 | ```python 261 | print(getBuildReport()) 262 | ``` 263 | 264 | ### **II. Distributed CoGAPS** 265 | For large datasets (greater than a few thousand genes or samples) the multi-threaded parallelization isn’t enough. It is more efficient to break up the data into subsets and perform CoGAPS on each subset in parallel, stitching the results back together at the end (Stein-O’Brien et al. (2017)). 266 | 267 | In order to use these extensions, some additional parameters are required, specifically modifying the `distributed_params` in `params.yaml`. We first need to set `distributed` to be `genome-wide`. Next, `nSets` specifies the number of subsets to break the data set into. `cut`, `minNS`, and `maxNS` control the process of matching patterns across subsets and in general should not be changed from defaults. More information about these parameters can be found in the original papers. 268 | 269 | A snippet of `params.yaml` is shown below where `distributed_params` parameters are modified. 270 | 271 | ``` 272 | ## This file holds all parameters to be passed into PyCoGAPS. 273 | ...
274 | 275 | distributed_params: 276 | # either null or genome-wide 277 | distributed: genome-wide 278 | # number of sets to break data into 279 | nSets: 4 280 | # number of branches at which to cut dendrogram used in pattern matching 281 | cut: null 282 | # minimum of individual set contributions a cluster must contain 283 | minNS: null 284 | # maximum of individual set contributions a cluster can contain 285 | maxNS: null 286 | ``` 287 | 288 | Setting `nSets` requires balancing available hardware and run time against the size of your data. In general, `nSets` should be less than or equal to the number of nodes/cores that are available. If that is true, then the more subsets you create, the faster CoGAPS will run - however, some robustness can be lost when the subsets get too small. The general rule of thumb is to set `nSets` so that each subset has between 1000 and 5000 genes or cells. 289 | 290 | 291 | # **3. Analyzing the PyCoGAPS Result** 292 | 293 | ## **Breaking Down the Result Object from CoGAPS** 294 | 295 | anndata result obj 296 | 297 | The CoGAPS result is stored as an `anndata` object. CoGAPS stores the lower dimensional representation of the samples (P matrix) in the `.var` slot and the weight of the features (A matrix) in the `.obs` slot. The standard deviations across sample points for each matrix are stored in the `.uns` slot. 298 | 299 | ## **Analyzing the Result** 300 | We provide two ways to analyze the result from PyCoGAPS. The first uses an interactive notebook interface via the web-based GenePattern Notebook (recommended for less experienced python/programming users), and the second involves running a python script from the command line (recommended for more experienced python/programming users). 301 | 302 | ## **I. GenePattern Notebook** 303 | Follow these steps to use the interactive GenePattern Notebook to analyze results: 304 | 1.
Go to the PyCoGAPS Analysis Notebook found here: https://notebook.genepattern.org/hub/preview?id=440. 305 | 3. Click 'Run' and open 'PyCoGAPS Analysis.ipynb' 306 | 4. Follow the instructions in the notebook to run your analysis. 307 | 308 | ## **II. Python Script via Terminal** 309 | In order to analyze the data, we'll need to make sure to install the necessary dependencies and import the built-in PyCoGAPS functions. 310 | 311 | Make sure you're in the `PyCoGAPS` folder, and copy the following commands in terminal, which will save plots generated from the example data in the `output/` folder: 312 | ``` 313 | cd output 314 | curl -O https://raw.githubusercontent.com/FertigLab/pycogaps/master/PyCoGAPS/analysis_functions.py 315 | curl -O https://raw.githubusercontent.com/FertigLab/pycogaps/master/PyCoGAPS/requirements_analysis.txt 316 | pip install -r requirements_analysis.txt --user 317 | python3 analysis_functions.py result.pkl 318 | ``` 319 | To analyze a different result, replace `result.pkl` with the path to your desired result file in the command line. 320 | 321 | ## **More on the Analysis Functions** 322 | Below details each of the analysis functions included in the package. 323 | 324 | 3.2.1 [ Default Plot ](#241-default-plot) 325 | 3.2.2 [ Residuals Plot ](#242-residuals-plot) 326 | 3.2.3 [ Pattern Markers Plot ](#243-pattern-markers-plot) 327 | 3.2.4 [ Binary Plot ](#244-binarya-plot) 328 | 3.2.5 [ Calculate CoGAPS Statistics ](#245-calculate-cogaps-statistics) 329 | 3.2.6 [ Calculate Gene GSS Statistic ](#246-calculate-gene-gss-statistic) 330 | 3.2.7 [ Calculate Gene GS Probability ](#247-calculate-gene-gs-probability) 331 | 332 | 333 | 334 | ### **I. Default Plot** 335 | By default, the `plot` function displays how the patterns vary across the samples. 336 | 337 | ```python 338 | # plot result object returned from CoGAPS 339 | plot(result) 340 | ``` 341 | show result function 342 | 343 | 344 | 345 | 346 | 347 | ### **II. 
Residuals Plot** 348 | `plotResiduals` calculates residuals and produces a heatmap. 349 | 350 | ```python 351 | plotResiduals(result) 352 | ``` 353 | 354 | plot residuals 355 | 356 | 357 | ### **III. Pattern Markers Plot** 358 | `plotPatternMarkers` plots a heatmap of the original data clustered by the pattern markers statistic, which computes the most associated pattern for each gene. 359 | 360 | ```python 361 | plotPatternMarkers(result, legend_pos=None) 362 | ``` 363 | 364 | plot pattern markers 365 | 366 | 367 | ### **IV. Binary Plot** 368 | `binaryA` creates a binarized heatmap of the A matrix in which the value is 1 if the value in Amean is greater 369 | than `threshold * Asd` and 0 otherwise. 370 | 371 | ```python 372 | binaryA(result, threshold=3) 373 | ``` 374 | 375 | plot binary hm 376 | 377 | 378 | ```python 379 | # plotting clustered binary plot 380 | binaryA(result, threshold=3, cluster=True) 381 | ``` 382 | 383 | plot binary hm, cluster 384 | 385 | 386 | 387 | 388 | 389 | ### **V. Calculate CoGAPS Statistics** 390 | `calcCoGAPSStat` calculates a statistic to determine if a pattern is enriched in a particular set of measurements or samples. 391 | 392 | ```python 393 | # sets is list of sets of measurements/samples 394 | stats = calcCoGAPSStat(result, sets=['Hs.101174', 'Hs.1012']) 395 | ``` 396 | 397 | ``` 398 | {'twoSidedPValue': 399 | Pattern1 0.496 400 | Pattern2 0.353 401 | Pattern3 0.289, 402 | 'GSUpreg': 403 | Pattern1 0.496 404 | Pattern2 0.647 405 | Pattern3 0.711, 406 | 'GSDownreg': 407 | Pattern1 0.504 408 | Pattern2 0.353 409 | Pattern3 0.289, 410 | 'GSActEst': 411 | Pattern1 0.008 412 | Pattern2 -0.294 413 | Pattern3 -0.422} 414 | ``` 415 | 416 | ### **VI. Calculate Gene GSS Statistic** 417 | `calcGeneGSStat` calculates the probability that a gene listed in a gene set behaves like other genes in the set within the given data set.
418 | 419 | ```python 420 | stats = calcGeneGSStat(result, GStoGenes=['Hs.101174', 'Hs.1012'], numPerm=1000) 421 | ``` 422 | 423 | ``` 424 | Hs.101174 0.422955 425 | Hs.1012 0.391747 426 | ``` 427 | 428 | ### **VII. Compute Gene GS Probability** 429 | 430 | `computeGeneGSProb` computes the p-value for gene set membership using the CoGAPS-based statistics developed in Fertig et al. (2012). This statistic refines set membership for each candidate gene in a set specified in GSGenes by comparing the inferred activity of that gene to the average activity of the set. 431 | 432 | ```python 433 | stats = computeGeneGSProb(result, GStoGenes=['Hs.101174', 'Hs.1012']) 434 | ``` 435 | 436 | ``` 437 | Hs.101174 0.617193 438 | Hs.1012 0.887583 439 | ``` 440 | 441 | # **4. Additional Features of PyCoGAPS** 442 | 443 | ## **Checkpoint System: Saving/Loading CoGAPS Runs** 444 | CoGAPS allows the user to save their progress throughout the run, and restart from the latest saved “checkpoint”. This is intended so that if the server crashes in the middle of a long run it doesn’t need to be restarted from the beginning. Set the `checkpointInterval` parameter to save checkpoints and pass a file name as `checkpointInFile` to load from a checkpoint. 445 | 446 | A snippet of `params.yaml` is shown where we enable the checkpoint system, saving CoGAPS run. 447 | ``` 448 | ## This file holds all parameters to be passed into PyCoGAPS. 449 | ... 450 | 451 | run_params: 452 | checkpointOutFile: gaps_checkpoint.out 453 | # number of iterations between each checkpoint (set to 0 to disable checkpoints) 454 | checkpointInterval: 250 455 | # if this is provided, CoGAPS runs from the checkpoint contained in this file 456 | checkpointInFile: null 457 | ``` 458 | 459 | A snippet of `params.yaml` is shown where we now load the saved CoGAPS checkpoint file to continue the run. 460 | ``` 461 | ## This file holds all parameters to be passed into PyCoGAPS. 462 | ... 
463 | 464 | run_params: 465 | checkpointOutFile: null 466 | # number of iterations between each checkpoint (set to 0 to disable checkpoints) 467 | checkpointInterval: null 468 | # if this is provided, CoGAPS runs from the checkpoint contained in this file 469 | checkpointInFile: gaps_checkpoint.out 470 | ``` 471 | 472 | ## **Transposing Data** 473 | If your data is stored as samples x genes, CoGAPS allows you to pass `transposeData: True` and will automatically read the transpose of your data to get the required genes x samples configuration. 474 | 475 | A snippet of `params.yaml` is shown below where the `transposeData` parameter is enabled. 476 | ``` 477 | ## This file holds all parameters to be passed into PyCoGAPS. 478 | ... 479 | 480 | run_params: 481 | # T/F for transposing data while reading it in - useful for data that is stored as samples x genes since CoGAPS requires data to be genes x samples 482 | transposeData: True 483 | ``` 484 | 485 | ## **Passing Uncertainty Matrix** 486 | In addition to providing the data, the user can also specify an uncertainty measurement - the standard deviation of each entry in the data matrix. By default, CoGAPS assumes that the standard deviation matrix is 10% of the data matrix. This is a reasonable heuristic to use, but for specific types of data you may be able to provide better information. Make sure to save your uncertainty file into the `data/` folder. 487 | 488 | A snippet of `params.yaml` is shown below where an uncertainty file is passed via the `uncertainty` parameter. 489 | ``` 490 | ## This file holds all parameters to be passed into PyCoGAPS. 491 | ... 492 | 493 | run_params: 494 | # uncertainty matrix - either a matrix or a supported file type 495 | uncertainty: data/GIST_uncertainty.csv 496 | ``` 497 | 498 | ## **Subsetting Data** 499 | The default method for subsetting the data is to uniformly break up the rows (cols) of the data.
There is an alternative option where the user provides an annotation vector for the rownames (colnames) of the data and gives a weight to each category in the annotation vector. Equal-sized subsets are then drawn by sampling all rows (cols) according to the weight of each category. 500 | 501 | A snippet of `params.yaml` is shown below where `distributed_params` parameters are modified to subset the data. 502 | ``` 503 | ## This file holds all parameters to be passed into PyCoGAPS. 504 | ... 505 | 506 | distributed_params: 507 | # specify categories along the rows (cols) to use for weighted sampling 508 | samplingAnnotation: ['IM00', 'IM02', 'IM00'] 509 | # weights associated with samplingAnnotation 510 | samplingWeight: {'IM00': 2, 'IM02': 0.5} 511 | ``` 512 | 513 | Finally, the user can set `explicitSets` which is a list of character or numeric vectors indicating which names or indices of the data should be put into each set. Make sure to set `nSets` to the correct value before passing `explicitSets`. 514 | 515 | A snippet of `params.yaml` is shown below where `distributed_params` parameters are modified to subset the data. 516 | ``` 517 | ## This file holds all parameters to be passed into PyCoGAPS. 518 | ... 519 | 520 | distributed_params: 521 | # number of sets to break data into 522 | nSets: 2 523 | # specify subsets by index or name 524 | explicitSets: ['IM00', 'IM02'] 525 | ``` 526 | 527 | # **5. Citing PyCoGAPS** 528 | If you use PyCoGAPS for your analysis, please cite Johnson et al. (2022). 529 | 530 | If you use the CoGAPS package for your analysis, please cite Fertig et al. (2010). 531 | 532 | If you use the gene set statistic, please cite Ochs et al. (2009). 533 | 534 | # **References** 535 | Johnson, Jeanette, Ashley Tsang, Jacob Mitchell, David Zhou, et al. Inferring cellular and molecular processes in single-cell data with non-negative matrix factorization using Python, R, and GenePattern Notebook implementations of CoGAPS.
bioRxiv (2022) doi:10.1101/2022.07.09.499398. 536 | 537 | Fertig, Elana J., Jie Ding, Alexander V. Favorov, Giovanni Parmigiani, and Michael F. Ochs. 2010. “CoGAPS: An R/C++ Package to Identify Patterns and Biological Process Activity in Transcriptomic Data.” Bioinformatics 26 (21): 2792–3. https://doi.org/10.1093/bioinformatics/btq503. 538 | 539 | Ochs, Michael F., Lori Rink, Chi Tarn, Sarah Mburu, Takahiro Taguchi, Burton Eisenberg, and Andrew K. Godwin. 2009. “Detection of Treatment-Induced Changes in Signaling Pathways in Gastrointestinal Stromal Tumors Using Transcriptomic Data.” Cancer Research 69 (23): 9125–32. https://doi.org/10.1158/0008-5472.CAN-09-1709. 540 | 541 | Seung, Sebastian, and Daniel D. Lee. 1999. “Learning the Parts of Objects by Non-Negative Matrix Factorization.” Nature 401 (6755): 788–91. https://doi.org/10.1038/44565. 542 | 543 | Stein-O’Brien, Genevieve L., Jacob L. Carey, Wai S. Lee, Michael Considine, Alexander V. Favorov, Emily Flam, Theresa Guo, et al. 2017. “PatternMarkers & Gwcogaps for Novel Data-Driven Biomarkers via Whole Transcriptome Nmf.” Bioinformatics 33 (12): 1892–4. https://doi.org/10.1093/bioinformatics/btx058. 544 | -------------------------------------------------------------------------------- /PyCoGAPS/analysis_functions.py: -------------------------------------------------------------------------------- 1 | # from PyCoGAPS.config import * 2 | # from PyCoGAPS.helper_functions import * 3 | import pandas as pd 4 | import numpy as np 5 | import anndata 6 | import warnings 7 | import matplotlib as mpl 8 | import matplotlib.pyplot as plt 9 | import seaborn as sns 10 | from scipy.stats import zscore 11 | 12 | def plot(obj, groups=None, title=None, fn=""): 13 | """ Plots how patterns vary across samples 14 | 15 | Args: 16 | obj (CogapsResult): CogapsResult object 17 | groups (str list, optional): list of groups. Defaults to None. 18 | title (str, optional): title of plot. Defaults to None. 
19 | 20 | Returns: 21 | fig: figure of plot 22 | """ 23 | 24 | if groups is not None: 25 | if len(groups) == len(obj.var_names): 26 | obj.var_names = groups 27 | else: 28 | warnings.warn("length of groups does not match number of samples, aborting...") 29 | return 30 | samples = obj.var 31 | samplenames = list(set(obj.var_names)) 32 | patterns = list(obj.var.columns) 33 | fig = plt.figure() 34 | ax = fig.add_subplot(111) 35 | 36 | for pattern in patterns: 37 | groupavgs = [] 38 | for name in samplenames: 39 | groupavgs.append(samples.loc[name][pattern].mean()) 40 | ax.plot(np.array(range(1, len(samplenames) + 1)), groupavgs, label=pattern) 41 | ax.legend() 42 | plt.xlabel("Groups") 43 | plt.ylabel("Relative Amplitude") 44 | plt.xticks(np.arange(1, len(samplenames) + 1), samplenames, rotation=45, ha="right") 45 | plt.subplots_adjust(bottom=0.15) 46 | if title is not None: 47 | ax.set_title(title) 48 | else: 49 | ax.set_title('Patterns over Samples') 50 | 51 | else: 52 | samples = obj.var 53 | nsamples = np.shape(samples)[0] 54 | fig = plt.figure() 55 | ax = fig.add_subplot(111) 56 | for factor in list(samples): 57 | ax.plot(np.array(range(1, nsamples + 1)), samples[factor], label=factor) 58 | ax.legend() 59 | plt.xlabel("Samples") 60 | plt.ylabel("Relative Amplitude") 61 | if title is not None: 62 | ax.set_title(title) 63 | else: 64 | ax.set_title('Patterns over Samples') 65 | plt.savefig("{}_plot.png".format(fn)) 66 | plt.show() 67 | return fig 68 | 69 | def patternGSEA(obj, patternmarkers=None, verbose=True, gene_sets = ['MSigDB_Hallmark_2020'], organism="human"): 70 | """ Run pygsea enrichr API on gene list for each pattern, return dictionary of results 71 | 72 | Args: 73 | obj (anndata): Anndata CoGAPSresult object 74 | patternmarkers (optional, default = None): output from patternMarkers() function 75 | verbose (optional, default = True): Indicates whether to print messages 76 | gene_sets (optional, default = ['MSigDB_Hallmark_2020']): a list of one or more 
gene sets for enrichr to use 77 | organism (optional, default = "human"): which organism your data comes from 78 | Returns: 79 | gseares: a dictionary of pygsea enrichr API results for each pattern in obj 80 | """ 81 | import pandas as pd 82 | pd.options.mode.chained_assignment = None # default='warn' 83 | if patternmarkers is None: 84 | patternmarkers = patternMarkers(obj, threshold="all") 85 | 86 | markers = patternmarkers["PatternMarkers"] 87 | import gseapy as gp 88 | print("This is a wrapper function around the pygsea enrichr API. \n" 89 | "Documentation can be found at: https://gseapy.readthedocs.io/en/latest/introduction.html") 90 | gsea_results = dict() 91 | for pattern in markers: 92 | geneset = list(markers[pattern]) 93 | if(verbose): 94 | print("\nUsing " + str(len(geneset)) +" markers of "+ pattern + ":\n") 95 | print(geneset) 96 | gsea_enr_res = gp.enrichr(gene_list=geneset, 97 | gene_sets=gene_sets, 98 | organism=organism, 99 | outdir=None, # don't write to disk 100 | verbose=verbose, 101 | ) 102 | 103 | gsea_enr_df = pd.DataFrame(gsea_enr_res.results) 104 | # plot_enr_df = gsea_enr_df[gsea_enr_df["P-value"] < 0.05] 105 | plot_enr_df = gsea_enr_df 106 | neg_log_q = (-10) * np.log10(list(plot_enr_df["P-value"])) 107 | plot_enr_df["neg.log.q"] = neg_log_q 108 | 109 | gsea_results[pattern] = plot_enr_df 110 | # sns.barplot(data=plot_enr_df, x="neg.log.q", y="Term") 111 | return gsea_results 112 | 113 | def plotPatternGSEA(patternGSEAResults, whichPattern): 114 | import seaborn as sns 115 | patternName = list(patternGSEAResults.keys())[whichPattern - 1] 116 | 117 | gsea_enr_df = patternGSEAResults[patternName] 118 | plot_df = gsea_enr_df[gsea_enr_df["P-value"] < 0.05] # only want to see significant terms 119 | sns.barplot(data=plot_df, x="neg.log.q", y="Term").set_title(patternName + " Enriched Terms") 120 | plt.show() 121 | return plt 122 | 123 | def MANOVA(obj, orig, interested_vars): 124 | """ performs MANOVA test on user given dependent variables 125 
| 126 | Args: 127 | obj (anndata): Anndata CoGAPSresult object 128 | orig (anndata): Anndata original data 129 | interested_vars (list, default = None): output from patternMarkers() function 130 | Returns: 131 | manova_result: manova result output from calling statsmodels.multivariate.manova function 132 | """ 133 | from statsmodels.multivariate.manova import MANOVA 134 | 135 | # create formula string from interested groups 136 | formula = '' 137 | for i in interested_vars: 138 | formula += ' + ' + i 139 | formula = formula[3:] 140 | 141 | # patterns 142 | pmat = obj.var 143 | npatterns = len(pmat.columns) 144 | 145 | # get columns of interest 146 | interested = orig.var[interested_vars] 147 | 148 | for p in range(1,npatterns): 149 | pattern = 'Pattern_' + str(p) 150 | data = pd.concat([pmat[pattern], interested], axis=1) 151 | 152 | manova_result = MANOVA.from_formula(formula + ' ~ ' + pattern, data) 153 | 154 | print(pattern + ' MANOVA result:') 155 | print(manova_result.mv_test()) 156 | 157 | return manova_result 158 | 159 | def patternBoxPlot(obj, groups, fn=""): 160 | """ generate a boxplot where each subplot displays amplitudes for each group for each pattern 161 | 162 | Args: 163 | obj (CogapsResult): CogapsResult object 164 | groups (str list): list of groups. 
165 | 166 | Returns: 167 | fig: figure of plot 168 | """ 169 | 170 | # obj = obj['anndata'] 171 | if len(groups) == len(obj.var_names): 172 | obj.var_names = groups 173 | else: 174 | warnings.warn("length of groups does not match number of samples, aborting...") 175 | return 176 | samples = obj.var 177 | samplenames = list(set(obj.var_names)) 178 | patterns = list(obj.var.columns) 179 | for i in np.arange(0,4): 180 | thispattern = samples[patterns[i]] 181 | data = [] 182 | for name in samplenames: 183 | data.append(thispattern.loc[name].values) 184 | df = pd.DataFrame(data) 185 | df = df.transpose() 186 | df.columns = samplenames 187 | ax = plt.subplot(2,2,i+1) 188 | ax.set_title(patterns[i]) 189 | ax.set_xlabel("Groups") 190 | ax.set_ylabel("Amplitude") 191 | plt.tight_layout() 192 | df.boxplot(ax=ax, rot=20, fontsize=6) 193 | plt.savefig("{}_patternBoxPlot.png".format(fn)) 194 | return df 195 | 196 | 197 | def calcZ(object: anndata, whichMatrix): 198 | """ Calculates the Z-score for each element based on input mean and standard deviation matrices 199 | 200 | Args: 201 | object (anndata): Anndata result object 202 | whichMatrix (str): either "featureLoadings" or "sampleFactors" indicating which matrix to calculate the z-score for 203 | 204 | Returns: 205 | arr: matrix of z scores 206 | """ 207 | if whichMatrix in "sampleFactors": 208 | mean = object.var 209 | stddev = object.uns["psd"] 210 | elif whichMatrix in "featureLoadings": 211 | mean = object.obs 212 | stddev = object.uns["asd"] 213 | else: 214 | print('whichMatrix must be either \'featureLoadings\' or \'sampleFactors\'') 215 | return 216 | if 0 in stddev.values: 217 | print("zeros detected in the standard deviation matrix; they have been replaced by small values") 218 | stddev[stddev == 0] = 1 ** -6 219 | return mean / stddev 220 | 221 | 222 | def reconstructGene(object: anndata, genes=None): 223 | """[summary] 224 | 225 | Args: 226 | object (anndata): Anndata result object 227 | genes (int, optional): an 
index of the gene or genes of interest. Defaults to None. 228 | 229 | Returns: 230 | arr: the D' estimate of a gene or set of genes 231 | """ 232 | D = np.dot(object.obs, np.transpose(object.var)) 233 | if genes is not None: 234 | D = D[genes, ] 235 | return D 236 | 237 | 238 | def binaryA(object, threshold, nrows="all", cluster=False, fn=""): 239 | """ plots a binary heatmap with each entry representing whether 240 | that position in the A matrix has a value greater than (black) 241 | or lesser than (white) the specified threshold * the standard 242 | deviation for that element 243 | 244 | Args: 245 | object (CogapsResult): A CogapsResult object 246 | threshold (float): threshold to compare A/Asd 247 | nrows (str, optional): how many rows should be plotted (for very long 248 | and skinny feature matrices). Defaults to "all". 249 | cluster (bool, optional): True or False, whether rows should be clustered 250 | (results in huge black and white blocks). Defaults to False. 251 | 252 | Returns: 253 | fig: a matplotlib plot object 254 | """ 255 | 256 | # object = object["anndata"] 257 | binA = calcZ(object, whichMatrix="featureLoadings") 258 | if nrows != "all": 259 | binA = binA[1:nrows, :] 260 | overthresh = binA > threshold 261 | underthresh = binA < threshold 262 | binA[overthresh] = 1 263 | binA[underthresh] = 0 264 | if cluster: 265 | hm = sns.clustermap(binA, cbar_pos=None) 266 | else: 267 | hm = sns.heatmap(binA, cbar=False) 268 | plt.title('Binary Heatmap') 269 | plt.savefig("{}_binaryA.png".format(fn)) 270 | plt.show() 271 | return hm 272 | 273 | 274 | def plotResiduals(object, uncertainty=None, legend=False, groups=None, ids=None, fn=""): 275 | """ Generate a residual plot 276 | 277 | Args: 278 | object (CogapsResult): A CogapsResult object 279 | uncertainty (arr, optional): original SD matrix with which GAPS was run. Defaults to None. 280 | legend (bool, optional): Add legend to plot. Defaults to False. 
281 | groups (list, optional): group genes for plotting. Defaults to None. 282 | ids (list, optional): [description]. Defaults to None. 283 | 284 | Returns: 285 | fig: matplotlib figure 286 | """ 287 | 288 | # object = object["anndata"] 289 | # if groups is not None: 290 | # 291 | rawdata = object.X 292 | if uncertainty is None: 293 | uncertainty = np.where(rawdata * 0.1 > 0.1, rawdata * 0.1, 0.1) 294 | uncertainty = np.array(uncertainty) 295 | 296 | markerlabels = object.obs_names 297 | samplelabels = object.var_names 298 | M = reconstructGene(object) 299 | residual = (rawdata - M) / uncertainty 300 | residual = pd.DataFrame(residual, columns=samplelabels, index=markerlabels) 301 | hm = sns.heatmap(residual, cmap="Spectral", cbar=legend) 302 | plt.title('Residuals Plot') 303 | plt.savefig("{}_residualsPlot.png".format(fn)) 304 | plt.show() 305 | return hm 306 | 307 | 308 | def unitVector(n, length): 309 | """ Return unit vector of length with value 1 at pos n 310 | 311 | Args: 312 | n (int): pos of value 1 313 | length (int): length of unit vector 314 | 315 | Returns: 316 | arr: returns numpy array 317 | """ 318 | vec = np.repeat(0, length) 319 | vec[n] = 1 320 | return vec 321 | 322 | 323 | def patternMarkers(adata, threshold='all', lp=None, axis=1): 324 | """ calculate the most associated pattern for each gene 325 | 326 | Args: 327 | adata (anndata): anndata result object 328 | threshold (str, optional): the type of threshold to be used. The default "all" will 329 | distribute genes into pattern with the lowest ranking. The "cut" thresholds 330 | by the first gene to have a lower ranking, i.e. better fit to, a pattern.. Defaults to 'all'. 331 | lp (arr, optional): a vector of weights for each pattern to be used for finding 332 | markers. If NA markers for each pattern of the A matrix will be used.. Defaults to None. 
333 | axis (int, optional): either 0 or 1, specifying if pattern markers should be calculated using 334 | the rows of the data (1) or the columns of the data (2). Defaults to 1. 335 | 336 | Raises: 337 | Exception: If threshold is not 'cut' or 'all' 338 | Exception: If lp length is not equal to number of patterns 339 | Exception: If axis is not either 0 or 1 340 | 341 | Returns: 342 | dict: A dictionary of PatternMarkers, PatternMarkerRanks, PatternMarkerScores 343 | """ 344 | if threshold.lower() not in ["cut", "all"]: 345 | raise Exception("threshold must be either 'cut' or 'all'") 346 | if lp is not None and (np.size(lp) != adata.obs.shape[1]): 347 | raise Exception("lp length must equal the number of patterns") 348 | if axis not in [1, 2]: 349 | raise Exception("axis must be either 0 or 1") 350 | 351 | patterns = list() 352 | obs_columns = list(adata.obs.columns) 353 | for column in obs_columns: 354 | if (column.startswith("Pattern")): 355 | patterns.append(column) 356 | 357 | if axis == 1: 358 | resultMatrix = adata.obs[patterns] 359 | otherMatrix = adata.var[patterns] 360 | else: 361 | resultMatrix = adata.var[patterns] 362 | otherMatrix = adata.obs[patterns] 363 | 364 | # Replacing infinite with 0 365 | resultMatrix.replace([np.inf, -np.inf, np.nan], 0, inplace=True) 366 | resultMatrix.replace([np.inf, -np.inf, np.nan], 0, inplace=True) 367 | row_max = np.nanmax(resultMatrix.values, axis=1, keepdims=True) 368 | row_max = np.where(row_max == 0, 1, row_max) 369 | 370 | normedMatrix = resultMatrix / row_max 371 | 372 | if lp is not None: 373 | markerScores = pd.DataFrame(np.sqrt(np.sum((normedMatrix.values - lp) ** 2, axis=1)), index=normedMatrix.index) 374 | markersByPattern = markerScores.sort_values(0).index.values 375 | dict = {"PatternMarkers": markersByPattern, "PatternMarkerRanks": np.argsort(markerScores, axis=0), 376 | "PatternMarkerScores": markerScores} 377 | return dict 378 | 379 | markerScores_arr = np.empty_like(normedMatrix) 380 | for i in 
range(normedMatrix.shape[1]): 381 | lp = unitVector(i, normedMatrix.shape[1]) 382 | markerScores_arr[:, i] = np.sqrt(np.sum((normedMatrix.values - lp) ** 2, axis=1)) 383 | 384 | markerScores = pd.DataFrame(markerScores_arr, index=normedMatrix.index, columns=normedMatrix.columns) 385 | 386 | markerRanks = pd.DataFrame(np.argsort(markerScores.values, axis=0), index=markerScores.index, 387 | columns=markerScores.columns) 388 | 389 | rankCutoff = np.empty(markerRanks.shape[1]) 390 | markersByPattern = {} 391 | if threshold == "cut": 392 | def simplicityGENES(As, Ps): 393 | # Using MinMaxScaler 394 | As.replace([np.inf, -np.inf, np.nan], 0, inplace=True) 395 | Ps.replace([np.inf, -np.inf, np.nan], 0, inplace=True) 396 | from sklearn.preprocessing import MinMaxScaler 397 | scaler = MinMaxScaler(feature_range=(0,1)) 398 | # Stack everything into a single column to scale by the global min / max 399 | tmp = Ps.to_numpy().reshape(-1, 1) 400 | scaled2 = scaler.fit_transform(tmp).reshape(Ps.shape) 401 | pscale = pd.DataFrame(scaled2, index=Ps.index, columns=Ps.columns) 402 | # TODO: figure out how to translate this: As < - sweep(As, 2, pscale, FUN="*") 403 | # pmax is the most significant pattern for each gene 404 | # Arowmax is the A matrix with every element divided by the max in the row 405 | pmax = np.nanmax(As.values, axis=1, keepdims=True) 406 | Arowmax = As / pmax 407 | 408 | ssl = pd.DataFrame().reindex_like(As) 409 | import math 410 | for i in np.arange(As.shape[1]): 411 | lp = np.repeat(0, As.shape[1]) 412 | lp[i] = 1 413 | def stat(x): 414 | return (math.sqrt(np.matmul(np.transpose(x-lp), (x-lp)))) 415 | 416 | ssl.stat = Arowmax.apply(func=stat, axis=1) 417 | order = np.argsort(ssl.stat) 418 | ssl[patterns[i]] = order.values 419 | 420 | return ssl[ssl >= 0] 421 | 422 | simGenes = simplicityGENES(As=resultMatrix, Ps=otherMatrix) 423 | nP = simGenes.shape[1] 424 | 425 | 426 | for i in np.arange(nP): 427 | # order gene names by significance for this pattern 428 | pname 
= patterns[i] 429 | sortSim = simGenes[pname].sort_values().index 430 | sortedGenes = simGenes.loc[sortSim, :] 431 | globalmins = sortedGenes.min(axis=1) 432 | thispattern = simGenes.loc[sortSim, pname] 433 | 434 | geneThresh = int(thispattern[thispattern > globalmins].min()) 435 | 436 | markerGenes = sortSim[1:geneThresh] 437 | markersByPattern[pname] = list(markerGenes.values) 438 | 439 | elif threshold == "all": 440 | patternsByMarker = markerScores.columns[np.argmin(markerScores.values, axis=1)] 441 | for i in range(markerScores.shape[1]): 442 | markersByPattern['Pattern' + str(i + 1)] = markerScores[ 443 | markerScores.columns[i] == patternsByMarker].index.values 444 | 445 | dict = {"PatternMarkers": (markersByPattern), "PatternMarkerRanks": np.argsort(markerScores, axis=0), 446 | "PatternMarkerScores": markerScores} 447 | return dict 448 | 449 | 450 | def calcCoGAPSStat(object, sets, whichMatrix='featureLoadings', numPerm=1000): 451 | """ calculates a statistic to determine if a pattern is enriched in a 452 | a particular set of measurements or samples. 453 | 454 | Args: 455 | object (CogapsResult): a CogapsResult object 456 | sets (list): list of sets of measurements/samples 457 | whichMatrix (str, optional): either "featureLoadings" or "sampleFactors" indicating which matrix 458 | to calculate the statistics. Defaults to 'featureLoadings'. 459 | numPerm (int, optional): number of permutations to use when calculating p-value. Defaults to 1000. 
def calcGeneGSStat(object, GStoGenes, numPerm, Pw=None, nullGenes=False):
    """ Scores how closely each gene follows the pattern activity of a gene set.

    The per-pattern up-regulation p-values that calcCoGAPSStat reports for
    GStoGenes are converted into pattern weights (-log(p), optionally
    multiplied by Pw). Each gene's loadings are divided by their standard
    deviations (object.uns['asd']); the statistic of a gene is the
    weight-averaged share of its total scaled loading.

    Args:
        object (CogapsResult): a CogapsResult object; obs holds the feature
            loadings and uns['asd'] their standard deviations
        GStoGenes (list): list with gene sets
        numPerm (int): number of permutations for null
        Pw (arr, optional): weight on each pattern. Defaults to None, which
            means a weight of one per pattern. An all-NaN array disables the
            weighting step.
        nullGenes (bool, optional): if True, score every gene that is NOT in
            GStoGenes instead of the genes in the set. Defaults to False.

    Raises:
        Exception: If Pw does not have one entry per pattern

    Returns:
        dataframe: gene similarity statistic, one row per scored gene. The
        integer 0 is returned instead when the pattern weights sum to less
        than 1e-6 (no pattern carries any signal for this gene set).
    """
    featureLoadings = object.obs
    patternLabels = featureLoadings.columns

    if Pw is None:
        Pw = np.ones(featureLoadings.shape[1])

    # pattern weights: small up-regulation p-values give large weights
    gsStat = calcCoGAPSStat(object, GStoGenes, numPerm=numPerm)
    gsStat = -np.log(gsStat['GSUpreg'].values.T)

    if not np.isnan(Pw).all():
        if np.size(Pw) != gsStat.shape[1]:
            raise Exception('Invalid weighting')
        gsStat = gsStat * Pw

    # Work on a float copy so the caller's uns['asd'] is left untouched.
    # Zeros are replaced by 1e-6 (the previous `1 ** -6` evaluated to 1.0).
    stddev = np.array(object.uns['asd'], dtype=float)
    if np.any(stddev == 0):
        print("zeros detected in the standard deviation matrix; they have been replaced by small values")
        stddev[stddev == 0] = 1e-6
    stddev = pd.DataFrame(stddev, index=object.obs_names, columns=patternLabels)

    featureLoadings = pd.DataFrame(featureLoadings, index=object.obs_names, columns=patternLabels)

    if nullGenes:
        genes = featureLoadings.index.difference(GStoGenes)
    else:
        genes = GStoGenes

    ZD = featureLoadings.loc[genes, :].values / stddev.loc[genes, :].values

    weightTotal = np.sum(gsStat)
    if weightTotal < 1e-6:
        return 0

    rowTotals = np.sum(ZD, axis=1)
    # rows whose total is (near) zero are overwritten with 0 right below, so
    # the divide-by-zero warnings they would raise carry no information
    with np.errstate(divide='ignore', invalid='ignore'):
        outStats = np.sum(np.multiply(ZD, gsStat), axis=1) / weightTotal
        outStats = outStats / rowTotals
    outStats[rowTotals < 1e-6] = 0

    return pd.DataFrame(outStats, index=genes)


def computeGeneGSProb(object, GStoGenes, numPerm=500, Pw=None, PwNull=False):
    """ Computes the p-value for gene set membership using the CoGAPS-based
    statistics developed in Fertig et al. (2012). This statistic refines set
    membership for each candidate gene in the set given by GStoGenes by
    comparing the inferred activity of that gene to the average activity of
    the set.

    The p-value of a gene is the fraction of genes outside the set whose
    calcGeneGSStat statistic is strictly larger than the gene's own.

    Args:
        object (CogapsResult): a CogapsResult object
        GStoGenes (list): list with gene sets
        numPerm (int, optional): number of permutations for null. Defaults to 500.
        Pw (arr, optional): weight on each pattern. Defaults to None (a weight
            of one per pattern).
        PwNull (bool, optional): if True, Pw is also applied when scoring the
            genes outside the set; otherwise those genes are scored with
            unit weights. Defaults to False.

    Raises:
        ValueError: If calcGeneGSStat returns its degenerate scalar result,
            i.e. the gene set carries no pattern signal to compare against

    Returns:
        dataframe: one row per gene in GStoGenes holding the p-value of set
        membership for that gene.
    """
    degenerateMessage = ("gene set statistic is degenerate (pattern weights sum to ~0); "
                         "membership p-values cannot be computed")

    if Pw is None:
        Pw = np.ones(object.obs.shape[1])

    geneGSStat = calcGeneGSStat(object, Pw=Pw, GStoGenes=GStoGenes, numPerm=numPerm)
    if not isinstance(geneGSStat, pd.DataFrame):
        raise ValueError(degenerateMessage)

    # Pw=None makes calcGeneGSStat fall back to unit weights
    nullWeights = Pw if PwNull else None
    permGSStat = calcGeneGSStat(object, GStoGenes=GStoGenes, numPerm=numPerm, Pw=nullWeights, nullGenes=True)
    if not isinstance(permGSStat, pd.DataFrame):
        raise ValueError(degenerateMessage)

    geneStats = geneGSStat.values.ravel()
    nullStats = permGSStat.values.ravel()

    # Count matches directly. The previous np.size(np.argwhere(mask)) on a
    # 2-D (n, 1) mask counted two coordinates per match and doubled every
    # p-value.
    finalStats = np.array([np.count_nonzero(nullStats > stat) / nullStats.size for stat in geneStats])

    return pd.DataFrame(finalStats, index=GStoGenes)


def _patternMarkerRows(data, markersByPattern, patternColors, maxPerPattern=None):
    """ Collects the marker rows to plot, grouped pattern by pattern.

    Returns three parallel lists: row positions in data, marker names and
    the color of the pattern each marker belongs to. Markers that are not in
    data.obs_names (including NaN padding) are skipped.
    """
    firstPosition = {}
    for position, name in enumerate(data.obs_names):
        firstPosition.setdefault(name, position)

    positions, names, colors = [], [], []
    for color, patternKey in zip(patternColors, markersByPattern):
        candidates = list(markersByPattern[patternKey])
        if maxPerPattern is not None:
            candidates = candidates[:maxPerPattern]
        for name in candidates:
            if name in firstPosition:
                positions.append(firstPosition[name])
                names.append(name)
                colors.append(color)
    return positions, names, colors


def _groupMeans(values, groups):
    """ Averages the columns of values within each group.

    Returns the (rows x groups) matrix of means and the group labels in
    order of first appearance.
    """
    labels = np.asarray(groups)
    uniqueGroups = list(dict.fromkeys(labels.tolist()))
    means = [values[:, labels == group].mean(axis=1) for group in uniqueGroups]
    return np.column_stack(means), uniqueGroups


def plotPatternMarkers(data, patternmarkers=None, groups = None, patternPalette=None,
                       samplePalette=None, colorscheme="coolwarm",
                       colDendrogram=True, rowDendrogram=False, scale="row", legend_pos=None, fn=""):
    """ Plots pattern markers of most associated pattern for each gene.

    Rows are the marker genes, ordered pattern by pattern so that the row
    color strip lines up with the rows. Columns are the samples or, when
    groups is given, one column per group holding the group mean.

    Args:
        data (anndata): an anndata object, which should be your original data annotated with CoGAPS results
        patternmarkers (dict, optional): result of the "patternMarkers(data)" function; its
            "PatternMarkers" entry maps each pattern to its markers. Computed from data if None. Defaults to None.
        groups (list, optional): one group label per sample. When given, samples are averaged
            within each group and at most 15 markers per pattern are shown. Defaults to None.
        patternPalette (list, optional): a list of colors to be used for each pattern.
            if None, colors will be set automatically. Defaults to None.
        samplePalette (list, optional): a list of colors to be used for each column
            (each sample, or each group when groups is given).
            if None, colors will be set automatically. Defaults to None.
        colorscheme (str, optional): string indicating which color scheme should be used within the heatmap.
            more options at https://seaborn.pydata.org/tutorial/color_palettes.html. Defaults to "coolwarm".
        colDendrogram (bool, optional): Whether or not to draw a column dendrogram. Defaults to True.
        rowDendrogram (bool, optional): Whether or not to draw a row dendrogram. Defaults to False.
        scale (str, optional): whether you want data to be scaled by "row", "column", or "none". Defaults to "row".
        legend_pos (tuple, optional): passed to seaborn.clustermap as cbar_pos, i.e. the position
            of the color bar; None draws no color bar. Defaults to None.
        fn (str, optional): prefix of the saved figure, written to "<fn>_patternMarkers.png". Defaults to "".

    Returns:
        fig: a clustergrid instance, or None if the length of groups does not match the number of samples
    """
    if patternmarkers is None:
        patternmarkers = patternMarkers(data)
    markersByPattern = patternmarkers["PatternMarkers"]
    patternKeys = list(markersByPattern.keys())

    if groups is not None and len(groups) != len(data.var_names):
        warnings.warn("length of groups does not match number of samples, aborting...")
        return

    # one color per pattern, later expanded to one color per marker row
    if patternPalette is None:
        patternColors = [mpl.colors.to_hex(color) for color in sns.color_palette("Spectral", len(patternKeys))]
    else:
        patternColors = [patternPalette[i] for i in range(len(patternKeys))]

    # NOTE(review): the grouped view used to slice markers with [1:15], which
    # dropped the first marker of every pattern -- presumably a 1-based range
    # carried over from R. The first 15 markers are used here; confirm.
    maxPerPattern = 15 if groups is not None else None
    positions, markerlabels, rowColors = _patternMarkerRows(data, markersByPattern, patternColors, maxPerPattern)

    plotdata = data[np.array(positions, dtype=int)].X
    if hasattr(plotdata, "toarray"):
        # sparse matrices must be densified before they go into a DataFrame
        plotdata = plotdata.toarray()
    plotdata = np.asarray(plotdata)

    if groups is None:
        samplelabels = data.var_names
        if samplePalette is None:
            # color for each sample
            samplePalette = sns.color_palette("Spectral", np.shape(data)[1])
    else:
        # data.var_names is deliberately left unchanged; groups are matched by position
        plotdata, samplelabels = _groupMeans(plotdata, groups)
        if samplePalette is None:
            # color for each group
            samplePalette = [mpl.colors.to_hex(color) for color in sns.color_palette("Spectral", len(samplelabels))]

    if scale not in ["row", "column", "none"]:
        warnings.warn("warning: scale must be one of \"row\", \"column\", or \"none\". data will not be scaled in "
                      "this plot")
    if scale == "row":
        # zscore standardizes columns, so transpose in and out to standardize each marker
        samplesByMarkers = np.transpose(pd.DataFrame(plotdata))
        plotdata_z = pd.DataFrame(np.transpose(zscore(samplesByMarkers)))
    elif scale == "column":
        plotdata_z = pd.DataFrame(zscore(pd.DataFrame(plotdata)))
    else:
        plotdata_z = pd.DataFrame(plotdata)
    plotdata_z.columns = samplelabels
    plotdata_z.index = markerlabels
    # constant rows/columns produce NaN or inf z-scores; draw them as 0
    plotdata_z.replace([np.inf, -np.inf, np.nan], 0, inplace=True)

    hm = sns.clustermap(plotdata_z, cmap=colorscheme, row_cluster=rowDendrogram, col_cluster=colDendrogram,
                        row_colors=rowColors, col_colors=samplePalette, cbar_pos=legend_pos)
    plt.title('Pattern Markers Plot')
    plt.savefig("{}_patternMarkers.png".format(fn))
    plt.show()
    return hm


def plotPatternUMAP(result, genes_in_rows=True, fn=None):
    """ Create a UMAP plot colored by each CoGAPS pattern

    The data are log-transformed, restricted to highly variable genes,
    scaled and embedded with scanpy. All of this happens on a copy, so the
    object passed in is not modified.

    Args:
        result (anndata or CogapsResult): An anndata object of result or CogapsResult object
        genes_in_rows (bool, optional): Scanpy needs genes in columns, cells in rows; set to True
            if result has genes in rows so that it is transposed first. Defaults to True.
        fn (str, optional): if given, the figure is also saved by scanpy with the
            suffix "<fn>_UMAP.png". Defaults to None.
    """
    import scanpy as sc

    if genes_in_rows:
        # scanpy needs genes in columns, cells in rows
        result = result.transpose()
    # sc.pp.log1p and sc.pp.scale work in place; keep the caller's data intact
    result = result.copy()

    # the pattern weights are the obs columns named "Pattern..."
    patterns = [column for column in result.obs.columns if str(column).startswith("Pattern")]

    # set up environment
    sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
    sc.logging.print_header()
    sc.settings.set_figure_params(dpi=80, facecolor='white')

    sc.pp.log1p(result)
    sc.pp.highly_variable_genes(result)
    # copy the subset: scaling a view would trigger an implicit copy with a warning
    result = result[:, result.var.highly_variable].copy()
    sc.pp.scale(result, max_value=10)
    sc.tl.pca(result, svd_solver='arpack')
    sc.pp.neighbors(result)
    sc.tl.umap(result)

    # draw once; scanpy saves the same figure when a file name is supplied
    saveName = None if fn is None else "{}_UMAP.png".format(fn)
    sc.pl.umap(result, color=patterns, save=saveName)


if __name__ == '__main__':
    import sys
    import os
    import matplotlib as mpl
    mpl.use('tkagg')

    # path to your result file, from command line
    if len(sys.argv) < 2:
        sys.exit("usage: {} <path to result .h5ad file>".format(sys.argv[0]))
    path = sys.argv[1]

    # this reads the saved result (an h5ad file) for use
    result = anndata.read_h5ad(path)

    # get filename, to name the saved plots
    filename = os.path.basename(path).split('.')[0]

    # call some of the plotting functions and save
    plot(result, fn=filename)
    plotPatternMarkers(result, fn=filename)
    plotPatternUMAP(result, fn=filename)