├── .gitignore
├── .idea
└── other.xml
├── .travis.yml
├── LICENSE
├── R
├── drg_datasets.R
├── seurat_cca_hockley_usoskin.R
├── simulation.R
└── test_simulation.R
├── README.md
├── __init__.py
├── _config.yml
├── bin
├── scRNA-generate-data.sh
├── scRNA-source.sh
└── scRNA-target.sh
├── doc
├── screen_install_generate.png
├── screen_install_pip.png
├── screen_install_result.png
├── screen_install_scripts.png
├── screen_install_source.png
└── screen_install_target.png
├── matlab
└── magic_hockley_usoskin.m
├── notebooks
└── example.ipynb
├── requirements.txt
├── scRNA
├── __init__.py
├── abstract_clustering.py
├── cmd_generate_data.py
├── cmd_source.py
├── cmd_target.py
├── nmf_clustering.py
├── sc3_clustering.py
├── sc3_clustering_impl.py
├── simulation.py
└── utils.py
├── scripts
├── experiments
│ ├── main_wrapper_generated_data.py
│ ├── main_wrapper_hockley.py
│ ├── main_wrapper_hockley_NMF_labels.py
│ ├── main_wrapper_hockley_NMF_labels_robustness.py
│ ├── main_wrapper_hockley_robustness.py
│ ├── main_wrapper_hockley_robustness_magic.py
│ ├── main_wrapper_hockley_robustness_seurat.py
│ ├── main_wrapper_tasic.py
│ └── main_wrapper_tasic_NMF_labels.py
└── plots
│ ├── evaluate_hockley_robustness.py
│ ├── evaluate_hockley_robustness_magic_seurat.py
│ ├── main_plots_generated_data.py
│ ├── main_plots_hockley.py
│ ├── main_plots_hockley_magic.py
│ ├── main_plots_hockley_robustness.py
│ ├── main_plots_hockley_seurat.py
│ └── main_plots_tasic.py
├── setup.py
└── tests
└── test_transfer.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | .idea/
3 |
--------------------------------------------------------------------------------
/.idea/other.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 | - "3.6"
4 | install:
5 | # - sudo apt-get update
6 | # We do this conditionally because it saves us some downloading if the
7 | # version is the same.
8 | - if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then
9 | wget https://repo.continuum.io/miniconda/Miniconda2-latest-Linux-x86_64.sh -O miniconda.sh;
10 | else
11 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh;
12 | fi
13 | - bash miniconda.sh -b -p $HOME/miniconda
14 | - export PATH="$HOME/miniconda/bin:$PATH"
15 | - hash -r
16 | - conda config --set always_yes yes --set changeps1 no
17 | - conda update -q conda
18 | # Useful for debugging any issues with conda
19 | - conda info -a
20 |
21 | # Replace dep1 dep2 ... with your dependencies
22 | - conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION --file requirements.txt
23 | - source activate test-environment
24 | - python setup.py install
25 | script:
26 | - scRNA-generate-data.sh --target_ncells 100 --source_ncells 1000 --cluster_spec "[1, 2, 3, [4, 5], [6, [7, 8]]]"
27 | --num_genes 1000 --splitting_mode 4
28 | - ls
29 | - scRNA-source.sh --fname fout_source_data_T1_100_S1_1000.tsv --fgene-ids fout_geneids.tsv
30 | --fout src --cluster-range 8 --flabels fout_source_labels_T1_100_S1_1000.tsv
31 | --no-tsne --no-cell-filter --no-gene-filter --no-transform
32 | - scRNA-target.sh --src-fname src_c8.npz --fname fout_source_data_T1_100_S1_1000.tsv
33 | --fgene-ids fout_geneids.tsv --flabels-trg fout_target_labels_T1_100_S1_1000.tsv
34 | --no-cell-filter --no-gene-filter --cluster-range 8
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2015 Nico Goernitz, TU Berlin
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
23 |
--------------------------------------------------------------------------------
/R/drg_datasets.R:
--------------------------------------------------------------------------------
1 | library(readr)
2 | library(dplyr)
3 | library(ggplot2)
4 | library(preprocessCore)
5 |
# Load an expression matrix stored as three TSV files: the numeric matrix
# itself, its row ids (genes) and its column ids (cells/samples).
#
# Bug fix: read_tsv() returns a tibble, and setting rownames() on a tibble
# is deprecated and unreliable, so the matrix returned previously could lose
# its gene ids. Coerce to a plain matrix first, then attach dimnames.
# Also renamed the local `c` which shadowed base::c().
#
# Returns a numeric matrix with genes as rownames and cells as colnames.
load.matrix <- function(matrix, rowids, colids) {
  m <- as.matrix(read_tsv(matrix, col_names = FALSE))

  row.ids <- scan(rowids, what = "character")
  col.ids <- scan(colids, what = "character")

  # Fail loudly if the id files do not match the matrix dimensions
  # (assumes one id per line -- TODO confirm file format).
  stopifnot(nrow(m) == length(row.ids), ncol(m) == length(col.ids))

  rownames(m) <- row.ids
  colnames(m) <- col.ids

  m
}
18 |
# Assemble the five DRG expression matrices, restrict them to the genes they
# all share, quantile-normalise the merged matrix, and visualise the cells
# with classical MDS coloured by dataset of origin.

matrices <- list(
  load.matrix(
    "scRNASeq_TU_Berlin_Pfizer-selected/Human_PFE_iPSC_Neurons/fpkm_log2_matrix.tsv",
    "scRNASeq_TU_Berlin_Pfizer-selected/Human_PFE_iPSC_Neurons/fpkm_rows.tsv",
    "scRNASeq_TU_Berlin_Pfizer-selected/Human_PFE_iPSC_Neurons/fpkm_cols.tsv"
  ),
  load.matrix(
    "scRNASeq_TU_Berlin_Pfizer-selected/Mouse_JH_DRG_Neurons/fpkm_log2_matrix.tsv",
    "scRNASeq_TU_Berlin_Pfizer-selected/Mouse_JH_DRG_Neurons/fpkm_rows.tsv",
    "scRNASeq_TU_Berlin_Pfizer-selected/Mouse_JH_DRG_Neurons/fpkm_cols.tsv"
  ),
  load.matrix(
    "scRNASeq_TU_Berlin_Pfizer-selected/Mouse_Usoskin_DRG_Neurons/usoskin_log2_matrix.tsv",
    "scRNASeq_TU_Berlin_Pfizer-selected/Mouse_Usoskin_DRG_Neurons/usoskin_rows.tsv",
    "scRNASeq_TU_Berlin_Pfizer-selected/Mouse_Usoskin_DRG_Neurons/usoskin_cols.tsv"
  ),
  load.matrix(
    "scRNASeq_TU_Berlin_Pfizer-selected/Rat_PFE_DRG_Neurons/vst_log2_matrix.tsv",
    "scRNASeq_TU_Berlin_Pfizer-selected/Rat_PFE_DRG_Neurons/vst_rows.tsv",
    "scRNASeq_TU_Berlin_Pfizer-selected/Rat_PFE_DRG_Neurons/vst_cols.tsv"
  ),
  load.matrix(
    "scRNASeq_TU_Berlin_Pfizer-selected/Mouse_Li_DRG_Neurons/fpkm_log2_matrix.tsv",
    "scRNASeq_TU_Berlin_Pfizer-selected/Mouse_Li_DRG_Neurons/fpkm_rows.tsv",
    "scRNASeq_TU_Berlin_Pfizer-selected/Mouse_Li_DRG_Neurons/fpkm_cols.tsv"
  )
)

# Genes present in every dataset; subset each matrix to that common set.
shared.genes <- Reduce(intersect, lapply(matrices, rownames))
trimmed <- lapply(matrices, function(m) m[shared.genes, ])

merged <- do.call(cbind, trimmed)
normalised <- normalize.quantiles(merged)

# Classical MDS (k = 2) on euclidean distances between cells (columns).
mds.fit <- cmdscale(dist(t(normalised)), eig = TRUE, k = 2)

plot.df <- as.data.frame(mds.fit$points)

# One label per cell, in the same order the matrices were column-bound.
dataset.names <- c("iPSC", "MouseJH", "MouseUsoskin", "RatPFE", "MouseLi")
plot.df$Dataset <- factor(
  rep(dataset.names, vapply(matrices, ncol, integer(1)))
)

ggplot(plot.df, aes(x = V1, y = V2, color = Dataset)) + geom_point()
--------------------------------------------------------------------------------
/R/seurat_cca_hockley_usoskin.R:
--------------------------------------------------------------------------------
# Seurat's CCA-based integration of the Hockley ("Jim") and Usoskin datasets.
# Reads both expression matrices, runs Seurat's anchor-based integration, and
# writes the integrated matrices back out as TSV for downstream scRNA scripts.
# NOTE(review): absolute Windows paths make this script machine-specific.

# Reading Jim's (Hockley) data: genes as rows, cells as columns.
data_target = read.table("C:/Users/Bettina/ml/scRNAseq/Data/Jims data/Visceraltpm_m_fltd_mat.tsv");
# Reverse the log2 transform for now (input is log2(tpm + 1)
# -- TODO confirm against the file's provenance).
data_target = (2^data_target)-1

cell_names_target = read.table("C:/Users/Bettina/ml/scRNAseq/Data/Jims data/Visceraltpm_m_fltd_col.tsv")
transcript_names_target = t(read.table("C:/Users/Bettina/ml/scRNAseq/Data/Jims data/Visceraltpm_m_fltd_row.tsv"))
colnames(data_target)=t(cell_names_target)
rownames(data_target)=transcript_names_target

# Reading Usoskin data (not de-logged -- NOTE(review): asymmetric with the
# target treatment above; confirm this is intended).
data_source = read.table("C:/Users/Bettina/ml/scRNAseq/Data/Usoskin data/usoskin_m_fltd_mat.tsv");
cell_names_source = read.table("C:/Users/Bettina/ml/scRNAseq/Data/Usoskin data/usoskin_m_fltd_col.tsv")
transcript_names_source = t(read.table("C:/Users/Bettina/ml/scRNAseq/Data/Usoskin data/usoskin_m_fltd_row.tsv"))
colnames(data_source)=t(cell_names_source)
rownames(data_source)=transcript_names_source

# Perform Seurat's CCA integration.

library(Seurat)

# Set up the target object; min.cells/min.features filter genes and cells.
target <- CreateSeuratObject(counts = data_target, project = "Jim",assay = "RNA", min.cells = 18, min.features = 2000)
target$set <- "target"
# target <- subset(target, subset = nFeature_RNA > 500)
target <- NormalizeData(target, normalization.method = "LogNormalize")
target <- FindVariableFeatures(target, selection.method = "vst", nfeatures = 5000)

# Set up the source (Usoskin) object.
# NOTE(review): the variable name `source` shadows base::source(); calling
# source() later in this session would fail. Consider renaming.
source <- CreateSeuratObject(counts = data_source, project = "Usoskin",assay = "RNA", min.cells = 37, min.features = 2000)
source$set <- "source"
# source <- subset(source , subset = nFeature_RNA > 500)
source <- NormalizeData(source, normalization.method = "LogNormalize")
source <- FindVariableFeatures(source , selection.method = "vst", nfeatures = 5000)

# Anchor-based integration across the two datasets (Seurat v3 workflow).
immune.anchors <- FindIntegrationAnchors(object.list = list(target, source), dims = 1:20, anchor.features = 5000)
immune.combined <- IntegrateData(anchorset = immune.anchors, dims = 1:20)

# MetageneBicorPlot(immune.combined, grouping.var = "set", dims.eval = 1:20)

# Extract the combined and the two individual datasets.
# exp() undoes the natural-log LogNormalize transform
# -- NOTE(review): confirm integrated values are on that scale.
data_comb = immune.combined$integrated
data_target_new = data_comb[,immune.combined$set=="target"]
data_target_new = exp(data_target_new)
data_source_new = data_comb[,immune.combined$set=="source"]
data_source_new = exp(data_source_new)

# Save data: matrices without headers, plus separate cell/gene name files.
write.table(data_target_new, "C:/Users/Bettina/ml/scRNAseq/Data/Jims data/Jim_after_Seurat.tsv", quote=FALSE, sep='\t', col.names=FALSE,row.names=FALSE)
write.table(colnames(data_target_new), "C:/Users/Bettina/ml/scRNAseq/Data/Jims data/Jim_cell_names_after_Seurat.tsv", quote=FALSE, sep='\t', col.names=FALSE,row.names=FALSE)
write.table(rownames(data_target_new), "C:/Users/Bettina/ml/scRNAseq/Data/Jims data/Jim_gene_names_after_Seurat.tsv", quote=FALSE, sep='\t', col.names=FALSE,row.names=FALSE)


write.table(data_source_new, "C:/Users/Bettina/ml/scRNAseq/Data/Usoskin data/Usoskin_after_Seurat.tsv", quote=FALSE, sep='\t', col.names=FALSE,row.names=FALSE)
write.table(colnames(data_source_new), "C:/Users/Bettina/ml/scRNAseq/Data/Usoskin data/Usoskin_cell_names_after_Seurat.tsv", quote=FALSE, sep='\t', col.names=FALSE,row.names=FALSE)
write.table(rownames(data_source_new), "C:/Users/Bettina/ml/scRNAseq/Data/Usoskin data/Usoskin_gene_names_after_Seurat.tsv", quote=FALSE, sep='\t', col.names=FALSE,row.names=FALSE)

--------------------------------------------------------------------------------
/R/simulation.R:
--------------------------------------------------------------------------------
# Simulate a scRNA-Seq count matrix with three clusters of cells, each with
# its own window of differentially expressed (DE) genes, then visualise
# cell-cell correlations as an annotated heatmap.

set.seed(100)

n.genes <- 1e4
n.cells <- 1e3
gene.length <- 1e4

# We should try to base these values off real datasets as much as possible.
prop.cells.per.cluster <- c(0.25, 0.6, 0.15)  # must sum to 1
prop.de.per.pop <- c(0.1, 0.3, 0.25)          # must sum to <= 1
de.logfc <- c(2, 1, 2)                        # these are *log2* fold changes

nde <- prop.de.per.pop * n.genes
pop.sizes <- prop.cells.per.cluster * n.cells

gamma.shape <- 2
gamma.rate <- 2
nb.dispersion <- 0.1

# Baseline (cluster-independent) mean expression per gene.
population.means <- rgamma(n.genes, gamma.shape, gamma.rate)

counts <- list()
cluster.means <- list()

# Per-cluster DE gene windows.
# Bug fix: the original `c(1, cumsum(nde))[x]:cumsum(nde)[x]` made
# consecutive windows share their boundary gene (e.g. gene 1000 was DE in
# clusters 1 AND 2) and made every window after the first one gene too wide.
# An exclusive lower bound gives disjoint windows of exactly nde[x] genes.
de.start <- c(0, cumsum(nde))[seq_along(pop.sizes)] + 1
de.end <- cumsum(nde)

for (x in seq_along(pop.sizes)) {

  # This simulates per-cell differences in sequencing efficiency / capture.
  all.facs <- 2^rnorm(pop.sizes[x], mean = 0, sd = 0.5)
  effective.means <- outer(population.means, all.facs, "*")

  # This simulates DE in the proportion of genes specified for this cluster.
  chosen <- de.start[x]:de.end[x]
  up.down <- sign(rnorm(length(chosen)))

  # This captures the 'true' means for this cluster.
  # Bug fix: the original replaced the whole vector with only the chosen
  # subset (`ideal.means = ideal.means[chosen] * ...`), losing the means of
  # all non-DE genes; apply the fold change to the subset in place instead.
  ideal.means <- population.means
  ideal.means[chosen] <- ideal.means[chosen] * 2^(de.logfc[x] * up.down)
  cluster.means[[x]] <- ideal.means

  # This simulates the effective counts for this cluster (negative binomial
  # around the per-cell effective means).
  effective.means[chosen, ] <- effective.means[chosen, ] * 2^(de.logfc[x] * up.down)
  counts[[x]] <- matrix(
    rnbinom(n.genes * pop.sizes[x], mu = effective.means, size = 1/nb.dispersion),
    ncol = pop.sizes[x]
  )
}

counts <- do.call(cbind, counts)
# FPKM-style normalisation: per-million column scaling, then gene length.
fpkms <- t(t(counts) / (colSums(counts)/1e6)) / gene.length

library(pheatmap)
library(RColorBrewer)
#library(tsne)

cluster.annotation <- data.frame(
  Cluster = rep(LETTERS[1:length(pop.sizes)], pop.sizes)
)
cor.df <- as.data.frame(cor(log2(fpkms + 1)))

rownames(cor.df) <- paste("Cell", as.character(1:sum(pop.sizes)), sep = "")
rownames(cluster.annotation) <- rownames(cor.df)

pheatmap(
  cor.df,
  color = colorRampPalette(brewer.pal(n = 7, name = "Blues"))(100),
  annotation_row = cluster.annotation,
  show_rownames = FALSE, show_colnames = FALSE
)
--------------------------------------------------------------------------------
/R/test_simulation.R:
--------------------------------------------------------------------------------
# Sanity-check a simulated source/target dataset pair: draw correlation
# heatmaps of each dataset, and of the combined data, annotated with the
# ground-truth cluster labels (and, for the combined plot, dataset origin).

library(readr)
library(pheatmap)

source.fname <- "source_data_T5_500_S5_500.tsv"
target.fname <- "target_data_T5_500_S5_500.tsv"

source.data <- read_tsv(source.fname, col_names = FALSE)
target.data <- read_tsv(target.fname, col_names = FALSE)

# Give cells unique column names across both datasets.
colnames(source.data) <- as.character(1:ncol(source.data))
colnames(target.data) <- as.character((ncol(source.data) + 1):(ncol(source.data) + ncol(target.data)))

# Bug fix: read_tsv() returns tibbles, and setting rownames() on a tibble is
# deprecated/ineffective, so the rownames(...) assignments below silently
# failed and pheatmap could not match annotations to cells. Coerce the label
# tables to plain data.frames first.
source.labels <- as.data.frame(read_tsv("source_labels_T5_500_S5_500.tsv", col_names = FALSE))
source.labels$X1 <- factor(source.labels$X1)
target.labels <- as.data.frame(read_tsv("target_labels_T5_500_S5_500.tsv", col_names = FALSE))
target.labels$X1 <- factor(target.labels$X1)

# Cell-cell correlation matrices; cor() names both dims after the columns.
source.dist.mat <- cor(source.data)
rownames(source.labels) <- rownames(source.dist.mat)

target.dist.mat <- cor(target.data)
rownames(target.labels) <- rownames(target.dist.mat)

pheatmap(
  source.dist.mat,
  annotation_col = source.labels,
  clustering_method = "ward.D",
  show_colnames = FALSE, show_rownames = FALSE
)
pheatmap(
  target.dist.mat,
  annotation_col = target.labels,
  clustering_method = "ward.D",
  show_colnames = FALSE, show_rownames = FALSE
)

# Combined view: both datasets side by side, labelled with cluster and origin.
combined.data <- cbind(source.data, target.data)
combined.labels <- rbind(source.labels, target.labels)

combined.dist.mat <- cor(combined.data)
rownames(combined.labels) <- rownames(combined.dist.mat)

combined.labels$X2 <- factor(c(rep("Source", ncol(source.data)), rep("Target", ncol(target.data))))

pheatmap(
  combined.dist.mat,
  annotation_col = combined.labels,
  clustering_method = "ward.D",
  show_colnames = FALSE, show_rownames = FALSE
)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | scRNA - Transfer learning for clustering single-cell RNA-Seq data
2 | =====
3 | A Python framework for single-cell RNA-Seq clustering with special focus on transfer learning.
4 |
5 | This package contains methods for generating artificial data, clustering, and transferring knowledge
6 | from a source to a target dataset.
7 |
8 | This software package is developed by Nico Goernitz, Bettina Mieth, Marina Vidovic, and Alex Gutteridge.
9 |
10 | 
11 |
12 | #### Publication
13 | The Python framework and this website are part of a publication currently under peer-review at Nature Scientific Reports. Links to the published paper and online supplementary material will be included here once available.
14 |
15 | **Abstract:** In many research areas scientists are interested in clustering objects within small datasets while making use of prior knowledge from large reference datasets. We propose a method to apply the machine learning concept of transfer learning to unsupervised clustering problems and show its effectiveness in the field of single-cell RNA sequencing (scRNA-Seq). The goal of scRNA-Seq experiments is often the definition and cataloguing of cell types from the transcriptional output of individual cells. To improve the clustering of small disease- or tissue-specific datasets, for which the identification of rare cell types is often problematic, we propose a transfer learning method to utilize large and well-annotated reference datasets, such as those produced by the Human Cell Atlas. Our approach modifies the dataset of interest while incorporating key information from the larger reference dataset via Non-negative Matrix Factorization (NMF). The modified dataset is subsequently provided to a clustering algorithm. We empirically evaluate the benefits of our approach on simulated scRNA-Seq data as well as on publicly available datasets. Finally, we present results for the analysis of a recently published small dataset and find improved clustering when transferring knowledge from a large reference dataset.
16 |
17 | #### News
18 | * (2019.08) Information on the experimental results presented in our paper (_under review_) can be accessed
19 | in the Section ["Replicating experiments"](#replicating-experiments)
20 | * (2019.08) We added example application using Jupyter notebooks (cf. Section ["Example application"](#example-application))
21 | * (2019.08) Added Python 3 support (scRNA no longer supports Python 2)
22 | * (2019.08) Finalized version
23 | * (2017.02) Added Travis-CI
24 | * (2017.02) Added string label support
25 | * (2017.02) Simple example [available](#example)
26 | * (2017.02) [Website](http://nicococo.github.io/scRNA/) is up and running
27 | * (2017.02) [Wiki](https://github.com/nicococo/scRNA/wiki) with detailed information (e.g. command line arguments)
28 | * (2017.01) Please report [Bugs](https://github.com/nicococo/scRNA/issues) or other inconveniences
29 | * (2017.01) scRNA can now be conveniently installed using the _pip install git+https://github.com/nicococo/scRNA.git_
30 | command (see [Installation](#installation) for further information)
31 | * (2017.01) Command line script available
32 |
33 |
34 |
35 | Getting started
36 | ---------------
37 |
38 | ### Installation
39 | We assume that Python is installed and the _pip_ command is
40 | callable from the command line. If starting from scratch, we recommend installing
41 | the [Anaconda](https://www.continuum.io/downloads) open data science
42 | platform (w/ Python 3) which comes with a bunch of most useful packages
43 | for scientific computing.
44 |
45 | The *scRNA* software package can be installed using the _pip install git+https://github.com/nicococo/scRNA.git_
46 | command. After successful completion, three command line arguments will be
47 | available for **MacOS and Linux only**:
48 |
49 | * _scRNA-generate-data.sh_
50 | * _scRNA-source.sh_
51 | * _scRNA-target.sh_
52 |
53 |
54 | ### Example
55 | **Step 1**: Installation with _pip install git+https://github.com/nicococo/scRNA.git_
56 | 
57 |
58 |
59 |
60 | **Step 2**: Check the scripts
61 | 
62 |
63 |
64 |
65 | **Step 3**: Create directory /foo. Go to directory /foo. Generate some artificial data
66 | by simply calling the _scRNA-generate-data.sh_ (using only default parameters).
67 |
68 |
69 | 
70 |
71 |
72 | This will result in a number of files:
73 | * Gene ids
74 | * Source- and target data
75 | * Source- and target ground truth labels
76 |
77 |
78 |
79 | **Step 4**: NMF of source data using the provided gene ids and source data. I.e., we want
80 | to turn off the cell- and gene-filter as well as the log transformation.
81 | You can provide source labels to be used as a starting point for NMF. If not those labels
82 | will be generated via NMF Clustering.
83 | Potential problems:
84 | * If an ''Intel MKL FATAL ERROR: Cannot load libmkl_avx.so or libmkl_def.so.'' error
85 | occurs and Anaconda open data science platform is used, then use _conda install mkl_ first.
86 | * Depending on the data and cluster range, this step can take time. However, you can
87 | speed up the process by turning off the t-SNE plots using the _--no-tsne_ command
88 | (see [Wiki](https://github.com/nicococo/scRNA/wiki) for further information)
89 |
90 | 
91 |
92 |
93 | This will result in a number of files:
94 | * t-SNE plots (.png) for every number of clusters as specified in the --cluster-range argument (default 6,7,8)
95 | * Output source model in .npz format for every number of cluster as specified in the --cluster-range argument (default 6,7,8)
96 | * A summarizing .png figure
97 | * True cluster labels - either as provided from user or as generated via NMF Clustering - (and corresponding cell id) in .tsv format for every number of cluster as specified in the --cluster-range argument (default 6,7,8)
98 | * Model cluster labels after NMF (and corresponding cell id) in .tsv format for every number of cluster as specified in the --cluster-range argument (default 6,7,8)
99 |
100 |
101 |
102 |
103 | **Step 5**: Now, it is time to cluster the target data and transfer knowledge from the source model to our target data. Therefore, we need to
104 | choose a source data model which was generated in **Step 4**. In this example, we will
105 | pick the model with 8 clusters (*src_c8.npz*).
106 |
107 | * Depending on the data, the cluster range and the mixture range, this step can take a long
108 | time. However, you can speed up the process by turning off the t-SNE plots using the _--no-tsne_ command
109 | (see [Wiki](https://github.com/nicococo/scRNA/wiki) for further information)
110 |
111 | 
112 |
113 | Which results in a number of files (for each value in the cluster range).
114 | * Predicted cluster labels after transfer learning (and corresponding cell id) in .tsv format for every number of cluster as specified in the --cluster-range argument (default 6,7,8)
115 | * t-SNE plots with predicted labels (.png)
116 | * Data and gene ids in .tsv files
117 |
118 | In addition there is a summarizing .png figure of all accs and a t-SNE plot with the real target labels, if they were provided.
119 |
120 | 
121 |
122 | Command line output shows a number of results: unsupervised and supervised (if no ground truth labels
123 | are given this will remain 0.) accuracy measures.
124 |
125 | Example application
126 | ---------------
127 |
128 | Using Jupyter notebooks, we showcase the main workflow as well as the abilities of the application.
129 | The main features are
130 | * generating read-count data
131 | * data splits using various scenarios
132 | * source data clustering with and without accompanying labels
133 | * augmented clustering of the target data with user defined mix-in of the source data influence.
134 |
135 | The Jupyter notebook can be accessed under [https://github.com/nicococo/scRNA/blob/master/notebooks/example.ipynb][example_notebook]
136 |
137 |
138 | [example_notebook]: https://github.com/nicococo/scRNA/blob/master/notebooks/example.ipynb
139 |
140 |
141 | Replicating experiments
142 | ---------------
143 | In the course of our research (Mieth et al., see references below) we have investigated the performance of the proposed method in comparison with the most important baseline methods firstly in a simulation study on generated data, secondly on subsampled real data (Tasic et al.) and finally on two independent real datasets (Hockley et al. and Usoskin et al.). We have also shown, that batch effect removal approaches (Butler et al.) and imputation methods (Van Dijk et al.) can be used to further improve clustering results when applying our method.
144 |
145 | * Mieth, B. et al. Clustering single-cell RNA-Seq data: An approach to transferring prior reference knowledge into datasets of small sample size. Under review at Nat. Sci. Rep. (2019)
146 | * Tasic, B. et al. Adult mouse cortical cell taxonomy revealed by single cell transcriptomics. Nat. Neurosci. 19, 335–46 (2016).
147 | * Hockley, J. R. F. et al. Single-cell RNAseq reveals seven classes of colonic sensory neuron. Gut. 2017–315631 (2018).
148 | * Usoskin, D. et al. Unbiased classification of sensory neuron types by large-scale single-cell RNA sequencing. Nat. Neurosci. 18, 145–153 (2014).
149 | * Butler, A., Hoffman, P., Smibert, P., Papalexi, E. & Satija, R. Integrating single-cell transcriptomic data across different conditions, technologies, and species. Nat. Biotechnol. 36, 411-420 (2018).
150 | * Van Dijk, D. et al. Recovering Gene Interactions from Single-Cell Data Using Data Diffusion. Cell. 174, 716-729 (2018).
151 |
152 | To fully reproduce the experiments of our study you can find the corresponding scripts at the following links:
153 |
154 | * [Experiments on generated datasets](https://github.com/nicococo/scRNA/blob/master/scripts/experiments/main_wrapper_generated_data.py)
155 | * [Experiments on subsampled Tasic data with labels from the original publication](https://github.com/nicococo/scRNA/blob/master/scripts/experiments/main_wrapper_tasic.py)
156 | * [Experiments on subsampled Tasic data with NMF labels](https://github.com/nicococo/scRNA/blob/master/scripts/experiments/main_wrapper_tasic_NMF_labels.py)
157 | * [Experiments on Hockley data with Usoskin as source data](https://github.com/nicococo/scRNA/blob/master/scripts/experiments/main_wrapper_hockley.py)
158 | * [Robustness experiments on Hockley data with Usoskin as source data](https://github.com/nicococo/scRNA/blob/master/scripts/experiments/main_wrapper_hockley_robustness.py)
159 | * [Robustness experiments on Hockley data with usoskin as source data and pre-processing through Seurats batch effect removal method](https://github.com/nicococo/scRNA/blob/master/scripts/experiments/main_wrapper_hockley_robustness_seurat.py)
160 | - [corresponding R script for Seurat pre-processing](https://github.com/nicococo/scRNA/blob/master/R/seurat_cca_hockley_usoskin.R)
161 | * [Robustness experiments on Hockley data with usoskin as source data and pre-processing through MAGIC imputation](https://github.com/nicococo/scRNA/blob/master/scripts/experiments/main_wrapper_hockley_robustness_magic.py)
162 | - [corresponding Matlab script for MAGIC pre-processing](https://github.com/nicococo/scRNA/blob/master/matlab/magic_hockley_usoskin.m)
163 |
164 | For producing the figures of the paper go to:
165 | * [Figure 2](https://github.com/nicococo/scRNA/blob/master/scripts/plots/main_plots_generated_data.py)
166 | * [Figure 3](https://github.com/nicococo/scRNA/blob/master/scripts/plots/main_plots_tasic.py)
167 | * [Figure 4](https://github.com/nicococo/scRNA/blob/master/scripts/plots/main_plots_hockley.py)
168 |
169 | For evaluating the robustness experiments producing table 1 of the paper go to:
170 | * [Without preprocessing](https://github.com/nicococo/scRNA/blob/master/scripts/plots/evaluate_hockley_robustness.py)
171 | * [With Seurat and MAGIC pre-processing](https://github.com/nicococo/scRNA/blob/master/scripts/plots/evaluate_hockley_robustness_magic_seurat.py)
172 |
173 | Parameter Selection
174 | ---------------
175 | All pre-processing parameters of the experiments presented in the paper can be found in the corresponding scripts (above) and in the online supplementary material (for the generated datasets in Supplementary Sections 2.1, for the Tasic data in Supplementary Sections 3.1 and for the Hockley and Usoskin datasets in Supplementary Sections 4.1. Details on all other parameters of the respective datasets can also be found in the scripts or in the corresponding sections of the supplementary online material (Supplementary Sections 2.2, 3.2 and 4.2, respectively).
176 |
177 |
178 | Data availability
179 | ---------------
180 | The datasets analyzed during the current study are available in the following GEO repositories:
181 | * Tasic et al. (2016): https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE71585.
182 | * Usoskin et al. (2014): https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE59739
183 | * Hockley et al. (2018): https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE102962
184 | * The command line script for simulating scRNA-Seq datasets is available at https://github.com/nicococo/scRNA/blob/master/scRNA/cmd_generate_data.py.
185 |
186 |
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nicococo/scRNA/72999f7e8c813534b193d9c77a10068f9d489e05/__init__.py
--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-slate
--------------------------------------------------------------------------------
/bin/scRNA-generate-data.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# Entry-point stub installed via setup.py: importing the module executes the
# scRNA data-generation command-line tool (module-level code in
# scRNA/cmd_generate_data.py -- presumably; confirm against that file).
# Fix: use the same `#!/usr/bin/env python` shebang as the sibling stubs
# scRNA-source.sh and scRNA-target.sh; bare `#!python` only works after
# setuptools rewrites it at install time.

from scRNA import cmd_generate_data
--------------------------------------------------------------------------------
/bin/scRNA-source.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# Entry-point stub installed via setup.py: importing the module executes the
# scRNA source-clustering command-line tool (module-level code in
# scRNA/cmd_source.py -- presumably; confirm against that file).

from scRNA import cmd_source
--------------------------------------------------------------------------------
/bin/scRNA-target.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# Entry-point stub installed via setup.py: importing the module executes the
# scRNA target-clustering / transfer-learning command-line tool (module-level
# code in scRNA/cmd_target.py -- presumably; confirm against that file).

from scRNA import cmd_target
--------------------------------------------------------------------------------
/doc/screen_install_generate.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nicococo/scRNA/72999f7e8c813534b193d9c77a10068f9d489e05/doc/screen_install_generate.png
--------------------------------------------------------------------------------
/doc/screen_install_pip.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nicococo/scRNA/72999f7e8c813534b193d9c77a10068f9d489e05/doc/screen_install_pip.png
--------------------------------------------------------------------------------
/doc/screen_install_result.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nicococo/scRNA/72999f7e8c813534b193d9c77a10068f9d489e05/doc/screen_install_result.png
--------------------------------------------------------------------------------
/doc/screen_install_scripts.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nicococo/scRNA/72999f7e8c813534b193d9c77a10068f9d489e05/doc/screen_install_scripts.png
--------------------------------------------------------------------------------
/doc/screen_install_source.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nicococo/scRNA/72999f7e8c813534b193d9c77a10068f9d489e05/doc/screen_install_source.png
--------------------------------------------------------------------------------
/doc/screen_install_target.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nicococo/scRNA/72999f7e8c813534b193d9c77a10068f9d489e05/doc/screen_install_target.png
--------------------------------------------------------------------------------
/matlab/magic_hockley_usoskin.m:
--------------------------------------------------------------------------------
%% MAGIC Pre-processing for Hockley and Usoskin datasets
% Runs MAGIC imputation on two expression matrices (Hockley/"Jim" and
% Usoskin) and writes the imputed, non-negative matrices out as TSV files.
% NOTE(review): all paths below are hard-coded to one developer's Windows
% machine; adjust before running elsewhere.

% go to correct directory before you start!
cd 'C:/Users/Bettina/Desktop/MAGIC-master/matlab';

%% load Jims data
% data should be cells as rows and genes as columns
file = 'C:/Users/Bettina/ml/scRNAseq/Data/Jims data/Visceraltpm_m_fltd_mat.tsv';
data = importdata(file);
% transpose so cells are rows (per the note above, the file presumably
% stores genes as rows — confirm)
data = data';

%% MAGIC
[pc_imputed, U, ~] = run_magic(data, 'npca', 100, 'k', 15, 'a', 15, 'make_plot_opt_t', false);
% Reconstruct expression from the imputed principal components, transpose
% back, and clamp small negative reconstruction artifacts to zero.
magic_data = pc_imputed * U';
magic_data = magic_data';
magic_data(magic_data<0) = 0;

% Save results
dlmwrite('jim_data_magic.tsv', magic_data, 'delimiter', '\t');

%% load Usoskin data
file = 'C:/Users/Bettina/ml/scRNAseq/Data/Usoskin data/usoskin_m_fltd_mat.tsv';
data = importdata(file);
data = data';

%% MAGIC (same parameters as for the Hockley data above)
[pc_imputed, U, ~] = run_magic(data, 'npca', 100, 'k', 15, 'a', 15, 'make_plot_opt_t', false);
magic_data = pc_imputed * U';
magic_data = magic_data';
magic_data(magic_data<0) = 0;

% Save results
dlmwrite('usoskin_data_magic.tsv', magic_data, 'delimiter', '\t');
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | scipy
2 | numpy==1.16.2
3 | numba
4 | nose
5 | matplotlib
6 | cvxopt
7 | scikit-learn
8 | pandas
--------------------------------------------------------------------------------
/scRNA/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nicococo/scRNA/72999f7e8c813534b193d9c77a10068f9d489e05/scRNA/__init__.py
--------------------------------------------------------------------------------
/scRNA/abstract_clustering.py:
--------------------------------------------------------------------------------
from abc import ABCMeta, abstractmethod

import numpy as np


class AbstractClustering(object, metaclass=ABCMeta):
    """Base class for clustering pipelines on (genes x cells) expression data.

    Stores the raw data matrix plus configurable cell filters, gene filters
    and a data transformation, and provides the shared pre-processing step.
    Subclasses implement apply() and fill cluster_labels.
    """

    cell_filter_list = None   # callables: data -> indices of cells to keep
    gene_filter_list = None   # callables: data -> indices of genes to keep
    data_transf = None        # callable: matrix -> transformed matrix

    data = None               # (num_transcripts x num_cells) float matrix
    gene_ids = None           # per-gene ids; defaults to 0..num_transcripts-1
    num_cells = -1
    num_transcripts = -1

    pp_data = None            # filtered + transformed data matrix
    cluster_labels = None     # per-cell cluster assignment

    remain_cell_inds = None   # cell indices surviving all cell filters
    remain_gene_inds = None   # gene indices surviving all gene filters

    def __init__(self, data, gene_ids=None):
        """Store the (genes x cells) data matrix and optional gene ids."""
        # init lists
        self.cell_filter_list = list()
        self.gene_filter_list = list()
        self.data_transf = lambda x: x  # identity transform by default
        self.gene_ids = gene_ids
        # Use the built-in float: np.float was deprecated in NumPy 1.20
        # and removed in 1.24.
        self.data = np.array(data, dtype=float)
        # Read the shape from the converted ndarray so plain nested lists
        # also work (the original read `data.shape` on the raw argument).
        self.num_transcripts, self.num_cells = self.data.shape
        if self.gene_ids is None:
            # print('No gene ids provided.')
            self.gene_ids = np.arange(self.num_transcripts)
        self.cluster_labels = np.zeros((self.num_cells, 1))

    def set_data_transformation(self, data_transf):
        """Set the transformation applied after filtering (matrix -> matrix)."""
        self.data_transf = data_transf

    def add_cell_filter(self, cell_filter):
        """Append a cell filter (callable: data -> kept cell indices)."""
        if self.cell_filter_list is None:
            # Fix: the original did `list(cell_filter)`, which tries to
            # *iterate* the filter callable and raises TypeError.
            self.cell_filter_list = [cell_filter]
        else:
            self.cell_filter_list.append(cell_filter)

    def add_gene_filter(self, gene_filter):
        """Append a gene filter (callable: data -> kept gene indices)."""
        if self.gene_filter_list is None:
            # Same fix as add_cell_filter: wrap the callable, don't iterate it.
            self.gene_filter_list = [gene_filter]
        else:
            self.gene_filter_list.append(gene_filter)

    def pre_processing(self):
        """Run all filters plus the transformation; cache and return pp_data."""
        self.pp_data, self.remain_gene_inds, self.remain_cell_inds = self.pre_processing_impl(self.data)
        return self.pp_data

    def pre_processing_impl(self, data):
        """Apply cell filters, then gene filters, then the data transformation.

        Returns (transformed matrix, kept gene indices, kept cell indices).
        """
        transcripts, cells = data.shape
        # 1. cell filter: intersect the survivors of every registered filter
        remain_cell_inds = np.arange(0, cells)
        for c in self.cell_filter_list:
            res = c(data)
            remain_cell_inds = np.intersect1d(remain_cell_inds, res)
        A = data[:, remain_cell_inds]

        # 2. gene filter: filters see the *full* data; the resulting indices
        # remain valid on A because cell filtering leaves the gene axis
        # untouched.
        remain_gene_inds = np.arange(0, transcripts)
        for g in self.gene_filter_list:
            res = g(data)
            remain_gene_inds = np.intersect1d(remain_gene_inds, res)

        # 3. data transformation
        B = A[remain_gene_inds, :]
        X = self.data_transf(B)
        return X, remain_gene_inds, remain_cell_inds

    @abstractmethod
    def apply(self):
        """Run the clustering; implementations must set cluster_labels."""
        pass

    def __str__(self):
        if self.cluster_labels is None:
            return 'Empty cluster pipeline.'
        ret = 'Cluster Pipeline ({1} processed datapoints, {0} processed features):\n'.format(
            self.pp_data.shape[0], self.pp_data.shape[1])
        ret = '{0}-------------------------------------\n'.format(ret)
        lbls = np.unique(self.cluster_labels)
        for i in range(lbls.size):
            inds = np.where(self.cluster_labels == lbls[i])[0]
            ret = '{2}({1})[{0}'.format(inds[0], lbls[i], ret)
            for j in range(1, inds.size):
                ret = '{0},{1}'.format(ret, inds[j])
            ret = '{0}]\n'.format(ret)
        ret = '{0}-------------------------------------\n'.format(ret)
        return ret
--------------------------------------------------------------------------------
/scRNA/cmd_generate_data.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import ast
3 | import sys
4 |
5 | from scRNA.simulation import generate_toy_data, split_source_target
6 | from scRNA.utils import *
7 |
8 | # 0. PARSE ARGUMENTS
9 | parser = argparse.ArgumentParser()
10 | parser.add_argument(
11 | "--fout_target_data",
12 | help = "Output filename target data",
13 | default = 'fout_target_data.tsv',
14 | type = str
15 | )
16 | parser.add_argument(
17 | "--fout_source_data",
18 | help = "Output filename source data",
19 | default = 'fout_source_data.tsv',
20 | type = str
21 | )
22 | parser.add_argument(
23 | "--fout_geneids",
24 | help = "Output filename geneids",
25 | default = 'fout_geneids.tsv',
26 | type = str
27 | )
28 | parser.add_argument(
29 | "--fout_target_labels",
30 | help = "Output filename target labels",
31 | default = 'fout_target_labels.tsv',
32 | type = str
33 | )
34 | parser.add_argument(
35 | "--fout_source_labels",
36 | help = "Output filename source labels",
37 | default = 'fout_source_labels.tsv',
38 | type = str
39 | )
40 |
41 | parser.add_argument(
42 | "--num_genes",
43 | help = "Number of genes/transcripts per cell (default 1000)",
44 | default = 1000,
45 | type = int
46 | )
parser.add_argument(
    "--num_cells",
    # The help text previously claimed "default 1100" while the actual
    # default below is 2000; keep the two in sync.
    help = "Number of cells (default 2000)",
    default = 2000,
    type = int
)
53 |
54 | parser.add_argument(
55 | "--cluster_spec",
56 | help = "Cluster specification as Python list",
57 | default = "[1, 2, 3, [4, 5], [6, [7, 8]]]",
58 | type = str
59 | )
60 | parser.add_argument(
61 | "--dir_cluster_size",
62 | help = "Dirichlet parameter cluster size (default 10)",
63 | default = 10,
64 | type = float
65 | )
66 |
67 | parser.add_argument(
68 | "--gamma_shape",
69 | help = "Gamma distribution shape parameter (default 2)",
70 | default = 2,
71 | type = float
72 | )
73 | parser.add_argument(
74 | "--gamma_rate",
75 | help = "Gamma distribution rate parameter (default 2)",
76 | default = 2,
77 | type = float
78 | )
79 | parser.add_argument(
80 | "--nb_dispersion",
81 | help = "Negative binomial distribution dispersion parameter (default 0.1)",
82 | default = 0.1,
83 | type = float
84 | )
85 | parser.add_argument(
86 | "--min_prop_genes_de",
87 | help = "Minimum proportion of genes DE in each cluster (default 0.1)",
88 | default = 0.1,
89 | type = float
90 | )
91 | parser.add_argument(
92 | "--max_prop_genes_de",
93 | help = "Maximum proportion of genes DE in each cluster (default 0.4)",
94 | default = 0.4,
95 | type = float
96 | )
97 | parser.add_argument(
98 | "--mean_de_logfc",
99 | help = "Mean log2 fold change of DE genes (default 1)",
100 | default = 1,
101 | type = float
102 | )
103 | parser.add_argument(
104 | "--sd_de_logfc",
105 | help = "Standard deviation of log2 fold change of DE genes (default 0.5)",
106 | default = 0.5,
107 | type = float
108 | )
109 |
110 | parser.add_argument(
111 | "--target_ncells",
112 | help = "How much of data will be target data (default 100)",
113 | default = 100,
114 | type = int
115 | )
116 | parser.add_argument(
117 | "--source_ncells",
118 | help = "How much of data will be source data (default 1000)",
119 | default = 1000,
120 | type = int
121 | )
122 | parser.add_argument(
123 | "--source_ncells_range",
124 | help = "How much of data will be source data (range)",
125 | default = "[]",
126 | type = str
127 | )
128 | parser.add_argument(
129 | "--target_ncells_range",
130 | help = "How much of data will be target data (range)",
131 | default = "[]",
132 | type = str
133 | )
134 | parser.add_argument(
135 | "--noise_target",
136 | help = "Add noise to target data",
137 | dest = "noise_target",
138 | action = 'store_true'
139 | )
140 | parser.add_argument(
141 | "--no-noise_target",
142 | help = "Do not add noise to target",
143 | dest = "noise_target",
144 | action = 'store_false'
145 | )
146 | parser.set_defaults(noise_target = False)
147 |
148 | parser.add_argument(
149 | "--noise_sd",
150 | help = "Standard deviation of target noise",
151 | default = 0.5,
152 | type = float
153 | )
154 |
155 | parser.add_argument(
156 | "--splitting_mode",
157 | help = "Splitting mode:\n\t- 1 = split randomly\n\t- 2 = split randomly, but stratified\n\t- 3 = Split randomly but antistratified\n\t- 4 = Have some overlapping and some exclusive clusters\n\t- 5 = Have only exclusive clusters\n\t- 6 = Have some defined clusters as the source\n\t- 7 = Define a range of number of overlapping clusters using variable: common\n\t(default 4)",
158 | default = 4,
159 | type = int
160 | )
parser.add_argument(
    "--source_clusters",
    help = "Clusters to use as source when splitting by mode 6. Define as Python list",
    # The previous default was the string "[", which is not a valid Python
    # literal: ast.literal_eval would raise SyntaxError whenever mode 6 was
    # used without an explicit --source_clusters. "[]" means "none given".
    default = "[]",
    type = str
)
167 |
168 | parser.add_argument(
169 | "--common",
170 | help = "Range of number of overlapping clusters when splitting by mode 7. Define as Python list",
171 | default = "[0,3,5]",
172 | type = str
173 | )
174 |
175 | parser.add_argument(
176 | "--normalise",
177 | help = "Normalise data to log2(fpkm+1)",
178 | dest = "normalise",
179 | action = 'store_true'
180 | )
181 | parser.add_argument(
182 | "--no-normalise",
183 | help = "Disable normalise data to log2(fpkm+1)",
184 | dest = "normalise",
185 | action = 'store_false'
186 | )
187 | parser.set_defaults(normalise = False)
188 |
189 | args = parser.parse_args(sys.argv[1:])
190 | print('Command line arguments:')
191 | print(args)
192 |
try:
    cluster_spec = ast.literal_eval(args.cluster_spec)
# literal_eval raises ValueError (not only SyntaxError) for inputs that parse
# but are not literals, e.g. "foo()".
except (SyntaxError, ValueError):
    sys.stderr.write("Error: Invalid cluster specification.")
    # Exit non-zero so shell callers can detect the failure; bare sys.exit()
    # reported success on this error path.
    sys.exit(1)
198 |
199 |
try:
    source_clusters = None
    # Only mode 6 uses an explicit source-cluster list.
    if args.splitting_mode == 6:
        source_clusters = ast.literal_eval(args.source_clusters)
# literal_eval can also raise ValueError for non-literal input.
except (SyntaxError, ValueError):
    sys.stderr.write("Error: Invalid source cluster specification.")
    # Non-zero exit status: this is an error path.
    sys.exit(1)
207 |
208 |
# 1. GENERATE TOY DATA
print('\nGenerate artificial single-cell RNA-seq data.')
data, labels = generate_toy_data(
    num_genes = args.num_genes,
    num_cells = args.num_cells,

    cluster_spec = cluster_spec,
    dirichlet_parameter_cluster_size = args.dir_cluster_size,

    gamma_shape = args.gamma_shape,
    gamma_rate = args.gamma_rate,
    nb_dispersion = args.nb_dispersion,
    min_prop_genes_de = args.min_prop_genes_de,
    max_prop_genes_de = args.max_prop_genes_de,
    mean_de_logfc = args.mean_de_logfc,
    sd_de_logfc = args.sd_de_logfc
)
print(('Data dimension: ', data.shape))

# Counts are written as unsigned integers unless the normalisation below
# turns the matrix into floats.
output_fmt = "%u"

#Perform FPKM and log2 normalisation if required
if args.normalise:
    # Per-cell scaling by column sums / 1e6 (FPKM-style), then log2(x + 1).
    data = np.log2(data.astype(float) / (np.sum(data, 0) / 1e6) + 1)
    output_fmt = "%f"
234 |
235 | # 2. SPLIT TOY DATA IN TARGET AND SOURCE DATA
try:
    source_ncells_range = ast.literal_eval(args.source_ncells_range)
    target_ncells_range = ast.literal_eval(args.target_ncells_range)
# literal_eval can also raise ValueError for non-literal input.
except (SyntaxError, ValueError):
    sys.stderr.write("Error: Invalid source/target size specification.")
    # Non-zero exit status: this is an error path.
    sys.exit(1)
242 |
# Fall back to the single scalar sizes when no explicit range was given.
if len(source_ncells_range) == 0:
    source_ncells_range = [args.source_ncells]

if len(target_ncells_range) == 0:
    target_ncells_range = [args.target_ncells]

# One split (and one set of output files) per source-size x target-size combo.
for sidx, source_ncells in enumerate(source_ncells_range):
    for tidx, target_ncells in enumerate(target_ncells_range):

        print('\nSplit artificial single-cell RNA-seq data in target and source data.')
        data_source, data_target, true_labels_source, true_labels_target = \
            split_source_target(
            data,
            labels,
            target_ncells = target_ncells,
            source_ncells = source_ncells,
            source_clusters = source_clusters,
            noise_target = args.noise_target,
            noise_sd = args.noise_sd,
            mode = args.splitting_mode,
            # NOTE(review): passed through as the raw CLI string (e.g.
            # "[0,3,5]"), unlike the other list arguments which are parsed
            # with ast.literal_eval above — presumably split_source_target
            # parses it itself; confirm.
            common = args.common
            )
        print(('Target data dimension: ', data_target.shape))
        print(('Source data dimension: ', data_source.shape))

        # 3. GENERATE GENE AND CELL NAMES
        gene_ids = np.arange(args.num_genes)

        # 4. SAVE RESULTS
        # Output file names embed the grid position and sizes, e.g.
        # fout_target_data_T1_100_S1_1000.tsv.
        print(('Saving target data to \'{0}\'.'.format(args.fout_target_data)))
        np.savetxt(
            os.path.splitext(args.fout_target_data)[0] +
            "_T" + str(tidx+1) + "_" + str(target_ncells) +
            "_S" + str(sidx+1) + "_" + str(source_ncells) +
            os.path.splitext(args.fout_target_data)[1],
            data_target,
            fmt = output_fmt,
            delimiter = '\t'
        )
        # Label entries are written with a textual 'lbl_' prefix.
        np.savetxt(
            os.path.splitext(args.fout_target_labels)[0] +
            "_T" + str(tidx+1) + "_" + str(target_ncells) +
            "_S" + str(sidx+1) + "_" + str(source_ncells) +
            os.path.splitext(args.fout_target_labels)[1],
            true_labels_target,
            fmt = 'lbl_%u',
            delimiter = '\t'
        )
        # Gene ids are identical for every split; this file is simply
        # overwritten on each iteration.
        np.savetxt(
            args.fout_geneids,
            gene_ids,
            fmt = '%u',
            delimiter = '\t'
        )

        print(('Saving source data to \'{0}\'.'.format(args.fout_source_data)))
        np.savetxt(
            os.path.splitext(args.fout_source_data)[0] +
            "_T" + str(tidx+1) + "_" + str(target_ncells) +
            "_S" + str(sidx+1) + "_" + str(source_ncells) +
            os.path.splitext(args.fout_source_data)[1],
            data_source,
            fmt = output_fmt,
            delimiter = '\t'
        )
        np.savetxt(
            os.path.splitext(args.fout_source_labels)[0] +
            "_T" + str(tidx+1) + "_" + str(target_ncells) +
            "_S" + str(sidx+1) + "_" + str(source_ncells) +
            os.path.splitext(args.fout_source_labels)[1],
            true_labels_source,
            fmt = 'lbl_%u',
            delimiter = '\t'
        )

print('Done.')
--------------------------------------------------------------------------------
/scRNA/cmd_source.py:
--------------------------------------------------------------------------------
1 | import matplotlib as mpl
2 | mpl.use('Agg')
3 |
4 | import argparse, sys
5 |
6 | from functools import partial
7 | from sklearn.manifold import TSNE
8 |
9 | from scRNA.nmf_clustering import NmfClustering_initW, NmfClustering
10 | from scRNA.utils import *
11 |
12 | # --------------------------------------------------
13 | # PARSE COMMAND LINE ARGUMENTS
14 | # --------------------------------------------------
15 | parser = argparse.ArgumentParser()
16 | parser.add_argument("--fname", help="Source data (TSV file)", type=str, default='fout_source_data_T1_100_S1_1000.tsv')
17 | # parser.add_argument("--fname", help="Source data (TSV file)", type=str, default='C:\Users\Bettina\PycharmProjects2\scRNA_new\data\Usoskin\usoskin_m_fltd_mat.tsv')
18 | parser.add_argument("--fgene-ids", help="Source data gene ids (TSV file)", dest='fgene_ids', type=str, default='fout_geneids.tsv')
19 | # parser.add_argument("--fgene-ids", help="Source data gene ids (TSV file)", dest='fgene_ids', type=str, default='C:\Users\Bettina\PycharmProjects2\scRNA_new\data\Usoskin\usoskin_m_fltd_row.tsv')
20 | parser.add_argument("--fname-trg", help="Target data (TSV file)", type=str, default='fout_target_data_T1_100_S1_1000.tsv')
21 | # parser.add_argument("--fname-trg", help="Target data (TSV file)", type=str, default='C:\Users\Bettina\PycharmProjects2\scRNA_new\data\Jim\Visceraltpm_m_fltd_mat.tsv')
22 | parser.add_argument("--fgene-ids-trg", help="Target data gene ids (TSV file)", dest='fgene_ids_trg', type=str, default='fout_geneids.tsv')
23 | # parser.add_argument("--fgene-ids-trg", help="Target data gene ids (TSV file)", dest='fgene_ids_trg', type=str, default='C:\Users\Bettina\PycharmProjects2\scRNA_new\data\Jim\Visceraltpm_m_fltd_row.tsv')
24 | parser.add_argument("--fout", help="Result files will use this prefix.", default='src', type=str)
25 | parser.add_argument("--flabels", help="[optional] Source cluster labels (TSV file)", required=False, type=str,default = 'fout_source_labels_T1_100_S1_1000.tsv')
26 | # parser.add_argument("--flabels", help="[optional] Source cluster labels (TSV file)", required=False, type=str)
27 |
28 | parser.add_argument("--min_expr_genes", help="(Cell filter) Minimum number of expressed genes (default 2000)", default=2000, type=int)
29 | parser.add_argument("--non_zero_threshold", help="(Cell/gene filter) Threshold for zero expression per gene (default 2.0)", default=2.0, type=float)
30 | parser.add_argument("--perc_consensus_genes", help="(Gene filter) Filter genes that coincide across a percentage of cells (default 0.94)", default=0.94, type=float)
31 |
32 | parser.add_argument("--cluster-range", help="Comma separated list of numbers of clusters (default 8)", dest='cluster_range', default='8', type=str)
33 |
34 | parser.add_argument("--nmf_alpha", help="(NMF) Regularization strength (default 10.0)", default=10.0, type = float)
35 | parser.add_argument("--nmf_l1", help="(NMF) L1 regularization impact [0,1] (default 0.75)", default=0.75, type = float)
36 | parser.add_argument("--nmf_max_iter", help="(NMF) Maximum number of iterations (default 4000).", default=4000, type = int)
37 | parser.add_argument("--nmf_rel_err", help="(NMF) Relative error threshold must be reached before convergence (default 1e-3)", default=1e-3, type=float)
38 |
39 | parser.add_argument(
40 | "--cell-filter",
41 | help = "Enable cell filter for source and target datasets.",
42 | dest = "use_cell_filter",
43 | action = 'store_true')
44 | parser.add_argument(
45 | "--no-cell-filter",
46 | help = "Disable cell filter for source and target datasets.",
47 | dest = "use_cell_filter",
48 | action = 'store_false')
49 | parser.set_defaults(use_cell_filter = False)
50 |
51 | parser.add_argument(
52 | "--gene-filter",
53 | help = "Enable gene filter for source and target datasets.",
54 | dest = "use_gene_filter",
55 | action = 'store_true')
56 | parser.add_argument(
57 | "--no-gene-filter",
58 | help = "Disable gene filter for source and target datasets.",
59 | dest = "use_gene_filter",
60 | action = 'store_false')
61 | parser.set_defaults(use_gene_filter = False)
62 |
63 | parser.add_argument(
64 | "--transform",
65 | help = "Transform data to log2(x+1)",
66 | dest = "transform",
67 | action = 'store_true')
68 | parser.add_argument(
69 | "--no-transform",
70 | help = "Disable transform data to log2(x+1)",
71 | dest = "transform",
72 | action = 'store_false')
73 | parser.set_defaults(transform = False)
74 |
75 | parser.add_argument(
76 | "--tsne",
77 | help = "Enable t-SNE plots.",
78 | dest = "tsne",
79 | action = 'store_true')
80 | parser.add_argument(
81 | "--no-tsne",
82 | help = "Disable t-SNE plots.",
83 | dest = "tsne",
84 | action = 'store_false')
85 | parser.set_defaults(tsne=True)
86 |
87 | arguments = parser.parse_args(sys.argv[1:])
88 | print('Command line arguments:')
89 | print(arguments)
90 |
91 |
92 | # --------------------------------------------------
93 | # 1. LOAD DATA
94 | # --------------------------------------------------
95 | print(("\nLoading dataset (data={0} and gene_ids={1}).".format(arguments.fname, arguments.fgene_ids)))
96 | data, gene_ids, labels, labels_2_ids = load_dataset_tsv(arguments.fname, arguments.fgene_ids, flabels=arguments.flabels)
97 | print(('Data: {1} cells and {0} genes/transcripts.'.format(data.shape[0], data.shape[1])))
98 | if labels is not None:
99 | print("Source labels provided: ")
100 | print((np.unique(labels)))
101 | np.savetxt('{0}.labels2ids.tsv'.format(arguments.fout), (np.arange(labels_2_ids.size), labels_2_ids), fmt='%s', delimiter='\t')
102 | else:
103 | print("No source labels provided, they will be generated via NMF Clustering!")
104 |
105 | print('Number of genes/transcripts in data and gene-ids must coincide.')
106 | assert(data.shape[0] == gene_ids.shape[0])
107 |
108 | # --------------------------------------------------
109 | # 2. CELL and GENE FILTER
110 | # --------------------------------------------------
111 | # Preprocessing Source Data
112 | print(("Source data dimensions before preprocessing: genes x cells", data.shape))
# Cell and gene filter and transformation before the whole procedure
if arguments.use_cell_filter:
    cell_inds = sc.cell_filter(data, num_expr_genes=arguments.min_expr_genes, non_zero_threshold=arguments.non_zero_threshold)
    data = data[:, cell_inds]
    if labels is not None:
        labels = labels[cell_inds]
        # NOTE(review): labels_2_ids appears to map label values -> class ids
        # (it is saved with np.arange(labels_2_ids.size) above); indexing it
        # by *cell* indices here looks suspicious — confirm against
        # load_dataset_tsv's return contract.
        labels_2_ids = labels_2_ids[cell_inds]
120 | if arguments.use_gene_filter:
121 | gene_inds = sc.gene_filter(data, perc_consensus_genes=arguments.perc_consensus_genes, non_zero_threshold=arguments.non_zero_threshold)
122 | data = data[gene_inds, :]
123 | gene_ids = gene_ids[gene_inds]
124 | if arguments.transform:
125 | data = sc.data_transformation_log2(data)
126 | cell_filter_fun = partial(sc.cell_filter, num_expr_genes=0, non_zero_threshold=-1)
127 | gene_filter_fun = partial(sc.gene_filter, perc_consensus_genes=1, non_zero_threshold=-1)
128 | data_transf_fun = sc.no_data_transformation
129 | print(("source data dimensions after preprocessing: genes x cells: ", data.shape))
130 |
131 | # --------------------------------------------------
132 | # Gene subset between source and target
133 | # --------------------------------------------------
134 | data_trg, gene_ids_trg, _, _ = load_dataset_tsv(arguments.fname_trg, arguments.fgene_ids_trg)
135 |
136 | # Preprocessing Target Data
137 | print(("Target data dimensions before preprocessing: genes x cells", data_trg.shape))
138 | # Cell and gene filter and transformation before the whole procedure
139 | if arguments.use_cell_filter:
140 | cell_inds = sc.cell_filter(data_trg, num_expr_genes=arguments.min_expr_genes, non_zero_threshold=arguments.non_zero_threshold)
141 | data_trg = data_trg[:, cell_inds]
142 | if arguments.use_gene_filter:
143 | gene_inds = sc.gene_filter(data_trg, perc_consensus_genes=arguments.perc_consensus_genes, non_zero_threshold=arguments.non_zero_threshold)
144 | data_trg = data_trg[gene_inds, :]
145 | gene_ids_trg = gene_ids_trg[gene_inds]
146 |
147 | print(("Target data dimensions after preprocessing: genes x cells: ", data_trg.shape))
148 |
149 | print('Genes/transcripts in source and target data must coincide.')
150 | # Find gene subset
151 | gene_intersection = list(set(x for x in gene_ids_trg).intersection(set(x for x in gene_ids)))
152 |
153 | # Adjust source data to only include overlapping genes
154 | data_source_indices = list(list(gene_ids).index(x) for x in gene_intersection)
155 | data = data[data_source_indices,]
156 |
157 | print(("source data dimensions after taking target intersection: genes x cells: ", data.shape))
158 |
159 | # --------------------------------------------------
160 | # 3. CLUSTERING
161 | # --------------------------------------------------
# Parse comma separated cluster counts with the built-in int: np.int is an
# alias of int that was deprecated in NumPy 1.20 and removed in 1.24.
num_cluster = list(map(int, arguments.cluster_range.split(",")))
163 |
164 | accs_names = ['KTA (linear)', 'ARI']
165 | accs = np.zeros((2, len(num_cluster)))
166 |
167 | for i in range(len(num_cluster)):
168 | k = num_cluster[i]
169 | print(('Iteration {0}, num-cluster={1}'.format(i, k)))
170 | # --------------------------------------------------
171 | # 3.1. SETUP SOURCE DATA NMF CLUSTERING
172 | # --------------------------------------------------
173 | if labels is None:
174 | # No source labels are provided, generate them via NMF clustering
175 | nmf_labels = None
176 | nmf_labels = NmfClustering(data, gene_ids, num_cluster=k, labels=[])
177 | nmf_labels.add_cell_filter(cell_filter_fun)
178 | nmf_labels.add_gene_filter(gene_filter_fun)
179 | nmf_labels.set_data_transformation(data_transf_fun)
180 | nmf_labels.apply(k=k, alpha=arguments.nmf_alpha, l1=arguments.nmf_l1, max_iter=arguments.nmf_max_iter, rel_err=arguments.nmf_rel_err)
181 | labels = nmf_labels.cluster_labels
182 |
183 | # Use perfect number of latent states for nmf and sc3
184 | src_labels = np.array(labels, dtype=np.int)
185 | src_lbl_set = np.unique(src_labels)
186 | k_now = src_lbl_set.size
187 |
188 | nmf = None
189 | nmf = NmfClustering_initW(data, gene_ids, labels=labels, num_cluster=k_now)
190 | nmf.add_cell_filter(cell_filter_fun)
191 | nmf.add_gene_filter(gene_filter_fun)
192 | nmf.set_data_transformation(data_transf_fun)
193 | nmf.apply(k=k_now, alpha=arguments.nmf_alpha, l1=arguments.nmf_l1, max_iter=arguments.nmf_max_iter, rel_err=arguments.nmf_rel_err)
194 |
195 | # --------------------------------------------------
196 | # 3.2. EVALUATE CLUSTER ASSIGNMENT
197 | # --------------------------------------------------
198 | print('\nUnsupervised evaluation:')
199 | accs[0, i] = unsupervised_acc_kta(nmf.pp_data, nmf.cluster_labels, kernel='linear')
200 | print((' -KTA (linear) : ', accs[0, i]))
201 | print('\nSupervised evaluation:')
202 | accs[1, i] = metrics.adjusted_rand_score(labels[nmf.remain_cell_inds], nmf.cluster_labels)
203 | print((' -ARI: ', accs[1, i]))
204 |
205 | # --------------------------------------------------
206 | # 3.3. SAVE RESULTS
207 | # --------------------------------------------------
208 | nmf.cell_filter_list = None
209 | nmf.gene_filter_list = None
210 | nmf.data_transf = None
211 | print(('\nSaving data structures and results to file with prefix \'{0}_c{1}\'.'.format(arguments.fout, k)))
212 | np.savez('{0}_c{1}.npz'.format(arguments.fout, k), src=nmf, args=arguments, allow_pickle=True)
213 | np.savetxt('{0}_c{1}.labels.tsv'.format(arguments.fout, k),
214 | (nmf.cluster_labels, nmf.remain_cell_inds), fmt='%u', delimiter='\t')
215 | np.savetxt('{0}_c{1}.true_labels.tsv'.format(arguments.fout, k),
216 | (nmf.cluster_labels, labels[nmf.remain_cell_inds]), fmt='%u', delimiter='\t')
217 |
218 | # --------------------------------------------------
219 | # 3.4. T-SNE PLOT
220 | # --------------------------------------------------
221 | if arguments.tsne:
222 | model = TSNE(n_components=2, random_state=0, init='pca', method='exact', metric='euclidean', perplexity=30)
223 | ret = model.fit_transform(nmf.pp_data.T)
224 | plt.title('{0} cluster (Euclidean)'.format(k))
225 | plt.scatter(ret[:, 0], ret[:, 1], 20, nmf.cluster_labels)
226 | plt.xticks([])
227 | plt.yticks([])
228 |
229 | plt.savefig('{0}_c{1}.tsne.png'.format(arguments.fout, k), format='png', bbox_inches=None, pad_inches=0.1)
230 | # plt.show()
231 |
232 | # --------------------------------------------------
233 | # 6. SUMMARIZE RESULTS
234 | # --------------------------------------------------
235 | print('\n------------------------------ Summary:')
236 | print(('Cluster:', num_cluster))
237 | print(('Accuracy measures: ', accs_names))
238 | print('Accuracies:')
239 | print(accs)
240 |
241 | np.set_printoptions(precision=3)
242 | np.set_printoptions(suppress=True)
243 |
244 | print('\n\n\n')
245 | print('================================================================================')
246 | print('\n\n\n')
247 | print('SUMMARY')
248 | print('\n\n\n')
249 | print('Parameters')
250 | print('-------------')
251 | print((' - Output prefix: ', arguments.fout))
252 | print((' - Source file name: ', arguments.fname))
253 | print((' - Cluster:', num_cluster))
254 | if labels is not None:
255 | print((' - Class 2 label conversion (class {0:1d}-{1:1d}): '.format(
256 | np.int(np.min(labels)), np.int(np.max(labels))), labels_2_ids))
257 | print('')
258 |
259 | print('Results')
260 | print('-------------')
261 | print((' - Accuracies: ', accs_names))
262 | for i in range(accs.shape[0]):
263 | print(('\n{0} (cluster({1})):'.format(accs_names[i], len(num_cluster))))
264 | print((accs[i, :]))
265 |
266 | plt.figure(0, figsize=(20,5), dpi=100)
267 | fig, axes = plt.subplots(nrows=1, ncols=accs.shape[0])
268 | fig.tight_layout(h_pad=1.08, pad=2.2) # Or equivalently, "plt.tight_layout()"
269 | for i in range(accs.shape[0]):
270 | plt.subplot(1, accs.shape[0], i+1)
271 |
272 | if i % 2 == 0:
273 | plt.title(accs_names[i] + '\n', fontsize=12, fontweight='bold')
274 | else:
275 | plt.title('\n' + accs_names[i], fontsize=12, fontweight='bold')
276 |
277 | if i == accs.shape[0]-1:
278 | plt.bar(list(range(len(num_cluster))), accs[i, :], color='red')
279 | else:
280 | plt.bar(list(range(len(num_cluster))), accs[i, :])
281 |
282 | if i == 0:
283 | plt.xlabel('Cluster', fontsize=12)
284 | plt.ylabel('Accuracy', fontsize=12)
285 |
286 | plt.yticks(fontsize=8)
287 | plt.xticks(np.array(list(range(len(num_cluster))), dtype=np.float)+0.5, num_cluster, fontsize=8)
288 | plt.grid('on')
289 |
290 | plt.savefig('{0}.accs.png'.format(arguments.fout), format='png',
291 | bbox_inches=None, pad_inches=0.1, dpi=100)
292 | # plt.show()
293 |
294 | print('\nDone.')
295 |
--------------------------------------------------------------------------------
/scRNA/nmf_clustering.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import scipy.stats as stats
4 | from sklearn import decomposition as decomp
5 |
6 | from scRNA.abstract_clustering import AbstractClustering
7 | from scRNA.utils import center_kernel, normalize_kernel, kta_align_binary, \
8 | get_matching_gene_inds, get_transferred_data_matrix, get_transferability_score
9 |
10 |
class NmfClustering(AbstractClustering):
    """NMF-based clustering of a (genes x cells) expression matrix.

    Factorizes the pre-processed matrix X ~ W.dot(H) with sklearn's NMF and
    assigns each cell to the latent factor with the largest coefficient.
    """
    num_cluster = -1
    dictionary = None     # W: (genes x k) dictionary matrix
    data_matrix = None    # H: (k x cells) coefficient matrix

    def __init__(self, data, gene_ids, num_cluster, labels):
        # `labels` is accepted for signature compatibility with
        # NmfClustering_initW but is not used by plain NMF clustering.
        super(NmfClustering, self).__init__(data, gene_ids=gene_ids)
        self.num_cluster = num_cluster

    def apply(self, k=-1, alpha=1.0, l1=0.75, max_iter=100, rel_err=1e-3):
        """Run filtering/transformation, then NMF; fills cluster_labels,
        dictionary (W) and data_matrix (H).

        k=-1 means "use the num_cluster given at construction time".
        Raises Exception if the factorization produced NaNs.
        """
        if k == -1:
            k = self.num_cluster
        X = self.pre_processing()

        nmf = decomp.NMF(alpha=alpha, init='nndsvdar', l1_ratio=l1, max_iter=max_iter,
                         n_components=k, random_state=0, shuffle=True, solver='cd',
                         tol=rel_err, verbose=0)

        W = nmf.fit_transform(X)
        H = nmf.components_
        # Each cell is assigned to the latent factor with maximum activation.
        self.cluster_labels = np.argmax(nmf.components_, axis=0)

        if np.any(np.isnan(H)):
            raise Exception('H contains NaNs (alpha={0}, k={1}, l1={2}, data={3}x{4}'.format(
                alpha, k, l1, X.shape[0], X.shape[1]))
        if np.any(np.isnan(W)):
            raise Exception('W contains NaNs (alpha={0}, k={1}, l1={2}, data={3}x{4}'.format(
                alpha, k, l1, X.shape[0], X.shape[1]))

        # self.print_reconstruction_error(X, W, H)
        self.dictionary = W
        self.data_matrix = H

    def print_reconstruction_error(self, X, W, H):
        """Print mean absolute and Frobenius reconstruction errors of X ~ W.H."""
        # Use the built-in float: np.float was deprecated in NumPy 1.20 and
        # removed in 1.24.
        print((' Elementwise absolute reconstruction error : ', np.sum(np.abs(X - W.dot(H))) / float(X.size)))
        print((' Fro-norm reconstruction error : ', np.sqrt(np.sum((X - W.dot(H))*(X - W.dot(H)))) / float(X.size)))
47 |
48 |
class NmfClustering_initW(AbstractClustering):
    """NMF clustering where W is seeded from a one-hot encoding of given
    labels and then optimized together with H.
    """
    num_cluster = -1    # number of clusters / NMF components k
    dictionary = None   # genes x k basis matrix
    data_matrix = None  # k x cells coefficient matrix

    def __init__(self, data, gene_ids, num_cluster, labels):
        """
        :param labels: per-cell labels used to build the one-hot init of W
        """
        super(NmfClustering_initW, self).__init__(data, gene_ids=gene_ids)
        self.num_cluster = num_cluster
        self.labels = labels

    def apply(self, k=-1, alpha=1.0, l1=0.75, max_iter=100, rel_err=1e-3):
        """Seed the factorization with a label-derived W, then run NMF.

        :raises Exception: if the factorization produced NaNs
        """
        if k == -1:
            k = self.num_cluster
        X = self.pre_processing()

        fixed_W = pd.get_dummies(self.labels)
        # Interpret W as H (transpose): sklearn can only fix H while
        # optimizing W, so we swap the roles of the two matrices.
        fixed_W_t = fixed_W.T
        # float instead of the np.float alias removed in NumPy >= 1.20
        learned_H_t, fixed_W_t_same, n_iter = decomp.non_negative_factorization(
            X.astype(float), n_components=k, init='custom', random_state=0,
            update_H=False, H=fixed_W_t.astype(float), alpha=alpha, l1_ratio=l1,
            max_iter=max_iter, shuffle=True, solver='cd', tol=rel_err, verbose=0)

        init_W = fixed_W_t_same.T
        init_H = learned_H_t.T

        nmf = decomp.NMF(alpha=alpha, init='custom', l1_ratio=l1, max_iter=max_iter,
                         n_components=k, random_state=0, shuffle=True, solver='cd',
                         tol=rel_err, verbose=0)
        W = nmf.fit_transform(X.T, W=init_W, H=init_H)
        H = nmf.components_
        self.cluster_labels = np.argmax(W, axis=1)

        if np.any(np.isnan(H)):
            raise Exception('H contains NaNs (alpha={0}, k={1}, l1={2}, data={3}x{4}'.format(
                alpha, k, l1, X.shape[0], X.shape[1]))
        if np.any(np.isnan(W)):
            raise Exception('W contains NaNs (alpha={0}, k={1}, l1={2}, data={3}x{4}'.format(
                alpha, k, l1, X.shape[0], X.shape[1]))

        # self.print_reconstruction_error(X, W, H)
        # W/H were learned on X.T, so transpose back into the genes x k /
        # k x cells convention used by the other clusterings
        self.dictionary = H.T
        self.data_matrix = W.T

    def print_reconstruction_error(self, X, W, H):
        # float() instead of the np.float alias (removed in NumPy >= 1.20)
        print((' Elementwise absolute reconstruction error : ', np.sum(np.abs(X - W.dot(H))) / float(X.size)))
        print((' Fro-norm reconstruction error : ', np.sqrt(np.sum((X - W.dot(H))*(X - W.dot(H)))) / float(X.size)))
91 |
class NmfClustering_fixW(AbstractClustering):
    """NMF clustering with W fixed to the one-hot encoding of given labels;
    only H is learned, then W is re-learned against the fixed H to obtain
    the final cluster labels.
    """
    num_cluster = -1
    dictionary = None
    data_matrix = None

    def __init__(self, data, gene_ids, num_cluster, labels):
        """
        :param labels: per-cell labels whose one-hot encoding becomes W
        """
        super(NmfClustering_fixW, self).__init__(data, gene_ids=gene_ids)
        self.num_cluster = num_cluster
        self.labels = labels

    def apply(self, k=-1, alpha=1.0, l1=0.75, max_iter=100, rel_err=1e-3):
        """Learn H for a fixed label-derived W, then re-learn W for a check.

        :raises Exception: if the factorization produced NaNs
        """
        if k == -1:
            k = self.num_cluster
        X_t = self.pre_processing()
        X = X_t.T

        fixed_W = pd.get_dummies(self.labels)
        # Interpret W as H (transpose): sklearn can only fix H while
        # optimizing W, so we swap the roles of the two matrices.
        fixed_W_t = fixed_W.T
        # float instead of the np.float alias removed in NumPy >= 1.20
        learned_H_t, fixed_W_t_same, n_iter = decomp.non_negative_factorization(
            X_t.astype(float), n_components=k, init='custom', random_state=0,
            update_H=False, H=fixed_W_t.astype(float), alpha=alpha, l1_ratio=l1,
            max_iter=max_iter, shuffle=True, solver='cd', tol=rel_err, verbose=0)

        assert(np.all(fixed_W_t == fixed_W_t_same))
        #self.cluster_labels = np.argmax(fixed_W_t_same.T, axis=1)

        # Now take the learned H, fix it and learn W to see how well it worked
        learned_W, learned_H_fix, n_iter = decomp.non_negative_factorization(
            X.astype(float), n_components=k, init='custom', random_state=0,
            update_H=False, H=learned_H_t.T, alpha=alpha, l1_ratio=l1,
            max_iter=max_iter, shuffle=True, solver='cd', tol=rel_err, verbose=0)

        assert(np.all(learned_H_t.T == learned_H_fix))
        self.cluster_labels = np.argmax(learned_W, axis=1)

        if np.any(np.isnan(learned_H_t)):
            raise Exception('H contains NaNs (alpha={0}, k={1}, l1={2}, data={3}x{4}'.format(
                alpha, k, l1, X.shape[0], X.shape[1]))
        if np.any(np.isnan(fixed_W_t)):
            raise Exception('W contains NaNs (alpha={0}, k={1}, l1={2}, data={3}x{4}'.format(
                alpha, k, l1, X.shape[0], X.shape[1]))

        #self.print_reconstruction_error(X, fixed_W_t, learned_H_t)
        self.dictionary = learned_H_t
        self.data_matrix = fixed_W_t
132 |
class DaNmfClustering(NmfClustering):
    """Domain-adaptation NMF: transfer the dictionary W learned on a source
    dataset to a target dataset, then cluster a convex mix of the original
    and the reconstructed target data.
    """
    reject = None                       # list of (name, per-cell score) tuples
    transferability_score = 0.0
    transferability_percs = None
    transferability_rand_scores = None
    transferability_pvalue = 1.0
    src = None                          # source NmfClustering (pre-processed)
    intermediate_model = None           # (W, H, H2) from the transfer step
    mixed_data = None

    def __init__(self, src, trg_data, trg_gene_ids, num_cluster):
        """
        :param src: fitted source NmfClustering (must have a dictionary)
        :param trg_data: genes x cells target expression matrix
        """
        super(DaNmfClustering, self).__init__(trg_data, gene_ids=trg_gene_ids, num_cluster=num_cluster, labels=[])
        self.src = src

    def get_mixed_data(self, mix=0.0, reject_ratio=0., use_H2=True, calc_transferability=False, max_iter=100, rel_err=1e-3):
        """Match genes between source and target, transfer the source
        dictionary and mix reconstruction with the original target data.

        :param mix: 0.0 = pure target data .. 1.0 = pure reconstruction
        :param reject_ratio: fraction of cells (highest neg-entropy first)
            whose original data is kept in the reconstruction; must be < 1
        :return: (mixed_data, new_trg_data, trg_data)
        """
        trg_data = self.pre_processing()
        trg_gene_ids = self.gene_ids[self.remain_gene_inds]
        src_gene_ids = self.src.gene_ids[self.src.remain_gene_inds].copy()
        inds1, inds2 = get_matching_gene_inds(src_gene_ids, trg_gene_ids)

        src_gene_ids = src_gene_ids[inds2]
        self.gene_ids = trg_gene_ids[inds1]
        trg_data = trg_data[inds1, :]

        # sorted, filtered gene ids for src/trg must coincide
        for i in range(inds1.size):
            assert(src_gene_ids[i] == self.gene_ids[i])

        assert(self.src.dictionary is not None)  # source data should always be pre-processed
        W, H, H2, new_err = get_transferred_data_matrix(self.src.dictionary[inds2, :], trg_data, max_iter=max_iter, rel_err=rel_err)
        self.cluster_labels = np.argmax(H, axis=0)
        #self.print_reconstruction_error(trg_data, W, H2)
        self.intermediate_model = (W, H, H2)
        self.reject = self.calc_rejection(trg_data, W, H, H2)

        if calc_transferability:
            self.transferability_score, self.transferability_rand_scores, self.transferability_pvalue = \
                get_transferability_score(W, H, trg_data, max_iter=max_iter)
            self.transferability_percs = np.percentile(self.transferability_rand_scores, [25, 50, 75, 100])
            self.reject.append(('Transfer_Percentiles', self.transferability_percs))
            self.reject.append(('Transferability', self.transferability_score))
            self.reject.append(('Transferability p-value', self.transferability_pvalue))

        if use_H2:
            new_trg_data = W.dot(H2)
        else:
            new_trg_data = W.dot(H)

        # reject option enabled?
        assert(reject_ratio < 1.)  # rejection of 100% (or more) does not make any sense

        if reject_ratio > 0.:
            name, neg_entropy = self.reject[2]
            inds = np.argsort(-neg_entropy)  # descending by neg_entropy
            # BUGFIX: the slice index must be an integer; the previous float
            # value raised a TypeError on Python 3 / modern NumPy
            keep = int(float(inds.size) * reject_ratio)
            inds = inds[:keep]
            new_trg_data[:, inds] = trg_data[:, inds]

        mixed_data = mix*new_trg_data + (1.-mix)*trg_data
        if np.any(trg_data < 0.0):
            print('Error! Negative values in target data!')
        if np.any(mixed_data < 0.0):
            print('Error! Negative values in reconstructed data!')
        return mixed_data, new_trg_data, trg_data

    def apply(self, k=-1, mix=0.0, reject_ratio=0., alpha=1.0, l1=0.75, max_iter=100, rel_err=1e-3, calc_transferability=False):
        """Cluster the mixed target data with a fresh NMF run."""
        if k == -1:
            k = self.num_cluster
        mixed_data, new_trg_data, trg_data = self.get_mixed_data(mix=mix,
                                                                 reject_ratio=reject_ratio,
                                                                 max_iter=max_iter,
                                                                 rel_err=rel_err,
                                                                 calc_transferability=calc_transferability)
        nmf = decomp.NMF(alpha=alpha, init='nndsvdar', l1_ratio=l1, max_iter=max_iter,
                         n_components=k, random_state=0, shuffle=True, solver='cd', tol=1e-6, verbose=0)
        W = nmf.fit_transform(mixed_data)
        H = nmf.components_
        self.dictionary = W
        self.data_matrix = H
        self.cluster_labels = np.argmax(nmf.components_, axis=0)
        self.mixed_data = mixed_data
        # print('Labels used: {0} of {1}.'.format(np.unique(self.cluster_labels).size, k))

    def calc_rejection(self, trg_data, W, H, H2):
        """Build a list of (name, per-cell score) rejection criteria;
        higher scores mean the transfer explains the cell better.
        """
        diffs = np.zeros(H2.shape[1])
        for c in range(self.src.num_cluster):
            inds = np.where(self.cluster_labels == c)[0]
            if inds.size > 0:
                min_h2 = np.min(H[:, inds])
                max_h2 = np.max(H[:, inds])
                # BUGFIX: parenthesize the numerator; previously the division
                # bound tighter than the subtraction, so the intended min-max
                # normalization was never applied
                foo = (H[:, inds] - min_h2) / (max_h2 - min_h2)
                foo = np.max(foo, axis=0) - np.min(foo, axis=0)
                diffs[inds] = foo

        # weight cells by (inverted, rescaled) total expression
        sum_expr = np.sum(trg_data, axis=0)
        sum_expr -= np.min(sum_expr)
        sum_expr /= np.max(sum_expr)
        sum_expr = sum_expr + 1.0
        sum_expr /= np.max(sum_expr)
        weight = 1. - sum_expr

        reconstr_err = np.sum(np.abs(trg_data - W.dot(H2)), axis=0)
        reconstr_err -= np.min(reconstr_err)
        reconstr_err /= np.max(reconstr_err)

        final_values = weight * reconstr_err  #* neg_entropy

        reject = list()
        reject.append(('Reconstr. Error', -final_values))

        neg_entropy = stats.entropy(H)
        neg_entropy -= np.min(neg_entropy)
        neg_entropy /= np.max(neg_entropy)

        reject.append(('Kurtosis', stats.kurtosis(H, fisher=False, axis=0)))
        reject.append(('Entropy', -neg_entropy))
        reject.append(('Diffs', diffs))
        reject.append(('Dist L2 H', -np.sum((np.abs(trg_data - W.dot(H))**2.), axis=0)))
        reject.append(('Dist L2 H2', -np.sum((np.abs(trg_data - W.dot(H2))**2.), axis=0)))
        reject.append(('Dist L1 H', -np.sum(np.abs(trg_data - W.dot(H)), axis=0)))
        reject.append(('Dist L1 H2', -np.sum(np.abs(trg_data - W.dot(H2)), axis=0)))
        return reject

    def reject_classifier(self, K, kurts):
        """Find the binary split of cells (sorted by ascending kurtosis)
        that maximizes kernel-target alignment with K.

        :param K: numpy.array kernel matrix
        :param kurts: numpy.array per-cell kurtosis scores
        :return: numpy.array of {+1, -1} labels (-1 = reject)
        """
        sinds = np.argsort(kurts)
        K = center_kernel(K)
        K = normalize_kernel(K)
        max_kta = -1.0
        max_kta_ind = -1
        for i in range(K.shape[1]-2):
            # 1. build binary label matrix
            # int instead of the np.int alias removed in NumPy >= 1.20
            labels = np.ones(kurts.size, dtype=int)
            labels[sinds[:i+1]] = -1
            kta = kta_align_binary(K, labels)
            if kta > max_kta:
                max_kta = kta
                max_kta_ind = i+1
        labels = np.ones(kurts.size, dtype=int)
        labels[sinds[:max_kta_ind]] = -1
        return labels
296 |
--------------------------------------------------------------------------------
/scRNA/sc3_clustering.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from .abstract_clustering import AbstractClustering
4 | import pdb
5 |
6 |
class SC3Clustering(AbstractClustering):
    """ Meta-class for single-cell clustering based on the SC3 pipeline.
    Nico Goernitz, TU Berlin, 2016

    Pluggable stages: distance calculations -> dimension reductions ->
    intermediate clusterings -> consensus matrix -> consensus clustering.
    """
    dists_list = None                   # list of distance functions
    dimred_list = None                  # list of dim-reduction functions
    intermediate_clustering_list = None # list of clustering functions
    build_consensus_matrix = None       # labels -> consensus matrix function
    consensus_clustering = None         # consensus matrix -> (labels, dists)

    dists = None
    pc_range = None                     # [min, max] eigenvector counts
    sub_sample = None
    consensus_mode = None               # 0: per-eigen-count, 1: joint

    def __init__(self, data, gene_ids=None,
                 pc_range=[4, 10], sub_sample=True, consensus_mode=0):
        # NOTE: the mutable default pc_range is shared across calls; it is
        # only ever read, never mutated, so this is safe as-is.
        super(SC3Clustering, self).__init__(data, gene_ids=gene_ids)
        # init lists
        self.dists_list = list()
        self.dimred_list = list()
        self.intermediate_clustering_list = list()
        self.consensus_clustering = lambda X: np.zeros(X.shape[0])
        self.pc_range = pc_range
        self.sub_sample = sub_sample
        self.consensus_mode = consensus_mode

    def set_consensus_clustering(self, consensus_clustering):
        self.consensus_clustering = consensus_clustering

    def set_build_consensus_matrix(self, build_consensus_matrix):
        self.build_consensus_matrix = build_consensus_matrix

    def add_distance_calculation(self, dist_calculation):
        # BUGFIX: list(callable) raises a TypeError; wrap the single
        # callable in a list instead (same fix in the two adders below)
        if self.dists_list is None:
            self.dists_list = [dist_calculation]
        else:
            self.dists_list.append(dist_calculation)

    def add_dimred_calculation(self, dimred_computation):
        if self.dimred_list is None:
            self.dimred_list = [dimred_computation]
        else:
            self.dimred_list.append(dimred_computation)

    def add_intermediate_clustering(self, intermediate_clustering):
        if self.intermediate_clustering_list is None:
            self.intermediate_clustering_list = [intermediate_clustering]
        else:
            self.intermediate_clustering_list.append(intermediate_clustering)

    def apply(self):
        """Run the full SC3 pipeline and store labels in self.cluster_labels."""
        # check range
        assert self.pc_range[0] > 0
        assert self.pc_range[1] < self.num_cells

        X = self.pre_processing()

        # 4. distance calculations
        dists = list()
        for d in self.dists_list:
            dists.append(d(X, self.gene_ids[self.remain_gene_inds]))

        # 5. transformations (dimension reduction); every distance matrix is
        # pushed through every registered transformation
        transf = list()
        for d in dists:
            for t in self.dimred_list:
                dres, deigv = t(d)
                transf.append((dres, deigv))

        # 6. intermediate clustering and consensus matrix generation
        range_inds = list(range(self.pc_range[0], self.pc_range[1]+1))
        if self.sub_sample and len(range_inds) > 15:
            # subsample 15 inds from this range
            range_inds = np.random.permutation(range_inds)[:15]

        cnt = 0.
        consensus2 = np.zeros((self.remain_cell_inds.size, self.remain_cell_inds.size))
        for cluster in self.intermediate_clustering_list:
            for t in range(len(transf)):
                _, deigv = transf[t]
                labels = list()
                for d in range_inds:
                    labels.append(cluster(deigv[:, 0:d].reshape((deigv.shape[0], d))))
                    # mode 0: accumulate a consensus matrix per eigen-count
                    if self.consensus_mode == 0:
                        consensus2 += self.build_consensus_matrix(np.array(labels[-1]))
                        cnt += 1.
                # mode 1: one consensus matrix over all eigen-counts at once
                if self.consensus_mode == 1:
                    consensus2 += self.build_consensus_matrix(np.array(labels))
                    cnt += 1.
        consensus2 /= cnt

        # 7. consensus clustering
        self.cluster_labels, self.dists = self.consensus_clustering(consensus2)
110 |
111 |
--------------------------------------------------------------------------------
/scRNA/sc3_clustering_impl.py:
--------------------------------------------------------------------------------
1 | import scipy.cluster.hierarchy as spc
2 | import scipy.spatial.distance as dist
3 | import scipy.stats as stats
4 | import scipy.linalg as sl
5 | import sklearn.cluster as cluster
6 |
7 | from .utils import *
8 |
# These are the SC3 labels for Ting with 7 clusters, PCA, Euclidean distances
# (1-based cell indices per cluster; 187 cells in total)
SC3_Ting7_results = [
    [1,2,3,4,5,6,7,12,35,38,39,40,43,44,45,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,71,72,73,74,75,76,77,78,79,80,81,82,83,91,122,123,124,125,126,127,128,129,132,133,134,137,138,139,141,160,172,176,177,178,179,180,181,182,183,184,185,186,187],
    [31,32,33,34,36,37,41,42,70,84,85,86,87,88,89,90,92,93,94,95,96,97,98,99,100,101,102,103,104,105],
    [19,20,21,22,23,24,25,26,27,28,29,30],
    [46,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121],
    [130,131,135,140,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,161,162,163,164,165,166,167,168,169,171,173,174,175],
    [8,9,10,11,13,14,15,16,17,18],
    [136,170]]


def get_sc3_Ting_labels():
    """
    :return: array {0,..,6}^187 (results from SC3 Ting dataset)
    """
    sc3_labels = np.zeros(187)
    for lbl in range(len(SC3_Ting7_results)):
        # int instead of the np.int alias removed in NumPy >= 1.20;
        # shift the 1-based cell indices to 0-based
        inds = np.array(SC3_Ting7_results[lbl], dtype=int)-1
        sc3_labels[inds] = lbl
    return sc3_labels
30 |
31 |
def cell_filter(data, num_expr_genes=2000, non_zero_threshold=2):
    """SC3-style cell filter.

    NaNs in `data` are zeroed IN PLACE before counting. A cell (column) is
    kept iff at least `num_expr_genes` of its transcripts are expressed at
    or above `non_zero_threshold`.

    :param data: transcripts x cells data matrix (modified in place: NaN -> 0)
    :return: indices of valid cells
    """
    nan_rows, nan_cols = np.where(np.isnan(data))
    data[nan_rows, nan_cols] = 0
    expressed_per_cell = np.sum(data >= non_zero_threshold, axis=0)
    valid = np.isfinite(expressed_per_cell) & (expressed_per_cell >= num_expr_genes)
    return np.where(valid)[0]
43 |
44 |
def gene_filter(data, perc_consensus_genes=0.94, non_zero_threshold=2):
    """SC3-style gene filter.

    NaNs in `data` are zeroed in place. A transcript (row) is kept iff it is
    expressed (>= non_zero_threshold) in at least (1 - perc_consensus_genes)
    of the cells AND non-zero in at most perc_consensus_genes of the cells
    (drops rare and ubiquitous genes).

    :param data: transcripts x cells data matrix (modified in place: NaN -> 0)
    :return: indices of valid transcripts
    """
    ai, bi = np.where(np.isnan(data))
    data[ai, bi] = 0
    num_transcripts, num_cells = data.shape
    res_l = np.sum(data >= non_zero_threshold, axis=1)
    res_h = np.sum(data > 0, axis=1)
    # float() instead of the np.float alias removed in NumPy >= 1.20
    lower_bound = float(num_cells)*(1.-perc_consensus_genes)
    upper_bound = float(num_cells)*perc_consensus_genes
    return np.where((res_l >= lower_bound) & (res_h <= upper_bound))[0]
59 |
60 |
def data_transformation_log2(data):
    """Log-transform counts with a pseudo-count of one.

    :param data: transcripts x cells data matrix
    :return: log2(data + 1) transformed data
    """
    return np.log2(1. + data)
68 |
69 |
def no_data_transformation(data):
    """Identity transformation (no-op counterpart of data_transformation_log2).

    :param data: transcripts x cells data matrix
    :return: the input, unchanged
    """
    return data
77 |
78 |
def da_nmf_distances(data, gene_ids, da_model, reject_ratio=0., metric='euclidean', mixture=0.5, use_H2=True):
    """Convex combination of the vanilla distance matrix of `data` and the
    distance matrix of the NMF reconstruction W.dot(H2) (or W.dot(H)).

    mixture=0.0 returns pure vanilla distances; mixture=1.0 pure
    reconstruction-based distances. dist2 is rescaled to the range of dist1
    before mixing (unless dist2 is numerically zero).
    """
    if mixture == 0.0:
        return distances(data, [], metric=metric)

    W, H, H2 = da_model

    dist1 = distances(data, [], metric=metric)
    if use_H2:
        reconstruction = W.dot(H2)
    else:
        print('Using H instead of H2 for reconstruction.')
        reconstruction = W.dot(H)
    dist2 = distances(reconstruction, [], metric=metric)

    # normalize distance
    if np.max(dist2) < 1e-12:
        if mixture == 1.0:
            raise Exception('Distances are all close to zero and mixture=1.0. '
                            'Seems that source and target data do not go well together.')
        print('Warning! Max distance is close to zero.')
    else:
        dist2 *= np.max(dist1) / np.max(dist2)
    return mixture*dist2 + (1.-mixture)*dist1
104 |
105 |
def distances(data, gene_ids, metric='euclidean'):
    """Pairwise distances between cells.

    :param data: transcripts x cells data matrix
    :param gene_ids: #transcripts vector with corresponding gene(transcript) ids (unused)
    :param metric: 'pearson', 'spearman', or any scipy pdist metric
        (e.g. 'euclidean', 'cityblock', 'chebyshev')
    :return: cells x cells distance matrix
    """
    if metric == 'pearson':
        # correlation distance between cells (columns)
        return 1. - np.corrcoef(data.T)
    if metric == 'spearman':
        rho, _ = stats.spearmanr(data, axis=0)
        return 1. - rho
    # everything else is delegated to scipy's pdist
    return dist.squareform(dist.pdist(data.T, metric=metric))
133 |
134 |
def transformations(dm, components=5, method='pca'):
    """
    :param dm: cells x cells distance matrix
    :param components: number of eigenvector/eigenvalues to use
    :param method: either 'pca' or 'spectral'
    :return: cells x cells (centered!) distance matrix, cells x components Eigenvectors
    """
    # print('SC3 {1} transformation (components={0}).'.format(components, method.upper()))
    if method == 'spectral':
        num_cells = dm.shape[0]
        # affinity from distances via a Gaussian-like kernel
        A = np.exp(-dm/np.max(dm))
        # NOTE(review): the degree D is computed from the distance matrix dm,
        # not from the affinity A, and L = D - A broadcasts a degree VECTOR
        # against the matrix A rather than subtracting diag(D) -- confirm this
        # matches the intended symmetric normalized Laplacian below.
        D = np.sum(dm, axis=1)
        L = D - A
        D1 = np.diag(D.__pow__(-0.5))
        D1[np.isinf(D)] = 0.0
        dm = D1.dot(L.dot(D1))
        # Laplacian:
        #     L := D - A
        # symmetric normalized laplacian:
        #     L_sym := D^-0.5 L D^-0.5
        inds = list(range(components))
    else:
        # column-wise scaling and normalizing
        num_cells = dm.shape[0]
        # J = np.eye(num_cells) - 1./np.float(num_cells)*np.ones((num_cells, num_cells))
        # dm = 0.5*J.dot(dm.dot(J))
        # center each column by its mean
        dm = dm - np.repeat(np.mean(dm, axis=0).reshape((1, num_cells)), num_cells, axis=0)
        #dm = dm / np.repeat(np.nanstd(dm, axis=0).reshape((1, num_cells)), num_cells, axis=0)
        # mask +/- infinities as NaN so nanstd ignores them
        np.place(dm, np.isinf(dm), np.nan)
        np.place(dm, np.isneginf(dm), np.nan)
        # print "Number of infs: ", sum(sum(np.isinf(dm)))+sum(sum(np.isneginf(dm)))
        # print "Number of nans: ", sum(sum(np.isnan(dm)))
        # print "std: ", np.nanstd(dm, axis=0)
        #print "Percentage of zero-entries in std", 100*sum(np.nanstd(dm, axis=0) == 0)/len(np.nanstd(dm, axis=0)), "%"

        # scale columns to unit std unless every column is constant
        # (avoids a full matrix of divisions by zero)
        if sum(np.nanstd(dm, axis=0) == 0) == len(np.nanstd(dm, axis=0)):
            print("All values are Zero!")
        else:
            dm = dm / np.repeat(np.nanstd(dm, axis=0).reshape((1, num_cells)), num_cells, axis=0)

    # vals: the eigenvalues in ascending order, each repeated according to its multiplicity.
    # vecs: the column v[:, i] is the normalized eigenvector corresponding to the eigenvalue w[i]
    # vals, vecs = np.linalg.eigh(dm)
    # SVD instead of eigh; singular values come back in DESCENDING order
    _, vals, ev = sl.svd(dm)
    vecs = ev.T
    # rescale singular values like sample std (divide by sqrt(n-1))
    vals /= np.sqrt(np.max((1, num_cells - 1)))

    if method == 'pca':
        # This part is done to imitate sc3 behavior which only sorts absolute Eigenvalues
        # making the highest Eigenvalue first followed by the smallest (ascending) Eigenvalues
        # x = np.sqrt(vals*vals)
        # inds = np.argsort(-x)  # argsort is ascending order
        # inds = np.argsort(vals)  # argsort is ascending order
        # inds = inds[:components]
        # take the first `components` singular vectors as returned by svd
        inds = np.arange(components)
        # print inds

        ## inds = range(vals.size-components, vals.size)
        ## inds = range(components)
        ## print inds
        ## print vals.size, vals
        #plt.bar(np.arange(len(vals)), np.sort(vals)[::-1])
        ##print '# of zero eigenvalues: ', sum(vals==0)
        #plt.xlim(0,int(len(vals)/3)) # eigenvalues between 0 and 2
        #plt.axvline(x=np.true_divide(len(vals), 100)*4, color='r')
        #plt.axvline(x=np.true_divide(len(vals), 100)*7, color='r')

    # low-rank reconstruction of dm from the selected components plus the
    # selected (right) singular vectors themselves
    D = np.diag(vals[inds])
    return vecs[:, inds].dot(D.dot(vecs[:, inds].T)), vecs[:, inds]
204 |
205 |
def intermediate_kmeans_clustering(X, k=5, n_init=10, max_iter=10000, init='k-means++'):
    """
    :param X: cells x d vector
    :param k: number of clusters
    :param n_init: number of re-starts for k-means
    :param max_iter: maximum number of iterations per run
    :param init: initialization strategy for k-means (either 'k-means++' or 'random')
    :return: cells x 1 labels
    """
    # NOTE(review): precompute_distances and n_jobs were removed from
    # sklearn.cluster.KMeans in scikit-learn >= 1.0 -- confirm the pinned
    # sklearn version before upgrading.
    model = cluster.KMeans(n_clusters=k, precompute_distances=True,
                           n_init=n_init, max_iter=max_iter, init=init, n_jobs=1)
    predicted = model.fit_predict(X)
    assert predicted.size == X.shape[0]
    return predicted
220 |
221 |
def build_consensus_matrix(X):
    """
    :param X: n x cells label matrix (a single 1d label vector is promoted
        to one row)
    :return: cells x cells consensus matrix with entries in [0, 1]; entry
        (i, j) is the fraction of the n clusterings that assign cells i and
        j to the same cluster
    """
    if len(X.shape) == 1:
        X = X[np.newaxis, :]
    n, cells = X.shape
    # float/int instead of np.float/np.int aliases removed in NumPy >= 1.20
    consensus = np.zeros((cells, cells), dtype=float)
    for i in range(n):
        t = dist.squareform(dist.pdist(X[i, :].reshape(cells, 1)))
        t = np.array(t, dtype=int)
        # map pairwise label distances to a same-cluster indicator:
        # distance 0 -> 1 (same cluster), anything else -> 0
        t[t != 0] = -1
        t[t == 0] = +1
        t[t == -1] = 0
        consensus += np.array(t, dtype=float)
    consensus /= float(n)
    return consensus
242 |
243 |
def consensus_clustering(consensus, n_components=5):
    """
    :param consensus: cells x cells consensus matrix
    :param n_components: number of clusters
    :return: (cells x 1 labels, cells x cells distance matrix)
    """
    # condensed pairwise distances between the rows of the consensus matrix
    cdm = dist.pdist(consensus)
    # complete-linkage agglomeration followed by a tree cut (mirrors SC3)
    linkage = spc.complete(cdm)
    labels = spc.cut_tree(linkage, n_clusters=n_components).reshape(consensus.shape[0])
    # Below is the hclust code for the older version, fyi
    # hclust = spc.linkage(cdm)
    # labels = spc.fcluster(hclust, n_components, criterion='maxclust')
    return labels, dist.squareform(cdm)
--------------------------------------------------------------------------------
/scRNA/utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 | import sklearn.metrics as metrics
4 | import sklearn.decomposition as decomp
5 | import pdb
6 | import matplotlib.pyplot as plt
7 |
8 | from . import sc3_clustering_impl as sc
9 |
10 |
def load_dataset_tsv(fname, fgenes=None, flabels=None):
    """Load a genes x cells TSV matrix plus optional gene ids and labels.

    :param fname: path to the tab-separated data matrix
    :param fgenes: optional path to a per-gene id file; generated ids
        ('0', '1', ...) are used if absent
    :param flabels: optional path to per-cell labels (read as strings)
    :return: (data, gene_ids, labels, labels_2_ids) where labels holds the
        integer class per cell and labels_2_ids maps class -> original label
        string; both are None if flabels is not given
    :raises Exception: if fname does not exist
    """
    # check data filename
    if not os.path.exists(fname):
        raise Exception('File \'{0}\' not found.'.format(fname))

    data = np.loadtxt(fname, delimiter='\t')

    # str/int instead of the np.str/np.int aliases removed in NumPy >= 1.20
    gene_ids = np.arange(0, data.shape[0]).astype(str)
    # Some scripts expect the gene ids (esp. for multitask learning of two or
    # more datasets). If not specified, inform the user.
    if fgenes is None:
        print('Warning! Gene identifier file is not specified. Gene ids are now generated.')
    else:
        gene_ids = np.loadtxt(fgenes, delimiter='\t', dtype=str)
        if not np.unique(gene_ids).shape[0] == gene_ids.shape[0]:
            print(('Warning! Gene ids are supposed to be unique. '
                   'Only {0} of {1} entries are unique.'.format(np.unique(gene_ids).shape[0], gene_ids.shape[0])))

    labels = None
    labels_2_ids = None
    if flabels is not None:
        # labels are handled as string values even though they are numerical
        label_ids = np.loadtxt(flabels, delimiter='\t', dtype=str)
        assert label_ids.size == data.shape[1]

        labels_2_ids = np.unique(label_ids)
        unique_ind = np.arange(start=0, stop=labels_2_ids.shape[0])
        labels = np.zeros((data.shape[1]), dtype=int)
        for i in range(unique_ind.size):
            inds = np.where(label_ids == labels_2_ids[i])[0]
            labels[inds] = unique_ind[i]

    return data, gene_ids, labels, labels_2_ids
51 |
52 |
def load_dataset(fname):
    """Load an .npz dataset containing 'data' and 'transcripts' entries and
    an optional 'labels' entry.

    :param fname: path to the .npz archive
    :return: (data, gene_ids, labels); labels is None when absent
    :raises Exception: if fname does not exist
    """
    if not os.path.exists(fname):
        raise Exception('File \'{0}\' not found.'.format(fname))
    archive = np.load(fname)
    labels = archive['labels'] if 'labels' in archive else None
    return archive['data'], archive['transcripts'], labels
64 |
65 |
def normalize_kernel(K):
    """Normalize kernel K so that K_ii = 1 for all i.

    Falls back to identity scaling (and prints a warning) when the diagonal
    contains NaN/inf or (near-)zero entries.
    """
    N = K.shape[0]
    a = np.sqrt(np.diag(K)).reshape((N, 1))
    bad_diag = np.isnan(a) | np.isinf(a) | (np.abs(a) <= 1e-16)
    if np.any(bad_diag):
        print('Numerical instabilities.')
        C = np.eye(N)
    else:
        inv = 1. / a
        C = inv.dot(inv.T)
    return K * C
77 |
78 |
def center_kernel(K):
    """Center kernel K in feature space (mean-free features):
    K - 1K - K1 + 1K1 with 1 = ones(N, N)/N.
    """
    N = K.shape[0]
    # float() instead of the np.float alias removed in NumPy >= 1.20
    a = np.ones((N, N)) / float(N)
    return K - a.dot(K) - K.dot(a) + a.dot(K.dot(a))
84 |
85 |
def kta_align_general(K1, K2):
    """
    Computes the (empirical) alignment of two kernels K1 and K2
    Definition 1: (Empirical) Alignment
      a = <K1, K2>_Frob
      b = sqrt( <K1, K1> <K2, K2> )
      kta = a / b
    with <A, B>_Frob = sum_ij A_ij B_ij = tr(AB')
    """
    numerator = K1.dot(K2.T).trace()
    denominator = np.sqrt(K1.dot(K1.T).trace() * K2.dot(K2.T).trace())
    return numerator / denominator
96 |
97 |
def kta_align_binary(K, y):
    """Empirical alignment of kernel K with a binary label vector
    y in {+1, -1}^m, using the ideal label kernel YY = y y^T.
    """
    # int() instead of the np.int alias removed in NumPy >= 1.20
    m = int(y.size)
    YY = y.reshape((m, 1)).dot(y.reshape((1, m)))
    return K.dot(YY).trace() / (m * np.sqrt(K.dot(K.T).trace()))
104 |
105 |
def get_kernel(X, Y, type='linear', param=1.0):
    """Calculates a kernel given the data X and Y (dims x exms)

    :param type: 'linear' or 'rbf'
    :param param: RBF bandwidth (sigma^2); unused for the linear kernel
    """
    (Xdims, Xn) = X.shape
    (Ydims, Yn) = Y.shape

    kernel = 1.0
    if type == 'linear':
        kernel = X.T.dot(Y)

    if type == 'rbf':
        # squared euclidean distances via ||x||^2 - 2 x'y + ||y||^2
        Dx = (np.ones((Yn, 1)) * np.diag(X.T.dot(X)).reshape(1, Xn)).T
        Dy = np.ones((Xn, 1)) * np.diag(Y.T.dot(Y)).reshape(1, Yn)
        sq_dists = Dx - 2. * np.array(X.T.dot(Y)) + Dy
        kernel = np.exp(-sq_dists / param)
    return kernel
124 |
125 |
def unsupervised_acc_silhouette(X, labels, metric='euclidean'):
    """Silhouette score of a labeling based on pairwise cell distances.

    Returns 0.0 when there is only a single cluster or when the distance
    matrix contains NaN/Inf entries (silhouette is undefined in both cases).
    """
    dists = sc.distances(X, gene_ids=np.arange(X.shape[1]), metric=metric)
    if np.unique(labels).size <= 1:
        return 0.0
    if np.any(np.isnan(dists)) or np.any(np.isinf(dists)):
        return 0.0
    return metrics.silhouette_score(dists, labels, metric='precomputed')
132 |
133 |
def unsupervised_acc_kta(X, labels, kernel='linear', param=1.0, center=True, normalize=True):
    """Kernel-target alignment between a data kernel and a label kernel.

    Builds a one-hot indicator matrix from ``labels``, forms the data and
    label kernels (linear or RBF on the data side), optionally centers and
    normalizes both, and returns their empirical alignment.
    """
    # one-hot label indicator, one row per sample
    indicator = np.zeros((labels.size, np.max(labels) + 1))
    indicator[np.arange(labels.size), labels] = 1.

    if kernel == 'rbf':
        Kx = get_kernel(X, X, type='rbf', param=param)
        Ky = get_kernel(indicator.T, indicator.T, type='linear', param=param)
    else:
        Kx = X.T.dot(X)
        Ky = indicator.dot(indicator.T)

    if center:
        Kx = center_kernel(Kx)
        Ky = center_kernel(Ky)
    if normalize:
        Kx = normalize_kernel(Kx)
        Ky = normalize_kernel(Ky)
    return kta_align_general(Kx, Ky)
153 |
154 |
def get_transferability_score(W, H, trg_data, reps=100, alpha=0.0, l1=0.75, max_iter=100, rel_err=1e-3):
    """Estimate how well the dictionary W transfers to the target data.

    Compares the reconstruction error of the given factorization (W, H)
    against (a) the best error achievable by fitting an NMF directly on
    ``trg_data`` and (b) a null distribution obtained by randomly permuting
    the gene order of W ``reps`` times.

    Args:
        W: (genes x k) source dictionary matrix.
        H: (k x cells) coefficient matrix for the target data.
        trg_data: (genes x cells) target data matrix.
        reps: number of random gene permutations for the null distribution.
        alpha, l1, max_iter, rel_err: NMF / update-loop parameters.

    Returns:
        Tuple (score, percs, p_value): transferability score in [0, 1]
        (1 = as good as a direct fit), per-repetition normalized null
        scores, and the fraction of permuted runs with error below the
        unpermuted transfer error.
    """
    # estimate the error distribution under random gene permutations of W
    errs = np.zeros((reps,))
    for i in range(errs.size):
        rand_gene_inds = np.random.permutation(W.shape[0])
        _, _, _, errs[i] = get_transferred_data_matrix(W[rand_gene_inds, :], trg_data, max_iter=max_iter, rel_err=rel_err)

    # reconstruction error of the unpermuted W (minimum transfer error)
    _, _, _, err_nonpermuted = get_transferred_data_matrix(W, trg_data, max_iter=max_iter, rel_err=rel_err)

    # best achievable error: NMF fitted directly on the target data
    nmf = decomp.NMF(alpha=alpha, init='nndsvdar', l1_ratio=l1, max_iter=max_iter,
                     n_components=W.shape[1], random_state=0, shuffle=True, solver='cd', tol=0.00001, verbose=0)
    W_best = nmf.fit_transform(trg_data)
    H_best = nmf.components_

    # np.float was removed in NumPy 1.24; the builtin float is equivalent.
    err_best = np.sum(np.abs(trg_data - W_best.dot(H_best))) / float(trg_data.size)  # absolute
    err_curr = np.sum(np.abs(trg_data - W.dot(H))) / float(trg_data.size)  # absolute
    err_worst = np.max(errs)

    # clip the null errors from below so percs stays within [0, 1]
    errs[errs < err_best] = err_best
    percs = 1.0 - (errs - err_best) / (err_worst - err_best)
    score = 1.0 - np.max([err_curr - err_best, 0]) / (err_worst - err_best)

    p_value = sum(errs < err_nonpermuted) / reps
    return score, percs, p_value
185 |
186 |
def get_transferred_data_matrix(W, trg_data, normalize_H2=False, max_iter=100, rel_err=1e-3):
    """Compute target coefficient matrices for a fixed dictionary W.

    Keeps W fixed and estimates H via NMF-style multiplicative updates so
    that trg_data ~ W.dot(H). Additionally builds a hard-assignment matrix
    H2 with a single 1 per column (at the maximal entry of H).

    Args:
        W: (genes x k) non-negative dictionary matrix.
        trg_data: (genes x cells) non-negative target data matrix.
        normalize_H2: if True, also refine H2 with multiplicative updates.
        max_iter: maximum number of update iterations.
        rel_err: relative-error threshold for convergence.

    Returns:
        Tuple (W, H, H2, err) where err is the final absolute per-entry
        reconstruction error.

    Raises:
        Exception: if a multiplicative update would divide by zero.
    """
    # initialize H: random positive matrix, bounded away from zero
    H = np.random.randn(W.shape[1], trg_data.shape[1])
    a1, a2 = np.where(H < 0.)
    H[a1, a2] *= -1.
    a1, a2 = np.where(H < 1e-10)
    H[a1, a2] = 1e-10

    n_iter = 0
    err = 1e10
    new_err = err  # defined even if max_iter <= 0
    while n_iter < max_iter:
        n_iter += 1
        if np.any(W.T.dot(W.dot(H)) == 0.):
            raise Exception('DA target: division by zero.')
        # standard multiplicative NMF update for H with W held fixed
        H *= W.T.dot(trg_data) / W.T.dot(W.dot(H))
        # np.float was removed in NumPy 1.24; the builtin float is equivalent.
        new_err = np.sum(np.abs(trg_data - W.dot(H))) / float(trg_data.size)  # absolute
        if np.abs((err - new_err) / err) <= rel_err and err >= new_err:
            break
        err = new_err

    # hard assignment: one-hot column-wise argmax of H
    H2 = np.zeros((W.shape[1], trg_data.shape[1]))
    H2[(np.argmax(H, axis=0), np.arange(trg_data.shape[1]))] = 1

    # optional refinement of the sparse assignment matrix
    if normalize_H2:
        n_iter = 0
        err = 1e10
        while n_iter < max_iter:
            n_iter += 1
            H2 *= W.T.dot(trg_data) / W.T.dot(W.dot(H2))
            sparse_rec_err = np.sum(np.abs(trg_data - W.dot(H2))) / float(trg_data.size)  # absolute
            if np.abs((err - sparse_rec_err) / err) <= rel_err and err >= sparse_rec_err:
                break
            err = sparse_rec_err
    return W, H, H2, new_err
231 |
232 |
def get_matching_gene_inds(src_gene_ids, trg_gene_ids):
    """Find index arrays mapping the common gene ids of target and source.

    Common genes are collected in target order; if an id occurs more than
    once, only its first occurrence is used.

    Args:
        src_gene_ids: array of source gene identifiers.
        trg_gene_ids: array of target gene identifiers.

    Returns:
        Tuple (inds1, inds2): integer indices of the common genes in
        trg_gene_ids and src_gene_ids respectively.
    """
    if not np.unique(src_gene_ids).size == src_gene_ids.size:
        print(('\nWarning! (MTL gene ids) Gene ids are supposed to be unique. '
              'Only {0} of {1} entries are unique.'.format(np.unique(src_gene_ids).shape[0], src_gene_ids.shape[0])))
        print('Only first occurance will be used.\n')
    if not np.unique(trg_gene_ids).size == trg_gene_ids.size:
        print(('\nWarning! (Target gene ids) Gene ids are supposed to be unique. '
              'Only {0} of {1} entries are unique.'.format(np.unique(trg_gene_ids).shape[0], trg_gene_ids.shape[0])))
        print('Only first occurance will be used.\n')

    # collect common ids, ordered by their appearance in the target list
    common_ids = []
    for i in range(trg_gene_ids.size):
        if np.any(trg_gene_ids[i] == src_gene_ids):
            common_ids.append(trg_gene_ids[i])
    common_ids = np.array(common_ids)

    # number of common genes must not be 0
    assert common_ids.shape[0] > 0

    # find indices of common_ids in trg_gene_ids and src_gene_ids
    # (np.int was removed in NumPy 1.24; the builtin int is equivalent)
    inds1 = np.zeros(common_ids.shape[0], dtype=int)
    inds2 = np.zeros(common_ids.shape[0], dtype=int)
    for i in range(common_ids.shape[0]):
        # [0][0] picks the first occurrence, covering duplicates uniformly
        inds1[i] = np.where(common_ids[i] == trg_gene_ids)[0][0]
        inds2[i] = np.where(common_ids[i] == src_gene_ids)[0][0]
    return inds1, inds2
279 |
--------------------------------------------------------------------------------
/scripts/experiments/main_wrapper_generated_data.py:
--------------------------------------------------------------------------------
###################################################
### ###
### Complete Experiment on generated data ###
### written by Bettina Mieth, Nico Görnitz, ###
### Marina Vidovic and Alex Gutteridge ###
### ###
###################################################

# Please change all directories to yours!

import sys
sys.path.append('C:/Users/Bettina/PycharmProjects2/scRNA_new/final_scripts/implementations')
import logging
logging.basicConfig()
from functools import partial
from experiments_utils import (method_sc3_ours, method_sc3_combined_ours, method_transfer_ours, acc_ari, acc_kta)
from nmf_clustering import NmfClustering_initW
from simulation import generate_toy_data, split_source_target
from utils import *
import datetime
import numpy as np

# Running times

now1 = datetime.datetime.now()
print("Current date and time:")
print(now1.strftime("%Y-%m-%d %H:%M"))

# Directories and parameters

fname_final ='/home/bmieth/scRNAseq/results/toy_data_final/main_results_toydata_revision_realistic_counts_complete.npz' # directory to save results in
reps = 100 # number of repetitions, 100
genes = [10000] # number of genes, 1000
n_src = [1000] # number of source data points, 1000
n_trg = 800 # overall number of target data points, 800
percs = np.true_divide([10,25,50,100,200,400, 600, 800], n_trg) # Percentages of complete target data to use, [10,25,50,100,200,400, 600, 800]
cluster_spec =[1, 2, 3, [4, 5], [6, [7, 8]]] # Spectrum of clusters defining a hierarchical clustering structure with eight leaf nodes, [1, 2, 3, [4, 5], [6, [7, 8]]]
splitting_mode = 7 # mode defining how data is split in source and target set, 2 = randomly stratified splitting, 7 = splitting according to common, 7
common = [0,3,5] # different numbers of overlapping top node clusters in source and target data, [0,3,5]

# data generation parameters
gamma_rate=0.1

# NMF parameters
nmf_alpha = 10.0
nmf_l1 = 0.75
nmf_max_iter = 4000
nmf_rel_err = 1e-3

# Mixture parameters to choose from (algorithm will automatically pick one of them)
mixes = [0.0,0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9,1.0] # Mixture parameters of transfer learning, [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

# List of accuracy functions to be used
acc_funcs = list()
acc_funcs.append(partial(acc_ari, use_strat=False))
acc_funcs.append(partial(acc_kta, mode=0))

# Create list of methods to be applied
methods = list()
# original SC3 (SC3 on target data, TargetCluster)
methods.append(partial(method_sc3_ours))
# combined baseline SC3 (SC3 on combined source and target data, ConcatenateCluster)
methods.append(partial(method_sc3_combined_ours))
# transfer via mixing (Transfer learning via mixing source and target before SC3, TransferCluster)
# Run TransferCluster for all mixture_parameters
for m in mixes:
    methods.append(partial(method_transfer_ours, mix=m, calc_transferability=False))

# Create results matrices
res = np.zeros((len(n_src), len(genes), len(common), len(acc_funcs), reps, len(percs), len(methods)))
res_opt_mix_ind = np.zeros((len(n_src), len(genes), len(common), reps, len(percs)))
res_opt_mix_aris = np.zeros((len(n_src), len(genes), len(common), reps, len(percs)))
source_aris = np.zeros((len(n_src), len(genes), len(common), reps))

# Setup experiment
params = []
exp_counter = 1
num_exps = len(n_src) * len(genes) * len(common) * reps * len(percs) * len(methods)

# Run experiment
for s in range(len(n_src)):
    for g in range(len(genes)):
        for c in range(len(common)):
            # recursively flattens the nested cluster spec to count the leaf clusters
            flatten = lambda l: flatten(l[0]) + (flatten(l[1:]) if len(l) > 1 else []) if type(l) is list else [l]
            n_cluster = len(flatten(cluster_spec))
            accs = np.zeros((len(acc_funcs), reps, len(percs), len(methods)))
            accs_desc = list()
            opt_mix_ind = np.zeros((reps, len(percs)))
            opt_mix_aris = np.zeros((reps, len(percs)))

            num_strat = np.zeros((reps, len(percs), len(methods)))
            res_desc = []
            r = 0
            while r < reps:
                # 1. Generate scRNA data
                # NOTE(review): 10x more cells than needed are generated here; presumably split_source_target subsamples — confirm
                data, labels = generate_toy_data(num_genes=genes[g], num_cells=10. * (n_trg + n_src[s]), cluster_spec=cluster_spec, gamma_rate=gamma_rate)
                # 2. Split source and target according to specified mode/setting
                src, trg, src_labels, trg_labels = split_source_target(data, labels, target_ncells=n_trg, source_ncells=n_src[s], mode=splitting_mode, common=common[c], source_clusters=None, noise_target=False, noise_sd=0.1, cluster_spec=cluster_spec)
                # NOTE(review): np.int was removed in NumPy 1.24 — replace with int when upgrading NumPy
                trg_labels = np.array(trg_labels, dtype=np.int)
                src_labels = np.array(src_labels, dtype=np.int)
                # 3.a. Subsampling order for target
                inds = np.random.permutation(trg_labels.size)
                # 3.b. Use perfect number of latent states for nmf and sc3
                src_lbl_set = np.unique(src_labels)
                n_trg_cluster = np.unique(trg_labels).size
                n_src_cluster = src_lbl_set.size
                # 3.c. train source once per repetition
                source_nmf = NmfClustering_initW(src, np.arange(src.shape[0]), num_cluster=n_src_cluster, labels=src_labels)
                source_nmf.apply(k=n_src_cluster, alpha=nmf_alpha, l1=nmf_l1, max_iter=nmf_max_iter, rel_err=nmf_rel_err)
                # Evaluate source clustering
                source_aris[s,g,c,r] = metrics.adjusted_rand_score(src_labels, source_nmf.cluster_labels)
                print('ITER(', r, '): SOURCE ARI = ', source_aris[s,g,c,r])

                # 3.d. Target data subsampling loop
                for i in range(len(percs)):
                    n_trg_perc = np.int(n_trg * percs[i])
                    p_trg = trg[:, inds[:n_trg_perc]].copy()
                    p_trg_labels = trg_labels[inds[:n_trg_perc]].copy()
                    # 4. MTL/DA mixing parameter loop
                    res_desc = list()
                    for m in range(len(methods)):
                        # Run method
                        print(('Running experiment {0} of {1}: repetition {2} - {3} source cells, {4} genes, {5} common clusters, {6} target cells and the {7}th method'.format(exp_counter, num_exps, r+1, n_src[s], genes[g], common[c],n_trg_perc, m+1)))
                        desc, target_nmf, data_for_SC3,trg_lbls_pred = methods[m](source_nmf, p_trg, num_cluster=n_trg_cluster)
                        res_desc.append(desc)
                        # NOTE(review): accs_desc is re-created for every method, so only the last method's entries survive to np.savez — confirm intended
                        accs_desc = list()
                        if m >=2:
                            mixed_data, _, _ = target_nmf.get_mixed_data(mix=mixes[m-2], calc_transferability=False)
                        # Evaluate results
                        for f in range(len(acc_funcs)):
                            # KTA (f == 1) is evaluated on the mixed data for the transfer methods (m >= 2)
                            if f != 1 or m <= 1:
                                accs[f, r, i, m], accs_descr = acc_funcs[f]([], p_trg.copy(), p_trg_labels.copy(), trg_lbls_pred.copy())
                            else:
                                accs[f, r, i, m], accs_descr = acc_funcs[f]([], mixed_data, p_trg_labels.copy(), trg_lbls_pred.copy())
                            accs_desc.append(accs_descr)

                        perc_done = round(np.true_divide(exp_counter,num_exps)*100, 4)
                        print(('{0}% of experiments done.'.format(perc_done)))
                        exp_counter += 1
                    # Identify optimal mixture parameter (best KTA among the transfer methods)
                    opt_mix_ind[r, i] = np.argmax(accs[1, r, i, 2:])
                    opt_mix_aris[r, i] = accs[0, r, i, int(opt_mix_ind[r, i]+2)]
                r += 1
            params.append((s, g, c))
            res[s, g, c, :, :, :, :] = accs
            res_opt_mix_ind[s,g,c,:,:] = opt_mix_ind
            res_opt_mix_aris[s,g,c,:,:] = opt_mix_aris

# Save results
np.savez(fname_final, methods=methods, acc_funcs=acc_funcs, res=res, accs_desc=accs_desc, method_desc=res_desc, source_aris=source_aris, percs=percs, reps=reps, genes=genes, n_src=n_src, n_trg=n_trg, common=common, mixes=mixes, res_opt_mix_ind=res_opt_mix_ind, res_opt_mix_aris=res_opt_mix_aris)

# Show running times
now2 = datetime.datetime.now()
print("Current date and time:")
print(now2.strftime("%Y-%m-%d %H:%M"))
print("Time passed:")
print(now2-now1)
print('Done.')
159 |
--------------------------------------------------------------------------------
/scripts/experiments/main_wrapper_hockley.py:
--------------------------------------------------------------------------------
###################################################
### ###
### Complete Experiment on Hockley data ###
### written by Bettina Mieth, Nico Görnitz, ###
### Marina Vidovic and Alex Gutteridge ###
### ###
###################################################

# Please change all directories to yours!

import sys
sys.path.append('/home/bmieth/scRNAseq/implementations')
import logging
logging.basicConfig()
from functools import partial
from experiments_utils import (method_sc3_ours, method_sc3_combined_ours, method_transfer_ours, acc_ari, acc_kta)
from nmf_clustering import NmfClustering_initW
from utils import *
import datetime
import pandas as pd
import numpy as np

# Running times
now1 = datetime.datetime.now()
print("Current date and time:")
print(now1.strftime("%Y-%m-%d %H:%M"))

# Data location - Please change directories to yours!
fname_data_target = '/home/bmieth/scRNAseq/data/Jim/Visceraltpm_m_fltd_mat.tsv'
fname_gene_names_target = '/home/bmieth/scRNAseq/data/Jim/Visceraltpm_m_fltd_row.tsv'
fname_cell_names_target = '/home/bmieth/scRNAseq/data/Jim/Visceraltpm_m_fltd_col.tsv'
fname_data_source = '/home/bmieth/scRNAseq/data/usoskin/usoskin_m_fltd_mat.tsv'
fname_gene_names_source = '/home/bmieth/scRNAseq/data/usoskin/usoskin_m_fltd_row.tsv'
fname_cell_ids_source = '/home/bmieth/scRNAseq/data/usoskin/usoskin_m_fltd_col.tsv'
fname_labels_source = '/home/bmieth/scRNAseq/data/usoskin/Usoskin_labels_only.xlsx'

# Result file
fname_final = '/home/bmieth/scRNAseq/results/jims_data/final_for_pub/jimtarget_usoskinsource_level3labels_k7.npz'

# Pre-processing parameters for gene and cell filter
min_expr_genes = 2000
non_zero_threshold_source = 1
non_zero_threshold_target = 4
perc_consensus_genes_source = 0.94
perc_consensus_genes_target = 0.94

# Number of clusters to obtain
num_cluster = 7

# Source labels are taken at which level of the original Usoskin publication
labels_level_ind = 3 # 1,2 or 3

# NMF parameters
nmf_alpha = 10.0
nmf_l1 = 0.75
nmf_max_iter = 4000
nmf_rel_err = 1e-3

# Transfer learning parameters
mixes = np.arange(0,0.75,0.05) # range of mixture parameters to use for transfer learning

# List of accuracy functions to be used
acc_funcs = list()
acc_funcs.append(partial(acc_ari, use_strat=False))
acc_funcs.append(partial(acc_kta, mode=0))

# Read source data
data_source = pd.read_csv(fname_data_source, sep='\t', header=None).values
gene_names_source = pd.read_csv(fname_gene_names_source, sep='\t', header=None).values
cell_ids_source = pd.read_csv(fname_cell_ids_source, sep='\t', header=None).values

# Read source labels
print("Load source labels")
df = pd.read_excel(io=fname_labels_source, sheet_name='Tabelle1')
df_cell_ids = df.columns[1:]
# encode to bytes so cell ids can be matched against the data columns
df_cell_ids = list(x.encode('ascii','replace') for x in df_cell_ids)
src_labels = df.values[labels_level_ind-1,1:]
src_labels = list(x.encode('ascii','replace') for x in src_labels)

label_source_names, label_source_counts = np.unique(src_labels, return_counts = True)
print("Source labels: ", label_source_names)
print("Source label counts: ", label_source_counts)

# Find cell subset/order
cell_intersection = list(set(x[0] for x in cell_ids_source.tolist()).intersection(set(x for x in df_cell_ids)))

# Adjust source data to only include cells with labels
data_indices = list(list(cell_ids_source).index(x) for x in cell_intersection)
data_source = data_source[:,data_indices]
cell_ids_source = cell_ids_source[data_indices]

# Adjust order of labels
labels_indices = list(list(df_cell_ids).index(x) for x in cell_intersection)
src_labels = np.asarray(src_labels)[labels_indices]
df_cell_ids = np.asarray(df_cell_ids)[labels_indices]

# Preprocessing source data
print("Source data dimensions before preprocessing: genes x cells", data_source.shape)
# Cell and gene filter and transformation before the whole procedure
cell_inds = sc.cell_filter(data_source, num_expr_genes=min_expr_genes, non_zero_threshold=non_zero_threshold_source)
data_source = data_source[:,cell_inds]
cell_ids_source = cell_ids_source[cell_inds]
src_labels = src_labels[cell_inds]
gene_inds = sc.gene_filter(data_source, perc_consensus_genes=perc_consensus_genes_source, non_zero_threshold=non_zero_threshold_source)
data_source = data_source[gene_inds, :]
gene_names_source = gene_names_source[gene_inds,:]
data_source = sc.data_transformation_log2(data_source)
# data is now filtered and transformed, don't do it again:
cell_filter_fun = partial(sc.cell_filter, num_expr_genes=0, non_zero_threshold=-1)
gene_filter_fun = partial(sc.gene_filter, perc_consensus_genes=1, non_zero_threshold=-1)
data_transf_fun = sc.no_data_transformation
print("source data dimensions after preprocessing: genes x cells: ", data_source.shape)

# Read target data
data_target = pd.read_csv(fname_data_target, sep='\t',header=None ).values
# reverse log2 for now (dataset is saved in log-format, so we have to undo this)
data_target = np.power(2,data_target)-1
gene_names_target = pd.read_csv(fname_gene_names_target, sep='\t', header=None).values
cell_names_target = pd.read_csv(fname_cell_names_target, sep='\t', header=None).values

# Preprocessing target data
print("Target data dimensions before preprocessing: genes x cells", data_target.shape)
# Cell and gene filter and transformation before the whole procedure
cell_inds = sc.cell_filter(data_target, num_expr_genes=min_expr_genes, non_zero_threshold=non_zero_threshold_target)
data_target = data_target[:,cell_inds]
cell_names_target = cell_names_target[cell_inds]
gene_inds = sc.gene_filter(data_target, perc_consensus_genes=perc_consensus_genes_target, non_zero_threshold=non_zero_threshold_target)
data_target = data_target[gene_inds, :]
gene_names_target = gene_names_target[gene_inds,:]
data_target = sc.data_transformation_log2(data_target)
print("Target data dimensions after preprocessing: genes x cells: ", data_target.shape)

# Find gene subset of genes that appear in both source and target
gene_intersection = list(set(x[0] for x in gene_names_target).intersection(set(x[0] for x in gene_names_source)))

# Adjust source and target data to only include overlapping genes
data_target_indices = list(list(gene_names_target).index(x) for x in gene_intersection)
data_target = data_target[data_target_indices,]
gene_names_target = gene_names_target[data_target_indices]

data_source_indices = list(list(gene_names_source).index(x) for x in gene_intersection)
data_source = data_source[data_source_indices,]
gene_names_source = gene_names_source[data_source_indices]

print("Target data dimensions after taking source intersection: genes x cells: ", data_target.shape)
print("source data dimensions after taking target intersection: genes x cells: ", data_source.shape)

# Specify dataset sizes
genes = len(gene_intersection) # number of genes
n_src = data_source.shape[1]
n_trg = data_target.shape[1]

# List of methods to be applied
methods = list()
# original SC3 (SC3 on target data, TargetCluster)
methods.append(partial(method_sc3_ours))
# combined baseline SC3 (SC3 on combined source and target data, ConcatenateCluster)
methods.append(partial(method_sc3_combined_ours))
# transfer via mixing (Transfer learning via mixing source and target before SC3, TransferCluster)
# Experiment for all mixture_parameters
for m in mixes:
    methods.append(partial(method_transfer_ours, mix=m, calc_transferability=False))

# Create results matrix
res = np.zeros((len(acc_funcs), len(methods)))
exp_counter = 1
num_exps = len(methods)
accs = np.zeros((len(acc_funcs), len(methods)))
trg_labels = np.zeros((n_trg, len(methods)))

# Use perfect number of latent states for nmf and sc3
src_lbl_set = np.unique(src_labels)
n_trg_cluster = num_cluster
n_src_cluster = src_lbl_set.size

## Train source
source_nmf = NmfClustering_initW(data_source, np.arange(data_source.shape[0]), num_cluster=n_src_cluster, labels=src_labels)
source_nmf.apply(k=n_src_cluster, alpha=nmf_alpha, l1=nmf_l1, max_iter=nmf_max_iter, rel_err=nmf_rel_err)

## Calculate ARIs and KTAs
source_aris = metrics.adjusted_rand_score(src_labels[source_nmf.remain_cell_inds], source_nmf.cluster_labels)
print('SOURCE ARI = ', source_aris)

# MTL/DA mixing parameter loop
res_desc = list()
for m in range(len(methods)):
    print(('Running experiment {0} of {1}: Train target data - {2} source cells, {3} genes, {4} target cells and the {5}th method'.format(exp_counter, num_exps, n_src, genes, n_trg, m+1)))
    source_nmf.cell_filter_list = list()
    source_nmf.gene_filter_list = list()
    # source data is already filtered and transformed ...
    source_nmf.add_cell_filter(lambda x: np.arange(x.shape[1]).tolist())
    source_nmf.add_gene_filter(lambda x: np.arange(x.shape[0]).tolist())
    source_nmf.set_data_transformation(lambda x: x)

    # Run method
    desc, target_nmf, data_for_SC3,trg_lbls_pred = methods[m](source_nmf, data_target.copy(), num_cluster=n_trg_cluster)
    trg_labels[:,m] = trg_lbls_pred
    res_desc.append(desc)

    # Evaluate results
    print("Evaluation of target results")
    accs_desc = list()
    if m >=2:
        mixed_data, _, _ = target_nmf.get_mixed_data(mix=mixes[m-2], calc_transferability=False)
    for f in range(len(acc_funcs)):
        if f != 1 or m <= 1:
            # NOTE(review): no-op self-assignment — the target data has no ground-truth labels, so no ARI can be computed and accs stays 0.0 here
            accs[f,m] = accs[f,m]
            accs_descr = "No labels, no ARIs."
        else:
            accs[f, m], accs_descr = acc_funcs[f]([], mixed_data, [], trg_lbls_pred.copy())
        accs_desc.append(accs_descr)
        print(('Accuracy: {0} ({1})'.format(accs[f, m], accs_descr)))
    perc_done = round(np.true_divide(exp_counter, num_exps)*100, 4)
    print(('{0}% of experiments done.'.format(perc_done)))
    exp_counter += 1

# Identify optimal mixture parameter (best KTA among the transfer methods)
opt_mix_ind = np.argmax(accs[1, 2:])
opt_mix_aris = accs[0, int(opt_mix_ind+2)]

# Save results
res[:, :] = accs
res_opt_mix_ind = opt_mix_ind
res_opt_mix_aris = opt_mix_aris

np.savez(fname_final, methods=methods, acc_funcs=acc_funcs, res=res, accs_desc=accs_desc, trg_labels = trg_labels, data_target = data_target, method_desc=res_desc, source_aris=source_aris, min_expr_genes=min_expr_genes, non_zero_threshold_target=non_zero_threshold_target, non_zero_threshold_source=non_zero_threshold_source, perc_consensus_genes_source=perc_consensus_genes_source, perc_consensus_genes_target=perc_consensus_genes_target, num_cluster=num_cluster, nmf_alpha=nmf_alpha, nmf_l1=nmf_l1, nmf_max_iter=nmf_max_iter, nmf_rel_err=nmf_rel_err, genes=genes, n_src=n_src, n_trg=n_trg, mixes=mixes, res_opt_mix_ind=res_opt_mix_ind, res_opt_mix_aris=res_opt_mix_aris, labels_source = src_labels, gene_intersection=gene_intersection, cell_names_source=cell_ids_source, cell_names_target=cell_names_target, gene_names_target=gene_names_target, gene_names_source=gene_names_source)

# Print running times
now2 = datetime.datetime.now()
print("Current date and time:")
print(now2.strftime("%Y-%m-%d %H:%M"))
print("Time passed:")
print(now2-now1)
print('Done.')
235 |
--------------------------------------------------------------------------------
/scripts/experiments/main_wrapper_hockley_NMF_labels.py:
--------------------------------------------------------------------------------
1 | ###################################################
2 | ### ###
3 | ### Complete Experiment on Hockley data ###
4 | ### using NMF labels for source data ###
5 | ### written by Bettina Mieth, Nico Görnitz, ###
6 | ### Marina Vidovic and Alex Gutteridge ###
7 | ### ###
8 | ###################################################
9 |
10 | # Please change all directories to yours!
11 |
12 | import sys
13 | sys.path.append('/home/bmieth/scRNAseq/implementations')
14 | import logging
15 | logging.basicConfig()
16 | from functools import partial
17 | from experiments_utils import (method_sc3_ours, method_sc3_combined_ours, method_transfer_ours, acc_ari, acc_kta)
18 | from nmf_clustering import NmfClustering, NmfClustering_initW
19 | from utils import *
20 | import datetime
21 | import pandas as pd
22 | import numpy as np
23 | import pdb
24 |
25 | # Running times
26 | now1 = datetime.datetime.now()
27 | print("Current date and time:")
28 | print(now1.strftime("%Y-%m-%d %H:%M"))
29 |
30 | # Data location - Please change directories to yours!
31 | fname_data_target = '/home/bmieth/scRNAseq/data/Jim/Visceraltpm_m_fltd_mat.tsv'
32 | fname_gene_names_target = '/home/bmieth/scRNAseq/data/Jim/Visceraltpm_m_fltd_row.tsv'
33 | fname_cell_names_target = '/home/bmieth/scRNAseq/data/Jim/Visceraltpm_m_fltd_col.tsv'
34 | fname_data_source = '/home/bmieth/scRNAseq/data/usoskin/usoskin_m_fltd_mat.tsv'
35 | fname_gene_names_source = '/home/bmieth/scRNAseq/data/usoskin/usoskin_m_fltd_row.tsv'
36 | fname_cell_names_source = '/home/bmieth/scRNAseq/data/usoskin/usoskin_m_fltd_col.tsv'
37 |
38 | # Result file
39 | fname_final = '/home/bmieth/scRNAseq/results/jims_data/final_for_pub/jimtarget_usoskinsource_NMFlabels_k11.npz'
40 |
41 | # Pre-processing parameters for gene and cell filter
42 | min_expr_genes = 2000
43 | non_zero_threshold_source = 1
44 | non_zero_threshold_target = 4
45 | perc_consensus_genes_source = 0.94
46 | perc_consensus_genes_target = 0.94
47 |
48 | # Number of clusters to obtain
49 | num_cluster = 7
50 | num_cluster_source = 7
51 |
52 | # NMF parameters
53 | nmf_alpha = 10.0
54 | nmf_l1 = 0.75
55 | nmf_max_iter = 4000
56 | nmf_rel_err = 1e-3
57 |
58 | # Transfer learning parameters
59 | mixes = np.arange(0,0.75,0.05) # range of mixture parameters to use for transfer learning
60 |
61 | # List of accuracy functions to be used
62 | acc_funcs = list()
63 | acc_funcs.append(partial(acc_ari, use_strat=False))
64 | acc_funcs.append(partial(acc_kta, mode=0))
65 |
66 | # Read target data
67 | data_target = pd.read_csv(fname_data_target, sep='\t', header=None).values
68 | # reverse log2 for now (dataset is saved in log-format, so we have to undo this)
69 | data_target = np.power(2,data_target)-1
70 |
71 | # Preprocessing Target Data
72 | gene_names_target = pd.read_csv(fname_gene_names_target, sep='\t', header=None).values
73 | cell_names_target = pd.read_csv(fname_cell_names_target, sep='\t', header=None).values
74 |
75 | print("Target data dimensions before preprocessing: genes x cells", data_target.shape)
76 | # Cell and gene filter and transformation before the whole procedure
77 | cell_inds = sc.cell_filter(data_target, num_expr_genes=min_expr_genes, non_zero_threshold=non_zero_threshold_target)
78 | data_target = data_target[:,cell_inds]
79 | cell_names_target = cell_names_target[cell_inds]
80 | gene_inds = sc.gene_filter(data_target, perc_consensus_genes=perc_consensus_genes_target, non_zero_threshold=non_zero_threshold_target)
81 | data_target = data_target[gene_inds, :]
82 | gene_names_target = gene_names_target[gene_inds,:]
83 | data_target = sc.data_transformation_log2(data_target)
84 | print("Target data dimensions after preprocessing: genes x cells: ", data_target.shape)
85 |
86 | # Read source data
87 | data_source = pd.read_csv(fname_data_source, sep='\t', header=None).values
88 | gene_names_source = pd.read_csv(fname_gene_names_source, sep='\t', header=None).values
89 | cell_names_source = pd.read_csv(fname_cell_names_source, sep='\t', header=None).values
90 |
91 | # Preprocessing Source Data
92 | print("Source data dimensions before preprocessing: genes x cells", data_source.shape)
93 | # Cell and gene filter and transformation before the whole procedure
94 | cell_inds = sc.cell_filter(data_source, num_expr_genes=min_expr_genes, non_zero_threshold=non_zero_threshold_source)
95 | data_source = data_source[:,cell_inds]
96 | cell_names_source = cell_names_source[cell_inds]
97 | gene_inds = sc.gene_filter(data_source, perc_consensus_genes=perc_consensus_genes_source, non_zero_threshold=non_zero_threshold_source)
98 | data_source = data_source[gene_inds, :]
99 | gene_names_source = gene_names_source[gene_inds,:]
100 | data_source = sc.data_transformation_log2(data_source)
101 | # data is now filtered and transformed, don't do it again:
102 | cell_filter_fun = partial(sc.cell_filter, num_expr_genes=0, non_zero_threshold=-1)
103 | gene_filter_fun = partial(sc.gene_filter, perc_consensus_genes=1, non_zero_threshold=-1)
104 | data_transf_fun = sc.no_data_transformation
105 | print("source data dimensions after preprocessing: genes x cells: ", data_source.shape)
106 |
107 | # Find gene subset of genes that appear in both source and target
108 | gene_intersection = list(set(x[0] for x in gene_names_target).intersection(set(x[0] for x in gene_names_source)))
109 |
110 | # Adjust source and target data to only include overlapping genes
111 | data_target_indices = list(list(gene_names_target).index(x) for x in gene_intersection)
112 | data_target = data_target[data_target_indices,]
113 |
114 | data_source_indices = list(list(gene_names_source).index(x) for x in gene_intersection)
115 | data_source = data_source[data_source_indices,]
116 |
117 | print("Target data dimensions after taking source intersection: genes x cells: ", data_target.shape)
118 | print("source data dimensions after taking target intersection: genes x cells: ", data_source.shape)
119 |
120 | # Generating labels for source dataset
121 | print("Train complete data")
122 | complete_nmf = None
123 | complete_nmf = NmfClustering(data_source, np.arange(data_source.shape[0]), num_cluster=num_cluster_source, labels=[])
124 | complete_nmf.add_cell_filter(cell_filter_fun)
125 | complete_nmf.add_gene_filter(gene_filter_fun)
126 | complete_nmf.set_data_transformation(data_transf_fun)
127 | complete_nmf.apply(k=num_cluster_source, alpha=nmf_alpha, l1=nmf_l1, max_iter=nmf_max_iter, rel_err=nmf_rel_err)
128 |
129 | # Get labels
130 | labels_source = complete_nmf.cluster_labels
131 | label_source_names, label_source_counts = np.unique(labels_source, return_counts = True)
132 | print("Source labels: ", label_source_names)
133 | print("Source label counts: ", label_source_counts)
134 |
135 | # Adjust source dataset
136 | data_source = data_source[:, complete_nmf.remain_cell_inds]
137 |
138 | # Specify dataset sizes
139 | genes = len(gene_intersection) # number of genes
140 | n_src = data_source.shape[1]
141 | n_trg = data_target.shape[1]
142 |
143 | # List of methods to be applied
144 | methods = list()
145 | # original SC3 (SC3 on target data, TargetCluster)
146 | methods.append(partial(method_sc3_ours))
147 | # combined baseline SC3 (SC3 on combined source and target data, ConcatenateCluster)
148 | methods.append(partial(method_sc3_combined_ours))
149 | # transfer via mixing (Transfer learning via mixing source and target before SC3, TransferCluster)
150 | # Experiment for all mixture_parameters
151 | for m in mixes:
152 | methods.append(partial(method_transfer_ours, mix=m, calc_transferability=False))
153 |
154 | # Create results matrix
155 | res = np.zeros((len(acc_funcs), len(methods)))
156 | exp_counter = 1
157 | num_exps = len(methods)
158 | accs = np.zeros((len(acc_funcs), len(methods)))
159 | trg_labels = np.zeros((n_trg, len(methods)))
160 |
# Use perfect number of latent states for nmf and sc3
# FIX: np.int was deprecated in NumPy 1.20 and removed in 1.24; the builtin
# int is the equivalent dtype spec and keeps behavior identical.
src_labels = np.array(labels_source, dtype=int)
src_lbl_set = np.unique(src_labels)
n_trg_cluster = num_cluster      # target cluster count comes from config above
n_src_cluster = src_lbl_set.size # one source cluster per distinct NMF label
166 |
167 | ## Train source
168 | source_nmf = NmfClustering_initW(data_source, np.arange(data_source.shape[0]), num_cluster=n_src_cluster, labels=src_labels)
169 | source_nmf.apply(k=n_src_cluster, alpha=nmf_alpha, l1=nmf_l1, max_iter=nmf_max_iter, rel_err=nmf_rel_err)
170 |
171 | ## Calculate ARIs and KTAs
172 | source_aris = metrics.adjusted_rand_score(src_labels[source_nmf.remain_cell_inds], source_nmf.cluster_labels)
173 | print('SOURCE ARI = ', source_aris)
174 |
175 | # MTL/DA mixing parameter loop
176 | res_desc = list()
177 | for m in range(len(methods)):
178 | print(('Running experiment {0} of {1}: Train target data - {2} source cells, {3} genes, {4} target cells and the {5}th method'.format(exp_counter, num_exps, n_src, genes, n_trg, m+1)))
179 | source_nmf.cell_filter_list = list()
180 | source_nmf.gene_filter_list = list()
181 | # source data is already filtered and transformed ...
182 | source_nmf.add_cell_filter(lambda x: np.arange(x.shape[1]).tolist())
183 | source_nmf.add_gene_filter(lambda x: np.arange(x.shape[0]).tolist())
184 | source_nmf.set_data_transformation(lambda x: x)
185 |
186 | # Run method
187 | desc, target_nmf, data_for_SC3,trg_lbls_pred = methods[m](source_nmf, data_target.copy(), num_cluster=n_trg_cluster)
188 | trg_labels[:,m] = trg_lbls_pred
189 | res_desc.append(desc)
190 |
191 | # Evaluate results
192 | print("Evaluation of target results")
193 | accs_desc = list()
194 | if m >=2:
195 | mixed_data, _, _ = target_nmf.get_mixed_data(mix=mixes[m-2], calc_transferability=False)
196 | for f in range(len(acc_funcs)):
197 | if f != 1 or m <= 1:
198 | accs[f,m] = accs[f,m]
199 | accs_descr = "No labels, no ARIs."
200 | else:
201 | accs[f, m], accs_descr = acc_funcs[f]([], mixed_data, [], trg_lbls_pred.copy())
202 | accs_desc.append(accs_descr)
203 | print(('Accuracy: {0} ({1})'.format(accs[f, m], accs_descr)))
204 | perc_done = round(np.true_divide(exp_counter, num_exps)*100, 4)
205 | print(('{0}% of experiments done.'.format(perc_done)))
206 | exp_counter += 1
207 |
208 | # Identify optimal mixture parameter
209 | opt_mix_ind = np.argmax(accs[1, 2:])
210 | opt_mix_aris = accs[0, int(opt_mix_ind+2)]
211 |
212 | # Save results
213 | res[:, :] = accs
214 | res_opt_mix_ind = opt_mix_ind
215 | res_opt_mix_aris = opt_mix_aris
216 |
217 | np.savez(fname_final, methods=methods, acc_funcs=acc_funcs, res=res, accs_desc=accs_desc, trg_labels = trg_labels, data_target = data_target, method_desc=res_desc, source_aris=source_aris, min_expr_genes=min_expr_genes, non_zero_threshold_target=non_zero_threshold_target, non_zero_threshold_source=non_zero_threshold_source, perc_consensus_genes_source=perc_consensus_genes_source, perc_consensus_genes_target=perc_consensus_genes_target, num_cluster=num_cluster,num_cluster_source=num_cluster_source, nmf_alpha=nmf_alpha, nmf_l1=nmf_l1, nmf_max_iter=nmf_max_iter, nmf_rel_err=nmf_rel_err, genes=genes, n_src=n_src, n_trg=n_trg, mixes=mixes, res_opt_mix_ind=res_opt_mix_ind, res_opt_mix_aris=res_opt_mix_aris, labels_source = labels_source, gene_intersection=gene_intersection, cell_names_source=cell_names_source, cell_names_target=cell_names_target, gene_names_target=gene_names_target, gene_names_source=gene_names_source)
218 |
219 | # Print running times
220 | now2 = datetime.datetime.now()
221 | print("Current date and time:")
222 | print(now2.strftime("%Y-%m-%d %H:%M"))
223 | print("Time passed:")
224 | print(now2-now1)
225 | print('Done.')
226 |
--------------------------------------------------------------------------------
/scripts/experiments/main_wrapper_hockley_NMF_labels_robustness.py:
--------------------------------------------------------------------------------
1 | ###################################################
2 | ### ###
3 | ### Robustness Experiment on Hockley data ###
4 | ### using NMF labels for source data ###
5 | ### written by Bettina Mieth, Nico Görnitz, ###
6 | ### Marina Vidovic and Alex Gutteridge ###
7 | ### ###
8 | ###################################################
9 |
10 | # Please change all directories to yours!
11 | import sys
12 | sys.path.append('/home/bmieth/scRNAseq/implementations')
13 | import logging
14 | logging.basicConfig()
15 | from functools import partial
16 | from experiments_utils import (method_sc3_ours, method_sc3_combined_ours, method_transfer_ours, acc_ari, acc_kta)
17 | from nmf_clustering import NmfClustering, NmfClustering_initW
18 | from utils import *
19 | import datetime
20 | import pandas as pd
21 | import numpy as np
22 | import scipy.cluster.hierarchy as spc
23 | import scipy.spatial.distance as dist
24 |
25 |
def build_consensus_here(X):
    """Build a cells x cells consensus (co-clustering) matrix.

    :param X: n x cells label matrix (one clustering per row); a 1-D array
        is treated as a single clustering.
    :return: cells x cells matrix whose (i, j) entry is the fraction of the
        n clusterings in which cells i and j received the same label.
    """
    if len(X.shape) == 1:
        X = X[np.newaxis, :]
    n, cells = X.shape
    # FIX: np.float / np.int were deprecated in NumPy 1.20 and removed in
    # 1.24; the builtin float / int are the equivalent dtype specs.
    consensus = np.zeros((cells, cells), dtype=float)
    for i in range(n):
        # Pairwise label distance: 0 exactly when two cells share a label.
        t = dist.squareform(dist.pdist(X[i, :].reshape(cells, 1)))
        t = np.array(t, dtype=int)
        # Map distance to co-membership indicator: 1 if same label, else 0.
        t[t != 0] = -1
        t[t == 0] = +1
        t[t == -1] = 0
        consensus += np.array(t, dtype=float)
    # Average the indicator matrices over all n clusterings.
    consensus /= float(n)
    return consensus
46 |
47 |
48 |
def consensus_clustering_here(consensus, n_components=5):
    """SC3-style agglomerative clustering of a consensus matrix.

    :param consensus: cells x cells consensus matrix
    :param n_components: number of clusters
    :return: cells x 1 labels
    """
    # Complete-linkage hierarchical clustering on the condensed pairwise
    # distances between consensus rows, then cut the dendrogram into the
    # requested number of clusters (the SC3 recipe).
    linkage_matrix = spc.complete(dist.pdist(consensus))
    tree_cut = spc.cut_tree(linkage_matrix, n_clusters=n_components)
    # cut_tree returns a cells x 1 column; flatten to a 1-D label vector.
    # (Older variant for reference: spc.linkage + spc.fcluster with
    # criterion='maxclust'.)
    return tree_cut.reshape(consensus.shape[0])
66 |
# Running times: record wall-clock start for the runtime report at the end.
now1 = datetime.datetime.now()
print("Current date and time:")
print(now1.strftime("%Y-%m-%d %H:%M"))

# Data location - Please change directories to yours!
# Target = Hockley/Jim visceral neuron data; source = Usoskin data.
fname_data_target = '/home/bmieth/scRNAseq/data/Jim/Visceraltpm_m_fltd_mat.tsv'
fname_gene_names_target = '/home/bmieth/scRNAseq/data/Jim/Visceraltpm_m_fltd_row.tsv'
fname_cell_names_target = '/home/bmieth/scRNAseq/data/Jim/Visceraltpm_m_fltd_col.tsv'
fname_data_source = '/home/bmieth/scRNAseq/data/usoskin/usoskin_m_fltd_mat.tsv'
fname_gene_names_source = '/home/bmieth/scRNAseq/data/usoskin/usoskin_m_fltd_row.tsv'
fname_cell_names_source = '/home/bmieth/scRNAseq/data/usoskin/usoskin_m_fltd_col.tsv'

# Result file (single .npz archive holding all experiment outputs)
fname_final = '/home/bmieth/scRNAseq/results/jims_data/multiple_reps/jimtarget_usoskinsource_NMFlabels_k7_1000reps.npz'

# Robustness experiment parameters
reps = 1000 # Number of replications

# Pre-processing parameters for gene and cell filter
min_expr_genes = 2000
non_zero_threshold_source = 1
non_zero_threshold_target = 4
perc_consensus_genes_source = 0.94
perc_consensus_genes_target = 0.94

# Number of clusters to obtain (target and source respectively)
num_cluster = 7
num_cluster_source = 7

# NMF parameters (forwarded verbatim to NmfClustering.apply)
nmf_alpha = 10.0
nmf_l1 = 0.75
nmf_max_iter = 4000
nmf_rel_err = 1e-3

# Transfer learning parameters
mixes = np.arange(0,0.75,0.05) # range of mixture parameters to use for transfer learning

# List of accuracy functions to be used
# Index 0 = ARI (unstratified), index 1 = KTA (mode 0); the evaluation loop
# below relies on this ordering.
acc_funcs = list()
acc_funcs.append(partial(acc_ari, use_strat=False))
acc_funcs.append(partial(acc_kta, mode=0))
110 |
# Read target data (genes x cells matrix, tab-separated, no header)
data_target = pd.read_csv(fname_data_target, sep='\t', header=None).values
# reverse log2 for now (dataset is saved in log-format, so we have to undo this)
data_target = np.power(2,data_target)-1

# Preprocessing Target Data
gene_names_target = pd.read_csv(fname_gene_names_target, sep='\t', header=None).values
cell_names_target = pd.read_csv(fname_cell_names_target, sep='\t', header=None).values
print("Target data dimensions before preprocessing: genes x cells", data_target.shape)

# Cell and gene filter and transformation before the whole procedure
cell_inds = sc.cell_filter(data_target, num_expr_genes=min_expr_genes, non_zero_threshold=non_zero_threshold_target)
data_target = data_target[:,cell_inds]
cell_names_target = cell_names_target[cell_inds]
gene_inds = sc.gene_filter(data_target, perc_consensus_genes=perc_consensus_genes_target, non_zero_threshold=non_zero_threshold_target)
data_target = data_target[gene_inds, :]
gene_names_target = gene_names_target[gene_inds,:]
data_target = sc.data_transformation_log2(data_target)
print("Target data dimensions after preprocessing: genes x cells: ", data_target.shape)

# Read source data
data_source = pd.read_csv(fname_data_source, sep='\t', header=None).values
gene_names_source = pd.read_csv(fname_gene_names_source, sep='\t', header=None).values
cell_names_source = pd.read_csv(fname_cell_names_source, sep='\t', header=None).values

# Preprocessing Source Data
print("Source data dimensions before preprocessing: genes x cells", data_source.shape)
# Cell and gene filter and transformation before the whole procedure
cell_inds = sc.cell_filter(data_source, num_expr_genes=min_expr_genes, non_zero_threshold=non_zero_threshold_source)
data_source = data_source[:,cell_inds]
cell_names_source = cell_names_source[cell_inds]
gene_inds = sc.gene_filter(data_source, perc_consensus_genes=perc_consensus_genes_source, non_zero_threshold=non_zero_threshold_source)
data_source = data_source[gene_inds, :]
gene_names_source = gene_names_source[gene_inds,:]
data_source = sc.data_transformation_log2(data_source)
# data is now filtered and transformed, don't do it again:
# (neutral filters/transformation passed to NmfClustering below)
cell_filter_fun = partial(sc.cell_filter, num_expr_genes=0, non_zero_threshold=-1)
gene_filter_fun = partial(sc.gene_filter, perc_consensus_genes=1, non_zero_threshold=-1)
data_transf_fun = sc.no_data_transformation
print("source data dimensions after preprocessing: genes x cells: ", data_source.shape)

# Find gene subset of genes that appear in both source and target
gene_intersection = list(set(x[0] for x in gene_names_target).intersection(set(x[0] for x in gene_names_source)))

# Adjust source and target data to only include overlapping genes
# NOTE(review): list(gene_names_target) yields 1-element row arrays and
# .index(x) matches them against plain gene-name strings via ==; this works
# for single-column name arrays but is O(genes^2) — confirm acceptable.
data_target_indices = list(list(gene_names_target).index(x) for x in gene_intersection)
data_target = data_target[data_target_indices,]

data_source_indices = list(list(gene_names_source).index(x) for x in gene_intersection)
data_source = data_source[data_source_indices,]

print("Target data dimensions after taking source intersection: genes x cells: ", data_target.shape)
print("source data dimensions after taking target intersection: genes x cells: ", data_source.shape)
164 |
# Generating labels for source dataset via unsupervised NMF clustering
print("Train complete data")
complete_nmf = None
complete_nmf = NmfClustering(data_source, np.arange(data_source.shape[0]), num_cluster=num_cluster_source, labels=[])
# Neutral filters/transformation: data was already preprocessed above.
complete_nmf.add_cell_filter(cell_filter_fun)
complete_nmf.add_gene_filter(gene_filter_fun)
complete_nmf.set_data_transformation(data_transf_fun)
complete_nmf.apply(k=num_cluster_source, alpha=nmf_alpha, l1=nmf_l1, max_iter=nmf_max_iter, rel_err=nmf_rel_err)

# Get labels produced by the NMF run; these act as source "ground truth"
labels_source = complete_nmf.cluster_labels
label_source_names, label_source_counts = np.unique(labels_source, return_counts = True)
print("Source labels: ", label_source_names)
print("Source label counts: ", label_source_counts)

# Adjust source dataset to the cells that survived NMF-internal filtering
data_source = data_source[:, complete_nmf.remain_cell_inds]

# Specify dataset sizes
genes = len(gene_intersection) # number of genes
n_src = data_source.shape[1]
n_trg = data_target.shape[1]

# List of methods to be applied; entries 0 and 1 are baselines, entries
# 2.. are transfer learning with increasing mixture parameter (see mixes).
methods = list()
# original SC3 (SC3 on target data, Target Cluster)
methods.append(partial(method_sc3_ours))
# combined baseline SC3 (SC3 on combined source and target data, ConcatenateCluster)
methods.append(partial(method_sc3_combined_ours))
# transfer via mixing (Transfer learning via mixing source and target before SC3, TransferCluster)
# Experiment for all mixture_parameters
for m in mixes:
    methods.append(partial(method_transfer_ours, mix=m, calc_transferability=False))

# Create results matrix
res = np.zeros((reps, len(acc_funcs), len(methods)))
res_opt_mix_ind = np.zeros((reps,1))
res_opt_mix_aris = np.zeros((reps,1))
exp_counter = 1
num_exps = len(methods)*reps
accs = np.zeros((len(acc_funcs), len(methods)))
trg_labels = np.zeros((n_trg, len(methods)))
# Per-rep labels kept only for SC3, SC3-combined and the optimal mix (3 slots)
trg_labels_reps = np.zeros((n_trg, 3, reps))
208 |
# Use perfect number of latent states for nmf and sc3
# FIX: np.int was deprecated in NumPy 1.20 and removed in 1.24; the builtin
# int is the equivalent dtype spec and keeps behavior identical.
src_labels = np.array(labels_source, dtype=int)
src_lbl_set = np.unique(src_labels)
n_trg_cluster = num_cluster      # target cluster count comes from config above
n_src_cluster = src_lbl_set.size # one source cluster per distinct NMF label
214 |
## Train source once (reused across all replications below)
source_nmf = NmfClustering_initW(data_source, np.arange(data_source.shape[0]), num_cluster=n_src_cluster, labels=src_labels)
source_nmf.apply(k=n_src_cluster, alpha=nmf_alpha, l1=nmf_l1, max_iter=nmf_max_iter, rel_err=nmf_rel_err)

## Calculate ARIs and KTAs
# NOTE(review): `metrics` is pulled in by `from utils import *` — presumably
# sklearn.metrics; confirm against utils.py.
source_aris = metrics.adjusted_rand_score(src_labels[source_nmf.remain_cell_inds], source_nmf.cluster_labels)
print('SOURCE ARI = ', source_aris)
222 |
# Multiple replications loop
# MTL/DA mixing parameter loop: methods 0/1 are baselines, methods 2.. map
# to mixes[m-2].
for r in range(reps):
    res_desc = list()
    for m in range(len(methods)):
        print(('Running experiment {0} of {1}: Train target data - {2} source cells, {3} genes, {4} target cells and the {5}th method, rep = {6}'.format(exp_counter, num_exps, n_src, genes, n_trg, m+1, r)))
        source_nmf.cell_filter_list = list()
        source_nmf.gene_filter_list = list()
        # source data is already filtered and transformed ...
        # so install identity filters/transformation before each run.
        source_nmf.add_cell_filter(lambda x: np.arange(x.shape[1]).tolist())
        source_nmf.add_gene_filter(lambda x: np.arange(x.shape[0]).tolist())
        source_nmf.set_data_transformation(lambda x: x)

        # Run method m on a fresh copy of the target data
        desc, target_nmf, data_for_SC3,trg_lbls_pred = methods[m](source_nmf, data_target.copy(), num_cluster=n_trg_cluster)
        trg_labels[:,m] = trg_lbls_pred
        res_desc.append(desc)

        # Evaluate results
        print("Evaluation of target results")
        accs_desc = list()
        if m >=2:
            # mixed_data is needed by the KTA accuracy function below
            mixed_data, _, _ = target_nmf.get_mixed_data(mix=mixes[m-2], calc_transferability=False)
        for f in range(len(acc_funcs)):
            if f != 1 or m <= 1:
                # No ground-truth target labels exist, so ARI (f==0) is never
                # computed and KTA (f==1) is skipped for the two baselines;
                # the self-assignment deliberately leaves the entry unchanged
                # (zero-initialized / previous rep's value).
                accs[f,m] = accs[f,m]
                accs_descr = "No labels, no ARIs."
                #accs[f, m], accs_descr = acc_funcs[f]([], data_target.copy(), p_trg_labels.copy(), trg_lbls_pred.copy())
            else:
                accs[f, m], accs_descr = acc_funcs[f]([], mixed_data, [], trg_lbls_pred.copy())
            accs_desc.append(accs_descr)
            print(('Accuracy: {0} ({1})'.format(accs[f, m], accs_descr)))
        perc_done = round(np.true_divide(exp_counter, num_exps)*100, 4)
        print(('{0}% of experiments done.'.format(perc_done)))
        exp_counter += 1

    # Identify optimal mixture parameter by maximum KTA over transfer methods
    opt_mix_ind = np.argmax(accs[1, 2:])
    # NOTE(review): accs row 0 (ARI) is never filled in this script, so
    # opt_mix_aris stays 0 — confirm this is intended.
    opt_mix_aris = accs[0, int(opt_mix_ind+2)]

    # Save results of this replication
    res[r, :, :] = accs
    res_opt_mix_ind[r] = opt_mix_ind
    res_opt_mix_aris[r] = opt_mix_aris

    # Keep target labels of SC3, SC3-combined and the optimal-mix transfer run
    trg_labels_reps[:,0,r]=trg_labels[:,0]
    trg_labels_reps[:,1,r]=trg_labels[:,1]
    trg_labels_reps[:,2,r]=trg_labels[:, opt_mix_ind+2]

# building consensus matrices (This analysis is not in the final paper!)
consensus_mat_sc3 = build_consensus_here(trg_labels_reps[:,0,:].T)
consensus_mat_sc3_comb = build_consensus_here(trg_labels_reps[:,1,:].T)
consensus_mat_sc3_mix = build_consensus_here(trg_labels_reps[:,2,:].T)

# consensus clustering (This analysis is not in the final paper!)
cons_clustering_sc3 = consensus_clustering_here(consensus_mat_sc3, n_components=n_trg_cluster)
cons_clustering_sc3_comb = consensus_clustering_here(consensus_mat_sc3_comb, n_components=n_trg_cluster)
cons_clustering_sc3_mix = consensus_clustering_here(consensus_mat_sc3_mix, n_components=n_trg_cluster)

# Save all results and parameters into a single .npz archive
np.savez(fname_final, methods=methods, acc_funcs=acc_funcs, res=res, accs_desc=accs_desc, trg_labels = trg_labels, data_target = data_target, method_desc=res_desc, source_aris=source_aris, min_expr_genes=min_expr_genes, non_zero_threshold_target=non_zero_threshold_target, non_zero_threshold_source=non_zero_threshold_source, perc_consensus_genes_source=perc_consensus_genes_source, perc_consensus_genes_target=perc_consensus_genes_target, num_cluster=num_cluster,num_cluster_source=num_cluster_source, nmf_alpha=nmf_alpha, nmf_l1=nmf_l1, nmf_max_iter=nmf_max_iter, nmf_rel_err=nmf_rel_err, genes=genes, n_src=n_src, n_trg=n_trg, mixes=mixes, res_opt_mix_ind=res_opt_mix_ind, res_opt_mix_aris=res_opt_mix_aris, labels_source = labels_source, gene_intersection=gene_intersection, cell_names_source=cell_names_source, cell_names_target=cell_names_target, gene_names_target=gene_names_target, gene_names_source=gene_names_source, cons_clustering_sc3=cons_clustering_sc3, cons_clustering_sc3_comb=cons_clustering_sc3_comb, cons_clustering_sc3_mix=cons_clustering_sc3_mix, reps=reps, trg_labels_reps=trg_labels_reps)
284 |
# Report wall-clock end time and total runtime of the experiment.
finish_time = datetime.datetime.now()
print("Current date and time:")
print(finish_time.strftime("%Y-%m-%d %H:%M"))
print("Time passed:")
print(finish_time - now1)
print('Done.')
292 |
--------------------------------------------------------------------------------
/scripts/experiments/main_wrapper_hockley_robustness.py:
--------------------------------------------------------------------------------
1 | ###################################################
2 | ### ###
3 | ### Robustness Experiment on Hockley data ###
4 | ### written by Bettina Mieth, Nico Görnitz, ###
5 | ### Marina Vidovic and Alex Gutteridge ###
6 | ### ###
7 | ###################################################
8 |
9 | # Please change all directories to yours!
10 |
11 | import sys
12 | sys.path.append('/home/bmieth/scRNAseq/implementations')
13 | import logging
14 | logging.basicConfig()
15 | from functools import partial
16 | from experiments_utils import (method_sc3_ours, method_sc3_combined_ours, method_transfer_ours, acc_ari, acc_kta)
17 | from nmf_clustering import NmfClustering_initW
18 | from utils import *
19 | import datetime
20 | import pandas as pd
21 | import numpy as np
22 | import scipy.cluster.hierarchy as spc
23 | import scipy.spatial.distance as dist
24 |
25 |
def build_consensus_here(X):
    """Build a cells x cells consensus (co-clustering) matrix.

    :param X: n x cells label matrix (one clustering per row); a 1-D array
        is treated as a single clustering.
    :return: cells x cells matrix whose (i, j) entry is the fraction of the
        n clusterings in which cells i and j received the same label.
    """
    if len(X.shape) == 1:
        X = X[np.newaxis, :]
    n, cells = X.shape
    # FIX: np.float / np.int were deprecated in NumPy 1.20 and removed in
    # 1.24; the builtin float / int are the equivalent dtype specs.
    consensus = np.zeros((cells, cells), dtype=float)
    for i in range(n):
        # Pairwise label distance: 0 exactly when two cells share a label.
        t = dist.squareform(dist.pdist(X[i, :].reshape(cells, 1)))
        t = np.array(t, dtype=int)
        # Map distance to co-membership indicator: 1 if same label, else 0.
        t[t != 0] = -1
        t[t == 0] = +1
        t[t == -1] = 0
        consensus += np.array(t, dtype=float)
    # Average the indicator matrices over all n clusterings.
    consensus /= float(n)
    return consensus
46 |
47 |
48 |
def consensus_clustering_here(consensus, n_components=5):
    """SC3-style agglomerative clustering of a consensus matrix.

    :param consensus: cells x cells consensus matrix
    :param n_components: number of clusters
    :return: cells x 1 labels
    """
    # Complete-linkage hierarchical clustering on the condensed pairwise
    # distances between consensus rows, then cut the dendrogram into the
    # requested number of clusters (the SC3 recipe).
    linkage_matrix = spc.complete(dist.pdist(consensus))
    tree_cut = spc.cut_tree(linkage_matrix, n_clusters=n_components)
    # cut_tree returns a cells x 1 column; flatten to a 1-D label vector.
    # (Older variant for reference: spc.linkage + spc.fcluster with
    # criterion='maxclust'.)
    return tree_cut.reshape(consensus.shape[0])
66 |
# Running times: record wall-clock start for the runtime report at the end.
now1 = datetime.datetime.now()
print("Current date and time:")
print(now1.strftime("%Y-%m-%d %H:%M"))

# Data location - Please change directories to yours!
# Target = Hockley/Jim visceral neuron data; source = Usoskin data with
# published cluster labels from an Excel sheet.
fname_data_target = '/home/bmieth/scRNAseq/data/Jim/Visceraltpm_m_fltd_mat.tsv'
fname_gene_names_target = '/home/bmieth/scRNAseq/data/Jim/Visceraltpm_m_fltd_row.tsv'
fname_cell_names_target = '/home/bmieth/scRNAseq/data/Jim/Visceraltpm_m_fltd_col.tsv'
fname_data_source = '/home/bmieth/scRNAseq/data/usoskin/usoskin_m_fltd_mat.tsv'
fname_gene_names_source = '/home/bmieth/scRNAseq/data/usoskin/usoskin_m_fltd_row.tsv'
fname_cell_ids_source = '/home/bmieth/scRNAseq/data/usoskin/usoskin_m_fltd_col.tsv'
fname_labels_source = '/home/bmieth/scRNAseq/data/usoskin/Usoskin_labels_only.xlsx'

# Result file (single .npz archive holding all experiment outputs)
fname_final = '/home/bmieth/scRNAseq/results/jims_data/multiple_reps/jimtarget_usoskinsource_level1labels_k7_1000reps.npz'

# Robustness experiment parameters
reps = 1000 # Number of replications

# Pre-processing parameters for gene and cell filter
min_expr_genes = 2000
non_zero_threshold_source = 1
non_zero_threshold_target = 4
perc_consensus_genes_source = 0.94
perc_consensus_genes_target = 0.94

# Number of clusters to obtain
num_cluster = 7

# Source labels are taken at which level of the original Usoskin publication
labels_level_ind = 3 # 1,2,3

# NMF parameters (forwarded verbatim to NmfClustering_initW.apply)
nmf_alpha = 10.0
nmf_l1 = 0.75
nmf_max_iter = 4000
nmf_rel_err = 1e-3

# Transfer learning parameters
mixes = np.arange(0,0.75,0.05) # range of mixture parameters to use for transfer learning

# List of accuracy functions to be used
# Index 0 = ARI (unstratified), index 1 = KTA (mode 0); the evaluation loop
# below relies on this ordering.
acc_funcs = list()
acc_funcs.append(partial(acc_ari, use_strat=False))
acc_funcs.append(partial(acc_kta, mode=0))

# Read source data (genes x cells matrix, tab-separated, no header)
data_source = pd.read_csv(fname_data_source, sep='\t', header=None).values
gene_names_source = pd.read_csv(fname_gene_names_source, sep='\t', header=None).values
cell_ids_source = pd.read_csv(fname_cell_ids_source, sep='\t', header=None).values

# Read source labels from the Usoskin publication Excel sheet
print("Load source labels")
df = pd.read_excel(io=fname_labels_source, sheet_name='Tabelle1')
df_cell_ids = df.columns[1:]
# NOTE(review): str.encode(...) yields bytes under Python 3, while the cell
# ids read from the tsv are str — the set intersection below would then be
# empty. This looks like a Python 2 leftover; verify against the data.
df_cell_ids = list(x.encode('ascii','replace') for x in df_cell_ids)
src_labels = df.values[labels_level_ind-1,1:]
src_labels = list(x.encode('ascii','replace') for x in src_labels)

label_source_names, label_source_counts = np.unique(src_labels, return_counts = True)
print("Source labels: ", label_source_names)
print("Source label counts: ", label_source_counts)

# Find cell subset/order: cells present both in the data and the label sheet
cell_intersection = list(set(x[0] for x in cell_ids_source.tolist()).intersection(set(x for x in df_cell_ids)))

# Adjust source data to only include cells with labels
data_indices = list(list(cell_ids_source).index(x) for x in cell_intersection)
data_source = data_source[:,data_indices]
cell_ids_source = cell_ids_source[data_indices]

# Adjust order of labels to match the data-column order chosen above
labels_indices = list(list(df_cell_ids).index(x) for x in cell_intersection)
src_labels = np.asarray(src_labels)[labels_indices]
df_cell_ids = np.asarray(df_cell_ids)[labels_indices]

# Preprocessing source data
print("Source data dimensions before preprocessing: genes x cells", data_source.shape)
# Cell and gene filter and transformation before the whole procedure
cell_inds = sc.cell_filter(data_source, num_expr_genes=min_expr_genes, non_zero_threshold=non_zero_threshold_source)
data_source = data_source[:,cell_inds]
cell_ids_source = cell_ids_source[cell_inds]
src_labels = src_labels[cell_inds]
gene_inds = sc.gene_filter(data_source, perc_consensus_genes=perc_consensus_genes_source, non_zero_threshold=non_zero_threshold_source)
data_source = data_source[gene_inds, :]
gene_names_source = gene_names_source[gene_inds,:]
data_source = sc.data_transformation_log2(data_source)
# data is now filtered and transformed, don't do it again:
# (neutral filters/transformation for any later clustering objects)
cell_filter_fun = partial(sc.cell_filter, num_expr_genes=0, non_zero_threshold=-1)
gene_filter_fun = partial(sc.gene_filter, perc_consensus_genes=1, non_zero_threshold=-1)
data_transf_fun = sc.no_data_transformation
print("source data dimensions after preprocessing: genes x cells: ", data_source.shape)

# Read target data
data_target = pd.read_csv(fname_data_target, sep='\t', header=None).values
# reverse log2 for now (dataset is saved in log-format, so we have to undo this)
data_target = np.power(2,data_target)-1
gene_names_target = pd.read_csv(fname_gene_names_target, sep='\t', header=None).values
cell_names_target = pd.read_csv(fname_cell_names_target, sep='\t', header=None).values


# Preprocessing target data
print("Target data dimensions before preprocessing: genes x cells", data_target.shape)
# Cell and gene filter and transformation before the whole procedure
cell_inds = sc.cell_filter(data_target, num_expr_genes=min_expr_genes, non_zero_threshold=non_zero_threshold_target)
data_target = data_target[:,cell_inds]
cell_names_target = cell_names_target[cell_inds]
gene_inds = sc.gene_filter(data_target, perc_consensus_genes=perc_consensus_genes_target, non_zero_threshold=non_zero_threshold_target)
data_target = data_target[gene_inds, :]
gene_names_target = gene_names_target[gene_inds,:]
data_target = sc.data_transformation_log2(data_target)
print("Target data dimensions after preprocessing: genes x cells: ", data_target.shape)

# Find gene subset of genes that appear in both source and target
gene_intersection = list(set(x[0] for x in gene_names_target).intersection(set(x[0] for x in gene_names_source)))

# Adjust source and target data to only include overlapping genes
data_target_indices = list(list(gene_names_target).index(x) for x in gene_intersection)
data_target = data_target[data_target_indices,]
gene_names_target = gene_names_target[data_target_indices]

data_source_indices = list(list(gene_names_source).index(x) for x in gene_intersection)
data_source = data_source[data_source_indices,]
gene_names_source = gene_names_source[data_source_indices]

print("Target data dimensions after taking source intersection: genes x cells: ", data_target.shape)
print("source data dimensions after taking target intersection: genes x cells: ", data_source.shape)

# Specify dataset sizes
genes = len(gene_intersection) # number of genes
n_src = data_source.shape[1]
n_trg = data_target.shape[1]

# List of methods to be applied; entries 0 and 1 are baselines, entries
# 2.. are transfer learning with increasing mixture parameter (see mixes).
methods = list()
# original SC3 (SC3 on target data, TargetCluster)
methods.append(partial(method_sc3_ours))
# combined baseline SC3 (SC3 on combined source and target data, ConcatenateCluster)
methods.append(partial(method_sc3_combined_ours))
# transfer via mixing (Transfer learning via mixing source and target before SC3, TransferCluster)
# Experiment for all mixture_parameters
for m in mixes:
    methods.append(partial(method_transfer_ours, mix=m, calc_transferability=False))

# Create results matrix
res = np.zeros((reps, len(acc_funcs), len(methods)))
res_opt_mix_ind = np.zeros((reps,1))
res_opt_mix_aris = np.zeros((reps,1))
exp_counter = 1
num_exps = len(methods)*reps
accs = np.zeros((len(acc_funcs), len(methods)))
trg_labels = np.zeros((n_trg, len(methods)))
trg_labels_reps = np.zeros((n_trg, len(methods), reps))

# Use perfect number of latent states for nmf and sc3
src_lbl_set = np.unique(src_labels)
n_trg_cluster = num_cluster
n_src_cluster = src_lbl_set.size

## Train source once (reused across all replications below)
source_nmf = NmfClustering_initW(data_source, np.arange(data_source.shape[0]), num_cluster=n_src_cluster, labels=src_labels)
source_nmf.apply(k=n_src_cluster, alpha=nmf_alpha, l1=nmf_l1, max_iter=nmf_max_iter, rel_err=nmf_rel_err)

## Calculate ARIs and KTAs
# NOTE(review): `metrics` is pulled in by `from utils import *` — presumably
# sklearn.metrics; confirm against utils.py.
source_aris = metrics.adjusted_rand_score(src_labels[source_nmf.remain_cell_inds], source_nmf.cluster_labels)
print('SOURCE ARI = ', source_aris)
234 |
# Multiple replications loop
# MTL/DA mixing parameter loop
for r in range(reps):
    res_desc = list()
    for m in range(len(methods)):
        print(('Running experiment {0} of {1}: Train target data - {2} source cells, {3} genes, {4} target cells and the {5}th method, rep = {6}'.format(exp_counter, num_exps, n_src, genes, n_trg, m+1, r)))
        # Reset the source model's filter pipeline: its data is already
        # filtered/transformed, so install pass-through filters.
        source_nmf.cell_filter_list = list()
        source_nmf.gene_filter_list = list()
        # source data is already filtered and transformed ...
        source_nmf.add_cell_filter(lambda x: np.arange(x.shape[1]).tolist())
        source_nmf.add_gene_filter(lambda x: np.arange(x.shape[0]).tolist())
        source_nmf.set_data_transformation(lambda x: x)

        # Run method (methods[0]=TargetCluster, [1]=ConcatenateCluster,
        # [2:]=TransferCluster with mixture parameter mixes[m-2])
        desc, target_nmf, data_for_SC3,trg_lbls_pred = methods[m](source_nmf, data_target.copy(), num_cluster=n_trg_cluster)
        trg_labels[:,m] = trg_lbls_pred
        res_desc.append(desc)

        # Evaluate results
        print("Evaluation of target results")
        accs_desc = list()
        if m >=2:
            mixed_data, _, _ = target_nmf.get_mixed_data(mix=mixes[m-2], calc_transferability=False)
        for f in range(len(acc_funcs)):
            if f != 1 or m <= 1:
                # NOTE(review): self-assignment is a no-op — without ground-
                # truth labels only the unsupervised KTA (f == 1) of the
                # transfer methods is computed; all other entries stay 0.
                accs[f,m] = accs[f,m]
                accs_descr = "No labels, no ARIs."
            else:
                accs[f, m], accs_descr = acc_funcs[f]([], mixed_data, [], trg_lbls_pred.copy())
            accs_desc.append(accs_descr)
            print(('Accuracy: {0} ({1})'.format(accs[f, m], accs_descr)))
        perc_done = round(np.true_divide(exp_counter, num_exps)*100, 4)
        print(('{0}% of experiments done.'.format(perc_done)))
        exp_counter += 1

    # Identify optimal mixture parameter (highest KTA among transfer methods)
    opt_mix_ind = np.argmax(accs[1, 2:])
    # NOTE(review): accs[0, :] (the ARI row) is never written above, so this
    # value is always 0 here — confirm whether that is intended.
    opt_mix_aris = accs[0, int(opt_mix_ind+2)]
    # Save results
    res[r, :, :] = accs
    res_opt_mix_ind[r] = opt_mix_ind
    res_opt_mix_aris[r] = opt_mix_aris

    # Keep per-repetition label vectors of TargetCluster (0),
    # ConcatenateCluster (1) and this repetition's best TransferCluster.
    trg_labels_reps[:,0,r]=trg_labels[:,0]
    trg_labels_reps[:,1,r]=trg_labels[:,1]
    trg_labels_reps[:,2,r]=trg_labels[:, opt_mix_ind+2]

# building consensus matrices (co-clustering frequencies across repetitions;
# build_consensus_here is defined elsewhere in this script)
consensus_mat_sc3 = build_consensus_here(trg_labels_reps[:,0,:].T)
consensus_mat_sc3_comb = build_consensus_here(trg_labels_reps[:,1,:].T)
consensus_mat_sc3_mix = build_consensus_here(trg_labels_reps[:,2,:].T)

# consensus clustering of each consensus matrix into n_trg_cluster groups
cons_clustering_sc3 = consensus_clustering_here(consensus_mat_sc3, n_components=n_trg_cluster)
cons_clustering_sc3_comb = consensus_clustering_here(consensus_mat_sc3_comb, n_components=n_trg_cluster)
cons_clustering_sc3_mix = consensus_clustering_here(consensus_mat_sc3_mix, n_components=n_trg_cluster)
291 |
# Save results: one .npz archive holding scores, predicted/consensus labels,
# the preprocessed target matrix, and every parameter needed to reproduce.
np.savez(fname_final, methods=methods, acc_funcs=acc_funcs, res=res, accs_desc=accs_desc, trg_labels = trg_labels, data_target = data_target, method_desc=res_desc, source_aris=source_aris, min_expr_genes=min_expr_genes, non_zero_threshold_target=non_zero_threshold_target, non_zero_threshold_source=non_zero_threshold_source, perc_consensus_genes_source=perc_consensus_genes_source, perc_consensus_genes_target=perc_consensus_genes_target, num_cluster=num_cluster, nmf_alpha=nmf_alpha, nmf_l1=nmf_l1, nmf_max_iter=nmf_max_iter, nmf_rel_err=nmf_rel_err, genes=genes, n_src=n_src, n_trg=n_trg, mixes=mixes, res_opt_mix_ind=res_opt_mix_ind, res_opt_mix_aris=res_opt_mix_aris, labels_source = src_labels, gene_intersection=gene_intersection, cell_names_source=cell_ids_source, cell_names_target=cell_names_target, gene_names_target=gene_names_target, gene_names_source=gene_names_source, cons_clustering_sc3=cons_clustering_sc3, cons_clustering_sc3_comb=cons_clustering_sc3_comb, cons_clustering_sc3_mix=cons_clustering_sc3_mix, reps=reps, trg_labels_reps=trg_labels_reps)
294 |
# Report wall-clock end time and total elapsed runtime of the script.
finished_at = datetime.datetime.now()
print("Current date and time:")
print(finished_at.strftime("%Y-%m-%d %H:%M"))
print("Time passed:")
print(finished_at - now1)
print('Done.')
302 |
--------------------------------------------------------------------------------
/scripts/experiments/main_wrapper_tasic.py:
--------------------------------------------------------------------------------
1 |
2 | ###################################################
3 | ### ###
4 | ### Complete Experiment on Tasic data ###
5 | ### written by Bettina Mieth, Nico Görnitz, ###
6 | ### Marina Vidovic and Alex Gutteridge ###
7 | ### ###
8 | ###################################################
9 |
10 | # Please change all directories to yours!
11 |
12 | import sys
13 | sys.path.append('/home/bmieth/scRNAseq/implementations')
14 | import logging
15 | logging.basicConfig()
16 | from functools import partial
17 | from experiments_utils import (method_sc3_ours, method_sc3_combined_ours, method_transfer_ours, acc_ari, acc_kta)
18 | from nmf_clustering import NmfClustering_initW
19 | from utils import *
20 | import datetime
21 | from simulation import split_source_target
22 | import pandas as pd
23 | import sys
24 | import numpy as np
25 |
# Running times: record the start timestamp for the elapsed-time report.
now1 = datetime.datetime.now()
print("Current date and time:")
print(now1.strftime("%Y-%m-%d %H:%M"))

# Data location - Please change directories to yours!
fname_data = '/home/bmieth/scRNAseq/data/matrix'
fname_labels = '/home/bmieth/scRNAseq/data/cell_labels_primary_grouped'
fname_final = '/home/bmieth/scRNAseq/results/mouse_data/mouse_completeoverlap.npz'

# Parameters
reps = 100 # number of repetitions, 100
n_src = [1000] # number of source data points, 1000
percs_aim = [25, 50, 100, 200, 400, 650] # target sizes to use. (has to be greater than num_cluster!), [25, 50, 100, 200, 400, 650]
mixes = [0.0,0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0] # Mixture parameters of transfer learning SC3, [0.0,0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

# Pre-processing parameters for gene and cell filter
min_expr_genes = 2000
non_zero_threshold = 2
perc_consensus_genes = 0.94
preprocessing_first = True # Careful, for now this only supports True, within-filtering is not implemented

# Splitting mode defining how data is split in source and target set
splitting_mode = 2 # Split data in source and target randomly stratified (mode = 2, complete overlap) or one exclusive cluster for both target and source (the biggest ones) (mode = 4, non-complete overlap)

# NMF parameters
nmf_alpha = 10.0
nmf_l1 = 0.75
nmf_max_iter = 4000
nmf_rel_err = 1e-3

# List of accuracy functions to be used.
# acc_funcs[0]: ARI vs. ground-truth labels; acc_funcs[1]: kernel target
# alignment (KTA). Both are called below as acc_fun(nmf, data, labels, preds).
acc_funcs = list()
acc_funcs.append(partial(acc_ari, use_strat=False))
acc_funcs.append(partial(acc_kta, mode=0))
61 |
# Read data
# Labels: one numeric cluster label per cell (tab-separated text file).
labels = np.loadtxt(fname_labels, delimiter='\t')
label_names, label_counts = np.unique(labels, return_counts = True)
print("Labels: ", label_names)
print("Counts: ", label_counts)
# Expression matrix: rows = genes, columns = cells.
data = pd.read_csv(fname_data, sep='\t', header=None).values
print("Data dimensions before preprocessing: genes x cells", data.shape)

if preprocessing_first:
    # Cell and gene filter and transformation before the whole procedure;
    # labels are filtered alongside the cells to keep them aligned.
    cell_inds = sc.cell_filter(data, num_expr_genes=min_expr_genes, non_zero_threshold=non_zero_threshold)
    data = data[:,cell_inds]
    labels = labels[cell_inds]
    gene_inds = sc.gene_filter(data, perc_consensus_genes=perc_consensus_genes, non_zero_threshold=non_zero_threshold)
    data = data[gene_inds, :]
    data = sc.data_transformation_log2(data)
    # data is now filtered and transformed, don't do it again:
    cell_filter_fun = partial(sc.cell_filter, num_expr_genes=0, non_zero_threshold=-1)
    gene_filter_fun = partial(sc.gene_filter, perc_consensus_genes=1, non_zero_threshold=-1)
    data_transf_fun = sc.no_data_transformation
    print("data dimensions after preprocessing: genes x cells: ", data.shape)
    print(data.shape)
else:
    # NOTE(review): raising the Warning class aborts the script here, so the
    # fallback below is unreachable; consider NotImplementedError instead.
    raise Warning("Within-Filtering is not implemented for R SC3")
    # Cell and gene filter and transformation within the procedure
    cell_filter_fun = partial(sc.cell_filter, num_expr_genes=min_expr_genes, non_zero_threshold=non_zero_threshold)
    gene_filter_fun = partial(sc.gene_filter, perc_consensus_genes=perc_consensus_genes, non_zero_threshold=non_zero_threshold)
    data_transf_fun = sc.data_transformation_log2

# Abort unless even the smallest requested target size can hold one cell
# per cluster.
if len(np.unique(labels)) > np.min(percs_aim):
    print("percs_aim need to be greater than num_cluster!")
    sys.exit("error!")
94 |
# Specify dataset sizes
genes = data.shape[0]  # number of genes
n_all = data.shape[1]  # total number of cells after preprocessing
n_trg = n_all - n_src[0]  # overall number of target data points
# BUG FIX: np.concatenate(percs_aim, n_trg) passed the plain int n_trg as the
# `axis` argument and a flat list of scalars as the array sequence, which
# raises at runtime. The intent is to append the full target size to
# percs_aim and normalize by n_trg, yielding the target subsample fractions
# (the last entry is 1.0, i.e. the full target set).
percs = np.true_divide(np.concatenate((percs_aim, [n_trg])), n_trg)
100 |
# Clustering methods to evaluate, in fixed order:
#   [0]  TargetCluster      - plain SC3 on the target data alone
#   [1]  ConcatenateCluster - SC3 on the concatenated source + target data
#   [2:] TransferCluster    - SC3 after mixing source into target, one entry
#        per mixture parameter (same order as `mixes`)
methods = [
    partial(method_sc3_ours),
    partial(method_sc3_combined_ours),
]
methods += [partial(method_transfer_ours, mix=mix_param, calc_transferability=False)
            for mix_param in mixes]
111 |
# Create results matrix
# res: source sizes x accuracy functions x repetitions x target sizes x methods
res = np.zeros((len(n_src), len(acc_funcs), reps, len(percs), len(methods)))
# Index of the optimal mixture parameter and its ARI, per cell of the grid.
res_opt_mix_ind = np.zeros((len(n_src), reps, len(percs)))
res_opt_mix_aris = np.zeros((len(n_src), reps, len(percs)))
# Source-clustering quality per source size and repetition.
source_aris = np.zeros((len(n_src), reps))
source_ktas = np.zeros((len(n_src), reps))

# Prepare experiments
params = []
exp_counter = 1
num_exps = len(n_src) * reps * len(percs) * len(methods)
123 |
# Run experiments: outer loop over source-set sizes (here a single entry).
for s in range(len(n_src)):
    accs = np.zeros((len(acc_funcs), reps, len(percs), len(methods)))
    accs_desc = list()
    opt_mix_ind = np.zeros((reps, len(percs)))
    opt_mix_aris = np.zeros((reps, len(percs)))

    num_strat = np.zeros((reps, len(percs), len(methods)))
    res_desc = []
    r = 0
    while r < reps:
        # Split data in source and target randomly stratified (mode = 2) or with exclusive source and target clusters (mode = 4)
        src, trg, src_labels, trg_labels = split_source_target(data, labels, mode=splitting_mode, target_ncells=n_trg, source_ncells=n_src[s])

        # FIX: np.int was deprecated in NumPy 1.20 and removed in 1.24; the
        # builtin int is the documented drop-in replacement as a dtype.
        trg_labels = np.array(trg_labels, dtype=int)
        src_labels = np.array(src_labels, dtype=int)

        # 3.a. Subsampling order for target
        inds = np.random.permutation(trg_labels.size)

        # 3.b. Use perfect number of latent states for nmf and sc3
        src_lbl_set = np.unique(src_labels)
        n_trg_cluster = np.unique(trg_labels).size
        n_src_cluster = src_lbl_set.size
        ## 3.c. train source once per repetition
        source_nmf = NmfClustering_initW(src, np.arange(src.shape[0]), num_cluster=n_src_cluster, labels=src_labels)
        source_nmf.apply(k=n_src_cluster, alpha=nmf_alpha, l1=nmf_l1, max_iter=nmf_max_iter, rel_err=nmf_rel_err)

        ## Calculate ARIs and KTAs
        source_aris[s, r] = metrics.adjusted_rand_score(src_labels[source_nmf.remain_cell_inds], source_nmf.cluster_labels)
        print('ITER(', r+1, '): SOURCE ARI = ', source_aris[s,r])

        # 3.d. Target data subsampling loop
        print("Target data subsampling loop")
        for i in range(len(percs)):
            # FIX: np.int(...) was removed from NumPy; builtin int() truncates
            # identically for the non-negative values produced here.
            n_trg_perc = int(n_trg * percs[i]+0.5)
            p_trg = trg[:, inds[:n_trg_perc]].copy()
            p_trg_labels = trg_labels[inds[:n_trg_perc]].copy()
            # 4. MTL/DA mixing parameter loop
            res_desc = list()
            for m in range(len(methods)):
                print(('Running experiment {0} of {1}: Train target data of repetition {2} - {3} source cells, {4} genes, {5} target cells and the {6}th method'.format(exp_counter, num_exps, r+1, n_src[s], genes, n_trg_perc, m+1)))
                # Source data is already filtered/transformed: install
                # pass-through filters and identity transformation.
                source_nmf.cell_filter_list = list()
                source_nmf.gene_filter_list = list()
                source_nmf.add_cell_filter(lambda x: np.arange(x.shape[1]).tolist())
                source_nmf.add_gene_filter(lambda x: np.arange(x.shape[0]).tolist())
                source_nmf.set_data_transformation(lambda x: x)
                # Run method ([0]=TargetCluster, [1]=ConcatenateCluster,
                # [2:]=TransferCluster with mixes[m-2])
                desc,target_nmf, data_for_SC3,trg_lbls_pred = methods[m](source_nmf, p_trg.copy(), num_cluster=n_trg_cluster)
                res_desc.append(desc)

                # Evaluate results
                print("Evaluation of target results")
                accs_desc = list()
                if m >=2:
                    mixed_data, _, _ = target_nmf.get_mixed_data(mix=mixes[m-2], calc_transferability=False)
                for f in range(len(acc_funcs)):
                    if f==0:
                        # ARI vs. ground truth for every method
                        accs[f, r, i, m], accs_descr = acc_funcs[f]([], p_trg.copy(), p_trg_labels.copy(), trg_lbls_pred.copy())
                    elif m>=2:
                        # KTA only for the transfer methods
                        accs[f, r, i, m], accs_descr = acc_funcs[f](target_nmf, data_for_SC3, p_trg_labels.copy(), trg_lbls_pred.copy())
                    else:
                        accs_descr='score not computed for baslines'
                    accs_desc.append(accs_descr)
                    print(('Accuracy: {0} ({1})'.format(accs[f, r, i, m], accs_descr)))
                perc_done = round(np.true_divide(exp_counter, num_exps)*100, 4)
                print(('{0}% of experiments done.'.format(perc_done)))
                exp_counter += 1

            # Identify optimal mixture parameter (highest KTA among transfers)
            opt_mix_ind[r, i] = np.argmax(accs[1, r, i, 2:])
            opt_mix_aris[r, i] = accs[0, r, i, int(opt_mix_ind[r, i]+2)]
        r += 1
    # Save results
    params.append((s))
    res[s, :, :, :, :] = accs
    res_opt_mix_ind[s,:,:] = opt_mix_ind
    res_opt_mix_aris[s,:,:] = opt_mix_aris
203 |
# Save results: one .npz archive with all scores and run parameters.
np.savez(fname_final, methods=methods, acc_funcs=acc_funcs, res=res, accs_desc=accs_desc,
         method_desc=res_desc, source_aris=source_aris, min_expr_genes=min_expr_genes,
         non_zero_threshold=non_zero_threshold, perc_consensus_genes=perc_consensus_genes, nmf_alpha=nmf_alpha, nmf_l1=nmf_l1, nmf_max_iter=nmf_max_iter, nmf_rel_err=nmf_rel_err, percs=percs, reps=reps, genes=genes, n_src=n_src, n_trg=n_trg, mixes=mixes, res_opt_mix_ind=res_opt_mix_ind, res_opt_mix_aris=res_opt_mix_aris)

# Show running times (end timestamp and elapsed time since now1)
now2 = datetime.datetime.now()
print("Current date and time:")
print(now2.strftime("%Y-%m-%d %H:%M"))
print("Time passed:")
print(now2-now1)
print('Done.')
216 |
--------------------------------------------------------------------------------
/scripts/experiments/main_wrapper_tasic_NMF_labels.py:
--------------------------------------------------------------------------------
1 | ###################################################
2 | ### ###
3 | ### Complete Experiment on Tasic data ###
4 | ### using NMF labels for source data ###
5 | ### written by Bettina Mieth, Nico Görnitz, ###
6 | ### Marina Vidovic and Alex Gutteridge ###
7 | ### ###
8 | ###################################################
9 |
10 | # Please change all directories to yours!
11 |
12 | import sys
13 | sys.path.append('/home/bmieth/scRNAseq/implementations')
14 | import logging
15 | logging.basicConfig()
16 | from functools import partial
17 | from experiments_utils import (method_sc3_ours, method_sc3_combined_ours, method_transfer_ours, acc_ari, acc_kta)
18 | from nmf_clustering import NmfClustering, NmfClustering_initW
19 | from utils import *
20 | import datetime
21 | from simulation import split_source_target
22 | import pandas as pd
23 | import sys
24 | import numpy as np
25 |
# Running times: record the start timestamp for the elapsed-time report.
now1 = datetime.datetime.now()
print("Current date and time:")
print(now1.strftime("%Y-%m-%d %H:%M"))

# Data location - Please change directories to yours!
fname_data = '/home/bmieth/scRNAseq/data/matrix'
# Results file
fname_final = '/home/bmieth/scRNAseq/results/mouse_data_NMF_final/main_results_mouse_NMFlabels.npz'

# Parameters
reps = 100 # number of repetitions, 100
n_src = [1000] # number of source data points, 1000
percs_aim = [25, 50, 100, 200, 400, 650] # target sizes to use. (has to be greater than num_cluster!), [25, 50, 100, 200, 400, 650]
mixes = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7] # Mixture parameters of transfer learning SC3

# Pre-processing parameters for gene and cell filter
min_expr_genes = 2000
non_zero_threshold = 2
perc_consensus_genes = 0.94
preprocessing_first = True # Careful, for now this only supports True, within-filtering is not implemented

# Number of clusters to have in source labels
num_cluster = 18

# Splitting mode defining how data is split in source and target set
splitting_mode = 2 # Split data in source and target randomly stratified (mode = 2, complete overlap) or one exclusive cluster for both target and source (the biggest ones) (mode = 4, non-complete overlap)

# NMF parameters
nmf_alpha = 10.0
nmf_l1 = 0.75
nmf_max_iter = 4000
nmf_rel_err = 1e-3

# Repetitions whose source NMF scores an ARI below this cutoff (against the
# NMF-derived labels) are discarded and redrawn in the experiment loop below.
ari_cutoff = 0.94

# Abort unless even the smallest requested target size can hold one cell
# per cluster.
if num_cluster > np.min(percs_aim):
    print("percs_aim need to be greater than num_cluster!")
    sys.exit("error!")

# List of accuracy functions to be used.
# acc_funcs[0]: ARI vs. labels; acc_funcs[1]: kernel target alignment (KTA).
acc_funcs = list()
acc_funcs.append(partial(acc_ari, use_strat=False))
acc_funcs.append(partial(acc_kta, mode=0))
70 |
# Read data: expression matrix with rows = genes, columns = cells. No label
# file here — pseudo labels are generated below by NMF on the complete data.
data = pd.read_csv(fname_data, sep='\t', header=None).values
print("Data dimensions before preprocessing: genes x cells", data.shape)

if preprocessing_first:
    # Cell and gene filter and transformation before the whole procedure
    cell_inds = sc.cell_filter(data, num_expr_genes=min_expr_genes, non_zero_threshold=non_zero_threshold)
    data = data[:,cell_inds]
    gene_inds = sc.gene_filter(data, perc_consensus_genes=perc_consensus_genes, non_zero_threshold=non_zero_threshold)
    data = data[gene_inds, :]
    data = sc.data_transformation_log2(data)
    # data is now filtered and transformed, don't do it again:
    cell_filter_fun = partial(sc.cell_filter, num_expr_genes=0, non_zero_threshold=-1)
    gene_filter_fun = partial(sc.gene_filter, perc_consensus_genes=1, non_zero_threshold=-1)
    data_transf_fun = sc.no_data_transformation
    print("data dimensions after preprocessing: genes x cells: ", data.shape)
    print(data.shape)
else:
    # NOTE(review): raising the Warning class aborts the script here, so the
    # fallback below is unreachable; consider NotImplementedError instead.
    raise Warning("Within-Filtering is not implemented for R SC3")
    # Cell and gene filter and transformation within the procedure
    cell_filter_fun = partial(sc.cell_filter, num_expr_genes=min_expr_genes, non_zero_threshold=non_zero_threshold)
    gene_filter_fun = partial(sc.gene_filter, perc_consensus_genes=perc_consensus_genes, non_zero_threshold=non_zero_threshold)
    data_transf_fun = sc.data_transformation_log2

# Generating labels from complete dataset: cluster ALL cells once with NMF
# and use the resulting assignments as (pseudo) ground-truth labels.
print("Train complete data")
complete_nmf = None
complete_nmf = NmfClustering(data, np.arange(data.shape[0]), num_cluster=num_cluster, labels=[])
complete_nmf.add_cell_filter(cell_filter_fun)
complete_nmf.add_gene_filter(gene_filter_fun)
complete_nmf.set_data_transformation(data_transf_fun)
complete_nmf.apply(k=num_cluster, alpha=nmf_alpha, l1=nmf_l1, max_iter=nmf_max_iter, rel_err=nmf_rel_err)

# Get labels
labels = complete_nmf.cluster_labels
label_names, label_counts = np.unique(labels, return_counts = True)
print("Labels: ", label_names)
print("Counts: ", label_counts)

# Adjust data: keep only cells that survived the NMF-internal cell filter so
# data columns and labels stay aligned.
data = data[:, complete_nmf.remain_cell_inds]
112 |
# Specify dataset sizes
genes = data.shape[0]  # number of genes
n_all = data.shape[1]  # total number of cells after preprocessing and NMF
n_trg = n_all - n_src[0]  # overall number of target data points
# BUG FIX: np.concatenate(percs_aim, n_trg) passed the plain int n_trg as the
# `axis` argument and a flat list of scalars as the array sequence, which
# raises at runtime. The intent is to append the full target size to
# percs_aim and normalize by n_trg, yielding the target subsample fractions
# (the last entry is 1.0, i.e. the full target set).
percs = np.true_divide(np.concatenate((percs_aim, [n_trg])), n_trg)
118 |
# Clustering methods to evaluate, in fixed order:
#   [0]  TargetCluster      - plain SC3 on the target data alone
#   [1]  ConcatenateCluster - SC3 on the concatenated source + target data
#   [2:] TransferCluster    - SC3 after mixing source into target, one entry
#        per mixture parameter (same order as `mixes`)
methods = [
    partial(method_sc3_ours),
    partial(method_sc3_combined_ours),
]
methods += [partial(method_transfer_ours, mix=mix_param, calc_transferability=False)
            for mix_param in mixes]
129 |
# Create results matrix
# res: source sizes x accuracy functions x repetitions x target sizes x methods
res = np.zeros((len(n_src), len(acc_funcs), reps, len(percs), len(methods)))
# Index of the optimal mixture parameter and its ARI, per cell of the grid.
res_opt_mix_ind = np.zeros((len(n_src), reps, len(percs)))
res_opt_mix_aris = np.zeros((len(n_src), reps, len(percs)))
# Source-clustering quality per source size and repetition.
source_aris = np.zeros((len(n_src), reps))
source_ktas = np.zeros((len(n_src), reps))

# Prepare experiments
params = []
exp_counter = 1
num_exps = len(n_src) * reps * len(percs) * len(methods)
141 |
# Run experiments: outer loop over source-set sizes (here a single entry).
for s in range(len(n_src)):
    accs = np.zeros((len(acc_funcs), reps, len(percs), len(methods)))
    accs_desc = list()
    opt_mix_ind = np.zeros((reps, len(percs)))
    opt_mix_aris = np.zeros((reps, len(percs)))

    num_strat = np.zeros((reps, len(percs), len(methods)))
    res_desc = []
    r = 0
    while r < reps:
        # Split data in source and target randomly stratified (mode = 2) or with exclusive source and target clusters (mode = 4)
        src, trg, src_labels, trg_labels = split_source_target(data, labels, mode=splitting_mode, target_ncells=n_trg, source_ncells=n_src[s])

        # FIX: np.int was deprecated in NumPy 1.20 and removed in 1.24; the
        # builtin int is the documented drop-in replacement as a dtype.
        trg_labels = np.array(trg_labels, dtype=int)
        src_labels = np.array(src_labels, dtype=int)

        # 3.a. Subsampling order for target
        inds = np.random.permutation(trg_labels.size)

        # 3.b. Use perfect number of latent states for nmf and sc3
        src_lbl_set = np.unique(src_labels)
        n_trg_cluster = np.unique(trg_labels).size
        n_src_cluster = src_lbl_set.size
        ## 3.c. train source once per repetition
        source_nmf = NmfClustering_initW(src, np.arange(src.shape[0]), num_cluster=n_src_cluster, labels=src_labels)
        source_nmf.apply(k=n_src_cluster, alpha=nmf_alpha, l1=nmf_l1, max_iter=nmf_max_iter, rel_err=nmf_rel_err)

        ## Calculate ARIs and KTAs
        source_aris[s, r] = metrics.adjusted_rand_score(src_labels[source_nmf.remain_cell_inds], source_nmf.cluster_labels)
        print('ITER(', r+1, '): SOURCE ARI = ', source_aris[s,r])

        # Reject repetitions whose source clustering reproduces the NMF labels
        # poorly; r is deliberately NOT incremented, so the split is redrawn.
        # NOTE(review): this can loop indefinitely if the cutoff is never met,
        # and exp_counter-based progress will overshoot num_exps on retries.
        if source_aris[s,r] < ari_cutoff:
            continue

        # 3.d. Target data subsampling loop
        print("Target data subsampling loop")
        for i in range(len(percs)):
            # FIX: np.int(...) was removed from NumPy; builtin int() truncates
            # identically for the non-negative values produced here.
            n_trg_perc = int(n_trg * percs[i]+0.5)
            p_trg = trg[:, inds[:n_trg_perc]].copy()
            p_trg_labels = trg_labels[inds[:n_trg_perc]].copy()
            # 4. MTL/DA mixing parameter loop
            res_desc = list()
            for m in range(len(methods)):
                print(('Running experiment {0} of {1}: Train target data of repetition {2} - {3} source cells, {4} genes, '
                       '{5} target cells and the {6}th method'.format(exp_counter, num_exps, r+1, n_src[s], genes, n_trg_perc, m+1)))
                #plt.subplot(len(percs), len(methods), plot_cnt)
                # Source data is already filtered/transformed: install
                # pass-through filters and identity transformation.
                source_nmf.cell_filter_list = list()
                source_nmf.gene_filter_list = list()
                source_nmf.add_cell_filter(lambda x: np.arange(x.shape[1]).tolist())
                source_nmf.add_gene_filter(lambda x: np.arange(x.shape[0]).tolist())
                source_nmf.set_data_transformation(lambda x: x)
                # Run method ([0]=TargetCluster, [1]=ConcatenateCluster,
                # [2:]=TransferCluster with mixes[m-2])
                desc, target_nmf, data_for_SC3,trg_lbls_pred = methods[m](source_nmf, p_trg.copy(), num_cluster=n_trg_cluster)
                res_desc.append(desc)
                # Evaluate results
                print("Evaluation of target results")
                accs_desc = list()
                if m >=2:
                    mixed_data, _, _ = target_nmf.get_mixed_data(mix=mixes[m-2], calc_transferability=False)
                for f in range(len(acc_funcs)):
                    if f != 1 or m <= 1:
                        # ARI on the raw target data (and KTA for baselines)
                        accs[f, r, i, m], accs_descr = acc_funcs[f]([], p_trg.copy(), p_trg_labels.copy(), trg_lbls_pred.copy())
                    else:
                        # KTA of the transfer methods on the mixed data
                        accs[f, r, i, m], accs_descr = acc_funcs[f]([], mixed_data, p_trg_labels.copy(), trg_lbls_pred.copy())
                    accs_desc.append(accs_descr)
                    print(('Accuracy: {0} ({1})'.format(accs[f, r, i, m], accs_descr)))
                perc_done = round(np.true_divide(exp_counter, num_exps)*100, 4)
                print(('{0}% of experiments done.'.format(perc_done)))
                exp_counter += 1

            # Identify optimal mixture parameter (highest KTA among transfers)
            opt_mix_ind[r, i] = np.argmax(accs[1, r, i, 2:])
            opt_mix_aris[r, i] = accs[0, r, i, int(opt_mix_ind[r, i]+2)]

        r += 1
    # Save results
    params.append((s))
    res[s, :, :, :, :] = accs
    res_opt_mix_ind[s,:,:] = opt_mix_ind
    res_opt_mix_aris[s,:,:] = opt_mix_aris
224 |
# Save results: one .npz archive with all scores and run parameters.
np.savez(fname_final, methods=methods, acc_funcs=acc_funcs, res=res, accs_desc=accs_desc,
         method_desc=res_desc, source_aris=source_aris, min_expr_genes=min_expr_genes,
         non_zero_threshold=non_zero_threshold, perc_consensus_genes=perc_consensus_genes, num_cluster=num_cluster, nmf_alpha=nmf_alpha, nmf_l1=nmf_l1, nmf_max_iter=nmf_max_iter, nmf_rel_err=nmf_rel_err, percs=percs, reps=reps, genes=genes, n_src=n_src, n_trg=n_trg, mixes=mixes, res_opt_mix_ind=res_opt_mix_ind, res_opt_mix_aris=res_opt_mix_aris)

# Show running times (end timestamp and elapsed time since now1)
now2 = datetime.datetime.now()
print("Current date and time:")
print(now2.strftime("%Y-%m-%d %H:%M"))
print("Time passed:")
print(now2-now1)
print('Done.')
237 |
--------------------------------------------------------------------------------
/scripts/plots/evaluate_hockley_robustness_magic_seurat.py:
--------------------------------------------------------------------------------
1 | ###################################################
2 | ### ###
3 | ### Evaluation of Robustness experiment using ###
4 | ### MAGIC or Seurat pre-processed Hockley data ###
5 | ### written by Bettina Mieth, Nico Görnitz, ###
6 | ### Marina Vidovic and Alex Gutteridge ###
7 | ### ###
8 | ###################################################
9 |
10 | # Please change all directories to yours!
11 | import sys
12 | sys.path.append('/home/bmieth/scRNAseq/implementations')
13 | import matplotlib
14 | matplotlib.use('Agg')
15 | import matplotlib.pyplot as plt
16 | import numpy as np
17 | from sklearn.manifold import TSNE
18 |
19 | if __name__ == "__main__":
20 | # Loading data - Please change directories to yours
21 | foo = np.load('/home/bmieth/scRNAseq/results/jims_data/magic/jimtarget_usoskinsource_magic_1000reps.npz')
22 | foo_for_clusterident = np.load('/home/bmieth/scRNAseq/results/jims_data/final_for_pub_k7/jimtarget_usoskinsource_level3labels.npz')
23 | num_exps = foo['reps']
24 |
25 | # mNP and mNFa clusters
26 | print('Counting the numbers for mNP and mNFa clusters!')
27 | # Identify the two clusters
28 | trg_labels_all = foo_for_clusterident['trg_labels']
29 | res_opt_mix_ind = foo_for_clusterident['res_opt_mix_ind']
30 | trg_labels = trg_labels_all[:, res_opt_mix_ind+2]
31 | cl1 = (trg_labels == 6)
32 | cell_names_target = foo_for_clusterident['cell_names_target']
33 | cluster_1 = cell_names_target[cl1].flatten()
34 | cl2 = (trg_labels == 4)
35 | cluster_2 = cell_names_target[cl2].flatten()
36 |
37 | # SC3 Mix with level 3 labels, TransferCluster
38 | trg_labels_l3 = foo['trg_labels_reps']
39 | data_target_preprocessed = foo['data_target']
40 | trg_labels_SC3 = trg_labels_l3[:,0,:]
41 | counter_SC3 = 0
42 | successful_flag = np.ones(num_exps, dtype=bool)
43 |
44 | # SC3 alone, TargetCluster
45 | for i in np.arange(num_exps):
46 | cl1_labels = trg_labels_SC3[cl1,i] .tolist()
47 | cl1_most_common = max(set(cl1_labels), key=cl1_labels.count)
48 | cl2_labels = trg_labels_SC3[cl2,i] .tolist()
49 | cl2_most_common = max(set(cl2_labels), key=cl2_labels.count)
50 | if cl1_most_common != cl2_most_common:
51 | counter_SC3=counter_SC3+1
52 | successful_flag[i] = False
53 |
54 |
55 | # SC3 comb, ConcatenateCluster
56 | trg_labels_SC3_COMB = trg_labels_l3[:,1,:]
57 | counter_SC3_COMB = 0
58 | for i in np.arange(num_exps):
59 | cl1_labels = trg_labels_SC3_COMB[cl1,i] .tolist()
60 | cl1_most_common = max(set(cl1_labels), key=cl1_labels.count)
61 | cl2_labels = trg_labels_SC3_COMB[cl2,i] .tolist()
62 | cl2_most_common = max(set(cl2_labels), key=cl2_labels.count)
63 | if cl1_most_common != cl2_most_common:
64 | counter_SC3_COMB=counter_SC3_COMB+1
65 | else:
66 | successful_flag[i] = False
67 |
68 | trg_labels_l3 = trg_labels_l3[:,2,:]
69 | counter_l3 = 0
70 | for i in np.arange(num_exps):
71 | cl1_labels = trg_labels_l3[cl1,i] .tolist()
72 | cl1_most_common = max(set(cl1_labels), key=cl1_labels.count)
73 | cl2_labels = trg_labels_l3[cl2,i] .tolist()
74 | cl2_most_common = max(set(cl2_labels), key=cl2_labels.count)
75 | if cl1_most_common != cl2_most_common:
76 | counter_l3=counter_l3+1
77 | else:
78 | successful_flag[i] = False
79 |
80 | # Print results (i.e. counts of successful identifications of mNP / mNFa clusters)
81 | print('Counter SC3: ', counter_SC3)
82 | print('Counter SC3 Comb: ', counter_SC3_COMB)
83 | print('Counter SC3 L3: ', counter_l3)
84 |
85 |
# pNF clusters
print('Counting the numbers for pNF clusters!')
# Identify the two clusters of interest from the consensus clustering.
# Columns 0/1 of trg_labels_all hold TargetCluster/ConcatenateCluster results;
# mixture results start at column 2, hence the res_opt_mix_ind+2 offset.
trg_labels_all = foo_for_clusterident['trg_labels']
res_opt_mix_ind = foo_for_clusterident['res_opt_mix_ind']
trg_labels = trg_labels_all[:, res_opt_mix_ind+2]
cl1 = (trg_labels == 0)
cell_names_target = foo_for_clusterident['cell_names_target']
cluster_1 = cell_names_target[cl1].flatten()
cl2 = (trg_labels == 3)
cluster_2 = cell_names_target[cl2].flatten()

# Per-repetition label matrix; axis 1 indexes the method
# (0 = SC3/TargetCluster, 1 = SC3 Comb/ConcatenateCluster, 2 = SC3 Mix/TransferCluster).
trg_labels_l3 = foo['trg_labels_reps']
data_target_preprocessed = foo['data_target']
trg_labels_SC3 = trg_labels_l3[:,0,:]
counter_SC3 = 0

# SC3 alone, TargetCluster: a repetition counts as successful when the two
# pNF clusters receive different majority labels (i.e. are kept apart).
for i in np.arange(num_exps):
    cl1_labels = trg_labels_SC3[cl1,i] .tolist()
    cl1_most_common = max(set(cl1_labels), key=cl1_labels.count)
    cl2_labels = trg_labels_SC3[cl2,i] .tolist()
    cl2_most_common = max(set(cl2_labels), key=cl2_labels.count)
    if cl1_most_common != cl2_most_common:
        counter_SC3=counter_SC3+1
    else:
        successful_flag[i] = False


# SC3 Comb, ConcatenateCluster: same success criterion as above.
trg_labels_SC3_COMB = trg_labels_l3[:,1,:]
counter_SC3_COMB = 0
for i in np.arange(num_exps):
    cl1_labels = trg_labels_SC3_COMB[cl1,i] .tolist()
    cl1_most_common = max(set(cl1_labels), key=cl1_labels.count)
    cl2_labels = trg_labels_SC3_COMB[cl2,i] .tolist()
    cl2_most_common = max(set(cl2_labels), key=cl2_labels.count)
    if cl1_most_common != cl2_most_common:
        counter_SC3_COMB=counter_SC3_COMB+1
    else:
        successful_flag[i] = False

# SC3 Mix with level 3 labels, TransferCluster: same success criterion.
trg_labels_l3 = trg_labels_l3[:,2,:]
counter_l3 = 0
for i in np.arange(num_exps):
    cl1_labels = trg_labels_l3[cl1,i] .tolist()
    cl1_most_common = max(set(cl1_labels), key=cl1_labels.count)
    cl2_labels = trg_labels_l3[cl2,i] .tolist()
    cl2_most_common = max(set(cl2_labels), key=cl2_labels.count)
    if cl1_most_common != cl2_most_common:
        counter_l3=counter_l3+1
    else:
        successful_flag[i] = False

# Print results (i.e. counts of successful identifications of pNF clusters)

print('Counter SC3: ', counter_SC3)
print('Counter SC3 Comb: ', counter_SC3_COMB)
print('Counter SC3 L3: ', counter_l3)

# pPEP clusters
print('Counting the numbers for pPEP clusters!')
# Identify the two clusters of interest from the consensus clustering
# (mixture results start at column 2, hence the +2 offset).
trg_labels_all = foo_for_clusterident['trg_labels']
res_opt_mix_ind = foo_for_clusterident['res_opt_mix_ind']
trg_labels = trg_labels_all[:, res_opt_mix_ind+2]
cl1 = (trg_labels == 5)
cell_names_target = foo_for_clusterident['cell_names_target']
cluster_1 = cell_names_target[cl1].flatten()
cl2 = (trg_labels == 2)
cluster_2 = cell_names_target[cl2].flatten()

# Per-repetition label matrix; axis 1 indexes the method
# (0 = SC3, 1 = SC3 Comb, 2 = SC3 Mix with level 3 labels).
trg_labels_l3 = foo['trg_labels_reps']
data_target_preprocessed = foo['data_target']
trg_labels_SC3 = trg_labels_l3[:,0,:]
counter_SC3 = 0

# SC3 alone, TargetCluster: a repetition counts as successful when the two
# pPEP clusters receive different majority labels.
for i in np.arange(num_exps):
    cl1_labels = trg_labels_SC3[cl1,i] .tolist()
    cl1_most_common = max(set(cl1_labels), key=cl1_labels.count)
    cl2_labels = trg_labels_SC3[cl2,i] .tolist()
    cl2_most_common = max(set(cl2_labels), key=cl2_labels.count)
    if cl1_most_common != cl2_most_common:
        counter_SC3=counter_SC3+1
    else:
        successful_flag[i] = False

# SC3 Comb, ConcatenateCluster: count repetitions in which the two pPEP
# clusters receive different majority labels (successful separation).
trg_labels_SC3_COMB = trg_labels_l3[:,1,:]
counter_SC3_COMB = 0
for i in np.arange(num_exps):
    cl1_labels = trg_labels_SC3_COMB[cl1,i] .tolist()
    cl1_most_common = max(set(cl1_labels), key=cl1_labels.count)
    cl2_labels = trg_labels_SC3_COMB[cl2,i] .tolist()
    cl2_most_common = max(set(cl2_labels), key=cl2_labels.count)
    if cl1_most_common != cl2_most_common:
        counter_SC3_COMB = counter_SC3_COMB + 1
    else:
        # BUG FIX: the `else:` was missing here, so the failure flag was set on
        # *success* and never on failure -- every other parallel loop in this
        # script marks the repetition as unsuccessful in the else branch.
        successful_flag[i] = False
# SC3 Mix with level 3 labels, TransferCluster: same success criterion.
trg_labels_l3 = trg_labels_l3[:,2,:]
counter_l3 = 0
for i in np.arange(num_exps):
    cl1_labels = trg_labels_l3[cl1,i] .tolist()
    cl1_most_common = max(set(cl1_labels), key=cl1_labels.count)
    cl2_labels = trg_labels_l3[cl2,i] .tolist()
    cl2_most_common = max(set(cl2_labels), key=cl2_labels.count)
    if cl1_most_common != cl2_most_common:
        counter_l3=counter_l3+1
    else:
        successful_flag[i] = False

# Print results (i.e. counts of successful identifications of pPEP clusters)
print('Counter SC3: ', counter_SC3)
print('Counter SC3 Comb: ', counter_SC3_COMB)
print('Counter SC3 L3: ', counter_l3)
# Indices of repetitions that separated every cluster pair in all sections above.
print(np.where(successful_flag)[0])


print('Done')
--------------------------------------------------------------------------------
/scripts/plots/main_plots_generated_data.py:
--------------------------------------------------------------------------------
1 | ###################################################
2 | ### ###
3 | ### Plot script for experiment on generated data ###
4 | ### written by Bettina Mieth, Nico Görnitz, ###
5 | ### Marina Vidovic and Alex Gutteridge ###
6 | ### ###
7 | ###################################################
8 |
9 | # Please change all directories to yours!
10 |
11 | import sys
12 | sys.path.append('/home/bmieth/scRNAseq/implementations')
13 | import matplotlib
14 | matplotlib.use('Agg')
15 | import matplotlib.pyplot as plt
16 | import numpy as np
17 | from scipy import stats
18 |
19 |
def plot_percs_optmix(fig_num, res, res_opt_mix_ind, res_opt_mix_aris, accs_desc, method_desc, percs, genes, n_src, n_trg, common, mixes):
    """Plot ARI (mean +/- SEM over repetitions) against the number of target
    cells, one subplot per cluster-overlap setting in `common`.

    Curves: TargetCluster and ConcatenateCluster baselines plus the
    TransferCluster result at the optimal mixture parameter.
    `res` has shape n_src x genes x common x acc_funcs x reps x percs x methods.
    fig_num, accs_desc, method_desc, genes, n_src and mixes are currently
    unused but kept for a uniform plotting interface.
    """
    ind_genes = 0
    ind_src = 0
    fcnt = 1
    print(common)
    for ind_common in reversed(list(range(len(common)))):
        # Baseline methods (TargetCluster and ConcatenateCluster)
        ari_1_baseline = np.mean(res[ind_src, ind_genes, ind_common, 0, :, :, 0], axis=0)
        ari_2_baseline = np.mean(res[ind_src, ind_genes, ind_common, 0, :, :, 1], axis=0)
        ste_ari_1_baseline = stats.sem(res[ind_src, ind_genes, ind_common, 0, :, :, 0], axis=0, ddof=0)
        ste_ari_2_baseline = stats.sem(res[ind_src, ind_genes, ind_common, 0, :, :, 1], axis=0, ddof=0)

        # Plot with errorbars (panels 2..4 of the 2x2 grid; slot 1 stays empty)
        plt.subplot(2, 2, fcnt+1)
        markers, caps, bars = plt.errorbar(percs, ari_1_baseline, fmt='c', yerr=ste_ari_1_baseline, linewidth=2.0)
        [bar.set_alpha(0.5) for bar in bars]
        [cap.set_alpha(0.5) for cap in caps]
        markers, caps, bars = plt.errorbar(percs, ari_2_baseline, fmt='y', yerr=ste_ari_2_baseline, linewidth=2.0)
        [bar.set_alpha(0.5) for bar in bars]
        [cap.set_alpha(0.5) for cap in caps]

        # Plot our method (TransferCluster)
        ari = np.mean(res_opt_mix_aris[ind_src, ind_genes, ind_common, :, :], axis=0)
        ste = stats.sem(res_opt_mix_aris[ind_src, ind_genes, ind_common, :, :], axis=0, ddof=0)
        markers, caps, bars = plt.errorbar(percs, ari, fmt='-b', yerr=ste, linewidth=2.0)
        [bar.set_alpha(0.5) for bar in bars]
        [cap.set_alpha(0.5) for cap in caps]
        if common[ind_common] == 0:
            plt.title('No overlap', fontsize = 22, x=0.5, y=0.9)
        elif common[ind_common] == 3:
            plt.title('Incomplete overlap', fontsize = 22, x=0.5, y=0.9)
        elif common[ind_common] == 5:
            plt.title('Complete overlap', fontsize = 22, x=0.5, y=0.9)
        plt.ylabel('ARI', fontsize=16)

        plt.xlabel('Target cells', fontsize=16)
        plt.xlim([np.min(percs), np.max(percs)])
        # Drop the second tick to avoid overlapping x-tick labels.
        percs_now = np.delete(percs, 1)
        # BUG FIX: np.int was removed in NumPy 1.24; the builtin int is identical here.
        plt.xticks(percs_now, np.array(percs_now * n_trg, dtype=int), fontsize=13)
        plt.ylim([0., 1.])
        plt.yticks(fontsize=13)
        fcnt += 1

    plt.legend(['TargetCluster', 'ConcatenateCluster', 'TransferCluster'], fontsize=13, loc=4)
64 |
65 |
if __name__ == "__main__":

    # Figure direction to save to
    fname_plot ='/home/bmieth/scRNAseq/results/toy_data_final/main_results_toydata_figure_'
    # Data location - please change directory to yours
    foo = np.load('/home/bmieth/scRNAseq/results/toy_data_final/main_results_toydata.npz')

    # Load experiment arrays from the .npz archive
    methods = foo['methods']
    acc_funcs = foo['acc_funcs']
    res = foo['res']  # n_src x genes x common x acc_funcs x reps x percs x methods
    res_opt_mix_ind = foo['res_opt_mix_ind']
    res_opt_mix_aris = foo['res_opt_mix_aris']
    source_aris = foo['source_aris'] # n_src x genes x common x reps
    accs_desc = foo['accs_desc']
    print(accs_desc)
    method_desc = foo['method_desc']
    percs = foo['percs']
    # reps = foo['reps']
    genes = foo['genes']
    n_src = foo['n_src']
    n_trg = foo['n_trg']
    common = foo['common']
    mixes = foo['mixes']
    print('n_src x genes x common x acc_funcs x reps x percs x methods')
    print('Result dimensionality: ', res.shape)
    print('n_src x genes x common x reps x percs')
    print('Result optimal mixture parameter', res_opt_mix_ind.shape)

    # Plot ARI-vs-target-cells panels (one per overlap setting) and save as JPEG
    fig = plt.figure(figsize=(16,12))
    plot_percs_optmix(1, res, res_opt_mix_ind,res_opt_mix_aris, accs_desc, method_desc, percs, genes, n_src, n_trg, common, mixes)
    plt.savefig(fname_plot+'1'+'.jpg')
    print('Done')
100 |
--------------------------------------------------------------------------------
/scripts/plots/main_plots_hockley.py:
--------------------------------------------------------------------------------
1 | ###################################################
2 | ### ###
3 | ### Plot script for experiment on Hockley data ###
4 | ### written by Bettina Mieth, Nico Görnitz, ###
5 | ### Marina Vidovic and Alex Gutteridge ###
6 | ### ###
7 | ###################################################
8 |
9 | # Please change all directories to yours!
10 |
11 | import sys
12 | sys.path.append('/home/bmieth/scRNAseq/implementations')
13 | import matplotlib
14 | matplotlib.use('Agg')
15 | import matplotlib.pyplot as plt
16 | import numpy as np
17 | from sklearn.manifold import TSNE
18 | from sklearn.decomposition import PCA
19 |
20 |
def plot_tsne(data_matrix, labels, method_string):
    """Draw a 2-D t-SNE embedding of the cells (columns of data_matrix),
    scattering each cluster with its own marker/color and titling the axes
    with the method name."""
    # Fixed marker/color cycles, one entry consumed per cluster.
    marker_cycle = [".", ",", "o", "v", "^", "<", ">", ".", ",", "o", "v", "^", "<", ">"]
    color_cycle = ['r', 'g', 'b', 'c', 'm', 'y', 'k', 'k', 'y', 'm', 'b', 'c', 'g', 'r']
    cluster_ids = np.unique(labels)

    embedder = TSNE(n_components=2, random_state=0, init='pca', metric='euclidean', perplexity=30, method='exact')
    embedding = embedder.fit_transform(data_matrix.T)
    for idx, cluster_id in enumerate(cluster_ids):
        member = labels == cluster_id
        plt.scatter(embedding[member, 0], embedding[member, 1], 20,
                    marker=marker_cycle[idx], color=color_cycle[idx], label=idx)
    plt.title(method_string)
33 |
34 |
if __name__ == "__main__":
    # Figure direction to save to
    fname_plot ='/home/bmieth/scRNAseq/results/jims_data/final_for_pub_k7/jimtarget_usoskinsource_figure_'
    # Experimental results location - please change directory to yours
    foo_NMF = np.load('/home/bmieth/scRNAseq/results/jims_data/final_for_pub_k7/jimtarget_usoskinsource_NMFlabels.npz')
    foo_l1 = np.load('/home/bmieth/scRNAseq/results/jims_data/final_for_pub_k7/jimtarget_usoskinsource_level1labels.npz')
    foo_l2 = np.load('/home/bmieth/scRNAseq/results/jims_data/final_for_pub_k7/jimtarget_usoskinsource_level2labels.npz')
    foo_l3 = np.load('/home/bmieth/scRNAseq/results/jims_data/final_for_pub_k7/jimtarget_usoskinsource_level3labels.npz')

    # Data location - please change directory to yours
    fname_data_target = '/home/bmieth/scRNAseq/data/Jim/Visceraltpm_m_fltd_mat.tsv'
    fname_data_source = '/home/bmieth/scRNAseq/data/usoskin/usoskin_m_fltd_mat.tsv'

    ## TSNE plots of results: 2x3 grid, one panel per method/label level.
    # trg_labels columns: 0 = TargetCluster, 1 = ConcatenateCluster,
    # mixture results start at column 2 (hence res_opt_mix_ind+2 below).
    # TargetCluster
    trg_labels = foo_NMF['trg_labels']
    data_target_preprocessed = foo_NMF['data_target']
    fig = plt.figure(figsize=(16,12))
    plt.subplot(2,3,1)
    plot_tsne(data_target_preprocessed, trg_labels[:, 0], method_string = 'TargetCluster')

    ## SC3 comb results, ConcatenateCluster
    trg_labels = foo_NMF['trg_labels']
    data_target_preprocessed = foo_NMF['data_target']
    plt.subplot(2,3,2)
    plot_tsne(data_target_preprocessed, trg_labels[:, 1], method_string = 'ConcatenateCluster')

    # SC3 Mix with NMF labels (TransferCluster with NMF labels)
    trg_labels = foo_NMF['trg_labels']
    data_target_preprocessed = foo_NMF['data_target']
    res_opt_mix_ind = foo_NMF['res_opt_mix_ind']
    plt.subplot(2,3,3)
    plot_tsne(data_target_preprocessed, trg_labels[:, res_opt_mix_ind+2], method_string = 'TransferCluster with NMF labels')

    ## SC3 Mix with level 1 labels (TransferCluster with level 1 labels)
    trg_labels = foo_l1['trg_labels']
    data_target_preprocessed = foo_l1['data_target']
    res_opt_mix_ind = foo_l1['res_opt_mix_ind']
    plt.subplot(2,3,4)
    plot_tsne(data_target_preprocessed, trg_labels[:, res_opt_mix_ind+2], method_string = 'TransferCluster with level 1 labels')

    ## SC3 Mix with level 2 labels (TransferCluster with level 2 labels)
    trg_labels = foo_l2['trg_labels']
    data_target_preprocessed = foo_l2['data_target']
    res_opt_mix_ind = foo_l2['res_opt_mix_ind']
    plt.subplot(2,3,5)
    plot_tsne(data_target_preprocessed, trg_labels[:, res_opt_mix_ind+2], method_string = 'TransferCluster with level 2 labels')

    ## SC3 Mix with level 3 labels (TransferCluster with level 3 labels)
    trg_labels = foo_l3['trg_labels']
    data_target_preprocessed = foo_l3['data_target']
    res_opt_mix_ind = foo_l3['res_opt_mix_ind']
    plt.subplot(2,3,6)
    plot_tsne(data_target_preprocessed, trg_labels[:, res_opt_mix_ind+2], method_string = 'TransferCluster with level 3 labels')

    plt.savefig(fname_plot+'S9.jpg')

    print('Done')
93 |
--------------------------------------------------------------------------------
/scripts/plots/main_plots_hockley_magic.py:
--------------------------------------------------------------------------------
1 | ###################################################
2 | ### ###
3 | ### Plot script for experiment on Hockley data ###
4 | ### using MAGIC pre-processed data ###
5 | ### written by Bettina Mieth, Nico Görnitz, ###
6 | ### Marina Vidovic and Alex Gutteridge ###
7 | ### ###
8 | ###################################################
9 |
10 | # Please change all directories to yours!
11 |
12 | import sys
13 | sys.path.append('/home/bmieth/scRNAseq/implementations')
14 | import matplotlib
15 | matplotlib.use('Agg')
16 | import matplotlib.pyplot as plt
17 | import numpy as np
18 | from sklearn.manifold import TSNE
19 | from sklearn.decomposition import PCA
20 |
21 |
def plot_tsne(data_matrix, labels, method_string):
    """Scatter a 2-D t-SNE embedding of the cells (columns of data_matrix),
    one marker/color per cluster label, titled with the method name."""
    # Marker/color cycles indexed by cluster position.
    marker_list = [".", ",", "o", "v", "^", "<", ">", ".", ",", "o", "v", "^", "<", ">"]
    color_list = ['r', 'g', 'b', 'c', 'm', 'y', 'k', 'k', 'y', 'm', 'b', 'c', 'g', 'r']
    unique_labels = np.unique(labels)

    tsne = TSNE(n_components=2, random_state=0, init='pca', metric='euclidean', perplexity=30, method='exact')
    coords = tsne.fit_transform(data_matrix.T)
    for pos, lbl in enumerate(unique_labels):
        sel = labels == lbl
        plt.scatter(coords[sel, 0], coords[sel, 1], 20,
                    marker=marker_list[pos], color=color_list[pos], label=pos)
    # Title intentionally omits the cluster count; no legend is drawn here.
    plt.title(method_string)
36 |
37 |
if __name__ == "__main__":
    # Figure direction to save to
    fname_plot ='/home/bmieth/scRNAseq/results/jims_data/magic/jimtarget_usoskinsource_magic'
    # Experimental results location - please change directory to yours
    foo_l3 = np.load('/home/bmieth/scRNAseq/results/jims_data/magic/jimtarget_usoskinsource_magic_without_filter.npz')
    # Data location - please change directory to yours (embedding is computed on
    # the non-MAGIC preprocessed target data for comparability)
    foo_data = np.load('/home/bmieth/scRNAseq/results/jims_data/final_for_pub_k7/jimtarget_usoskinsource_NMFlabels.npz')
    data_target = foo_data['data_target']

    ## TSNE plots of results: 1x3 grid.
    # trg_labels columns: 0 = TargetCluster, 1 = ConcatenateCluster,
    # mixture results start at column 2 (hence res_opt_mix_ind+2 below).
    # SC3 results (TargetCluster)
    trg_labels = foo_l3['trg_labels']
    fig = plt.figure(figsize=(16,6))
    plt.subplot(1,3,1)
    plot_tsne(data_target, trg_labels[:, 0], method_string = 'TargetCluster')

    ## SC3 comb results (ConcatenateCluster)
    plt.subplot(1,3,2)
    plot_tsne(data_target, trg_labels[:, 1], method_string = 'ConcatenateCluster')

    ## TransferCluster with level 3 labels
    res_opt_mix_ind = foo_l3['res_opt_mix_ind']
    plt.subplot(1,3,3)
    plot_tsne(data_target, trg_labels[:, res_opt_mix_ind+2], method_string = 'TransferCluster with level 3 labels')

    plt.savefig(fname_plot+'.jpg')

    print('Done')
66 |
--------------------------------------------------------------------------------
/scripts/plots/main_plots_hockley_robustness.py:
--------------------------------------------------------------------------------
1 | ###################################################
2 | ### ###
3 | ### Plot script for robustness experiment ###
4 | ### on Hockley data ###
5 | ### written by Bettina Mieth, Nico Görnitz, ###
6 | ### Marina Vidovic and Alex Gutteridge ###
7 | ### ###
8 | ###################################################
9 |
10 | # Please change all directories to yours!
11 |
12 | import sys
13 | sys.path.append('/home/bmieth/scRNAseq/implementations')
14 | import matplotlib
15 | matplotlib.use('Agg')
16 | import matplotlib.pyplot as plt
17 | import numpy as np
18 | from sklearn.manifold import TSNE
19 | from sklearn.decomposition import PCA
20 |
21 |
def plot_tsne(data_matrix, labels, method_string):
    """t-SNE scatter of the cells (columns of data_matrix), one marker/color
    per cluster; the title reports the method and the cluster count, and a
    legend maps cluster indices to markers."""
    # Marker/color cycles indexed by cluster position.
    marker_cycle = [".", ",", "o", "v", "^", "<", ">", ".", ",", "o", "v", "^", "<", ">", "."]
    color_cycle = ['r', 'g', 'b', 'c', 'm', 'y', 'k', 'k', 'y', 'm', 'b', 'c', 'g', 'r']
    cluster_ids = np.unique(labels)

    embedding = TSNE(n_components=2, random_state=0, init='pca', metric='euclidean',
                     perplexity=30, method='exact').fit_transform(data_matrix.T)
    for idx, cluster_id in enumerate(cluster_ids):
        member = labels == cluster_id
        plt.scatter(embedding[member, 0], embedding[member, 1], 20,
                    marker=marker_cycle[idx], color=color_cycle[idx], label=idx)
    plt.title(method_string + ', {0} cluster'.format(len(cluster_ids)))
    plt.legend()
35 |
if __name__ == "__main__":
    # Figure direction to save to
    fname_plot ='/home/bmieth/scRNAseq/results/jims_data/multiple_reps/jimtarget_usoskinsource_k7_1000reps_figure_'
    # Experimental results location - please change directory to yours
    foo_NMF = np.load('/home/bmieth/scRNAseq/results/jims_data/multiple_reps/jimtarget_usoskinsource_NMFlabels_k7_1000reps.npz')
    foo_l1 = np.load('/home/bmieth/scRNAseq/results/jims_data/multiple_reps/jimtarget_usoskinsource_level1labels_k7_1000reps.npz')
    foo_l2 = np.load('/home/bmieth/scRNAseq/results/jims_data/multiple_reps/jimtarget_usoskinsource_level2labels_k7_1000reps.npz')
    foo_l3 = np.load('/home/bmieth/scRNAseq/results/jims_data/multiple_reps/jimtarget_usoskinsource_level3labels_k7_1000reps.npz')

    # Data location - please change directory to yours
    fname_data_target = '/home/bmieth/scRNAseq/data/Jim/Visceraltpm_m_fltd_mat.tsv'
    fname_data_source = '/home/bmieth/scRNAseq/data/usoskin/usoskin_m_fltd_mat.tsv'

    # TSNE plots of results: 2x3 grid, one panel per method; each panel shows
    # the consensus clustering over the 1000 repetitions.
    # SC3 results, Consensus results for TargetCluster
    trg_labels = foo_l3['cons_clustering_sc3']
    data_target_preprocessed = foo_NMF['data_target']
    fig = plt.figure(figsize=(16,12))
    plt.subplot(2,3,1)
    plot_tsne(data_target_preprocessed, trg_labels, method_string = 'SC3')

    # SC3 comb results, Consensus results for ConcatenateCluster
    trg_labels = foo_l3['cons_clustering_sc3_comb']
    data_target_preprocessed = foo_NMF['data_target']
    plt.subplot(2,3,2)
    plot_tsne(data_target_preprocessed, trg_labels, method_string = 'SC3 Comb')

    # SC3 Mix with NMF labels, Consensus results for TransferCluster with NMF labels
    trg_labels = foo_NMF['cons_clustering_sc3_mix']
    data_target_preprocessed = foo_NMF['data_target']
    plt.subplot(2,3,3)
    plot_tsne(data_target_preprocessed, trg_labels, method_string = 'SC3 Mix with NMF labels')

    ## SC3 Mix with level 1 labels, Consensus results for TransferCluster with level 1 labels
    trg_labels = foo_l1['cons_clustering_sc3_mix']
    data_target_preprocessed = foo_l1['data_target']
    plt.subplot(2,3,4)
    plot_tsne(data_target_preprocessed, trg_labels, method_string = 'SC3 Mix with level 1 labels')

    ## SC3 Mix with level 2 labels, Consensus results for TransferCluster with level 2 labels
    trg_labels = foo_l2['cons_clustering_sc3_mix']
    data_target_preprocessed = foo_l2['data_target']
    plt.subplot(2,3,5)
    plot_tsne(data_target_preprocessed, trg_labels, method_string = 'SC3 Mix with level 2 labels')

    # SC3 Mix with level 3 labels, Consensus results for TransferCluster with level 3 labels
    trg_labels = foo_l3['cons_clustering_sc3_mix']
    data_target_preprocessed = foo_l3['data_target']
    plt.subplot(2,3,6)
    plot_tsne(data_target_preprocessed, trg_labels, method_string = 'SC3 Mix with level 3 labels')

    plt.savefig(fname_plot+'tsne_plots.jpg')

    print('Done')
90 |
--------------------------------------------------------------------------------
/scripts/plots/main_plots_hockley_seurat.py:
--------------------------------------------------------------------------------
1 | ###################################################
2 | ### ###
3 | ### Plot script for experiment on Hockley data ###
4 | ### using Seurat pre-processed data ###
5 | ### written by Bettina Mieth, Nico Görnitz, ###
6 | ### Marina Vidovic and Alex Gutteridge ###
7 | ### ###
8 | ###################################################
9 |
10 | # Please change all directories to yours!
11 |
12 | import sys
13 | sys.path.append('/home/bmieth/scRNAseq/implementations')
14 | import matplotlib
15 | matplotlib.use('Agg')
16 | import matplotlib.pyplot as plt
17 | import numpy as np
18 | from sklearn.manifold import TSNE
19 | from sklearn.decomposition import PCA
20 |
21 |
def plot_tsne(data_matrix, labels, method_string):
    """Render a 2-D t-SNE embedding of the cells (columns of data_matrix),
    scattering each cluster with its own marker/color; the title shows only
    the method name (no cluster count, no legend)."""
    # Marker/color cycles indexed by cluster position.
    markers_by_pos = [".", ",", "o", "v", "^", "<", ">", ".", ",", "o", "v", "^", "<", ">"]
    colors_by_pos = ['r', 'g', 'b', 'c', 'm', 'y', 'k', 'k', 'y', 'm', 'b', 'c', 'g', 'r']
    distinct_labels = np.unique(labels)

    projector = TSNE(n_components=2, random_state=0, init='pca', metric='euclidean', perplexity=30, method='exact')
    points = projector.fit_transform(data_matrix.T)
    for pos, lbl in enumerate(distinct_labels):
        in_cluster = labels == lbl
        plt.scatter(points[in_cluster, 0], points[in_cluster, 1], 20,
                    marker=markers_by_pos[pos], color=colors_by_pos[pos], label=pos)
    plt.title(method_string)
36 |
37 |
if __name__ == "__main__":
    # Figure direction to save to
    fname_plot ='/home/bmieth/scRNAseq/results/jims_data/seurat/jimtarget_usoskinsource_2_'
    # Experimental results location - please change directory to yours
    foo_l3 = np.load('/home/bmieth/scRNAseq/results/jims_data/seurat/jimtarget_usoskinsource_level3labels_k7_after_seurat_new.npz')
    # Data location - please change directory to yours (embedding is computed on
    # the non-Seurat preprocessed target data for comparability)
    foo_data = np.load('/home/bmieth/scRNAseq/results/jims_data/final_for_pub_k7/jimtarget_usoskinsource_NMFlabels.npz')
    data_target = foo_data['data_target']

    ## TSNE plots of results: 1x3 grid.
    # trg_labels columns: 0 = TargetCluster, 1 = ConcatenateCluster,
    # mixture results start at column 2 (hence res_opt_mix_ind+2 below).
    # SC3 results (TargetCluster)
    trg_labels = foo_l3['trg_labels']
    fig = plt.figure(figsize=(16,6))
    plt.subplot(1,3,1)
    plot_tsne(data_target, trg_labels[:, 0], method_string = 'TargetCluster')

    ## SC3 comb results (ConcatenateCluster)
    plt.subplot(1,3,2)
    plot_tsne(data_target, trg_labels[:, 1], method_string = 'ConcatenateCluster')

    # SC3 Mix with level 3 labels (TransferCluster with level 3 labels)
    res_opt_mix_ind = foo_l3['res_opt_mix_ind']
    plt.subplot(1,3,3)
    plot_tsne(data_target, trg_labels[:, res_opt_mix_ind+2], method_string = 'TransferCluster with level 3 labels')

    plt.savefig(fname_plot+'Seurat.jpg')

    print('Done')
66 |
--------------------------------------------------------------------------------
/scripts/plots/main_plots_tasic.py:
--------------------------------------------------------------------------------
1 | ###################################################
2 | ### ###
3 | ### Plot script for experiments on Tasic data ###
4 | ### written by Bettina Mieth, Nico Görnitz, ###
5 | ### Marina Vidovic and Alex Gutteridge ###
6 | ### ###
7 | ###################################################
8 |
9 | # Please change all directories to yours!
10 | import sys
11 | sys.path.append('/home/bmieth/scRNAseq/implementations')
12 | import matplotlib
13 | matplotlib.use('Agg')
14 | import matplotlib.pyplot as plt
15 | import numpy as np
16 | import pdb
17 | from scipy import stats
18 | import pandas as pd
19 | from scipy import stats
20 |
21 |
def plot_main_opt_mix(fig_num, res, res_opt_mix_ind,res_opt_mix_aris, accs_desc, method_desc, percs, genes, n_src, n_trg, mixes, overlap_setting, source_label_setting):
    """Draw one ARI-vs-target-cells panel into the current subplot.

    Curves: TargetCluster and ConcatenateCluster baselines plus the
    TransferCluster result at the optimal mixture parameter, each with SEM
    errorbars over repetitions. `res` has shape
    n_src x acc_funcs x reps x percs x methods (first axis already sliced to
    one source setting). overlap_setting / source_label_setting select the
    title and caption text. accs_desc, method_desc, genes, n_src and mixes
    are currently unused but kept for a uniform plotting interface.
    """
    # Setting up plot
    ind_src = 0
    plt.figure(fig_num)

    # Baseline methods (TargetCluster and ConcatenateCluster)
    ari_1_baseline = np.mean(res[ind_src, 0, :, :, 0], axis=0)
    ari_2_baseline = np.mean(res[ind_src, 0, :, :, 1], axis=0)
    # Standard errors
    ste_ari_1_baseline = stats.sem(res[ind_src, 0, :, :, 0], axis=0, ddof=0)
    ste_ari_2_baseline = stats.sem(res[ind_src, 0, :, :, 1], axis=0, ddof=0)

    # Plot with errorbars
    markers, caps, bars = plt.errorbar(percs, ari_1_baseline, fmt='c', yerr=ste_ari_1_baseline, linewidth=2.0)
    [bar.set_alpha(0.5) for bar in bars]
    [cap.set_alpha(0.5) for cap in caps]
    markers, caps, bars = plt.errorbar(percs, ari_2_baseline, fmt='y', yerr=ste_ari_2_baseline, linewidth=2.0)
    [bar.set_alpha(0.5) for bar in bars]
    [cap.set_alpha(0.5) for cap in caps]

    # Plot our method (TransferCluster)
    ari = np.mean(res_opt_mix_aris[ind_src, :, :], axis=0)
    ste = stats.sem(res_opt_mix_aris[ind_src, :, :], axis=0, ddof=0)
    markers, caps, bars = plt.errorbar(percs, ari, fmt='-b', yerr=ste, linewidth=2.0)

    [bar.set_alpha(0.5) for bar in bars]
    [cap.set_alpha(0.5) for cap in caps]
    if overlap_setting == 0:
        plt.title('Complete overlap', fontsize=22, x=0.5, y=0.93)
    else:
        plt.title('Incomplete overlap', fontsize=22, x=0.5, y=0.93)
    if source_label_setting == 0:
        plt.text( x=0.15, y=0.88, s='Ground truth labels from NMF clustering', fontsize= 14)
    else:
        plt.text( x=0.15, y=0.88, s='Ground truth labels from original publication', fontsize= 14)
    # BUG FIX: removed a leftover unconditional plt.text() that always drew the
    # 'NMF clustering' caption on top of the branch-specific one above, making
    # the source_label_setting distinction invisible.
    plt.xlabel('Target cells', fontsize=16)
    plt.ylabel('ARI', fontsize=16)
    plt.xlim([np.min(percs), np.max(percs)])
    # BUG FIX: np.int was removed in NumPy 1.24; the builtin int is identical here.
    plt.xticks(percs, np.array(percs * n_trg, dtype=int), fontsize=13)
    plt.yticks(fontsize=13)
    plt.ylim([0.0, 1.0])
    plt.legend(['TargetCluster', 'ConcatenateCluster', 'TransferCluster'], fontsize=13, loc=4)
65 |
if __name__ == "__main__":
    # Figure direction to save to
    fname_plot ='/home/bmieth/scRNAseq/results/mouse_data_final/main_results_mouse_all_four'
    # Location of experimental results - change to yours
    foo_com_orig = np.load('/home/bmieth/scRNAseq/results/mouse_data_final/main_results_mouse_18clusters_completeoverlap.npz')
    foo_incom_orig = np.load('/home/bmieth/scRNAseq/results/mouse_data_final/main_results_mouse_18clusters_incompleteoverlap.npz')
    foo_com_NMF = np.load('/home/bmieth/scRNAseq/results/mouse_data_NMF_final/main_results_mouse_NMFlabels_18cluster_completeoverlap.npz')
    foo_incom_NMF = np.load('/home/bmieth/scRNAseq/results/mouse_data_NMF_final/main_results_mouse_NMFlabels_18cluster_incompleteoverlap.npz')

    def draw_panel(archive, subplot_idx, overlap_setting, source_label_setting):
        # One figure panel: unpack a result archive and hand it to plot_main_opt_mix.
        # archive['res'] layout: n_src x genes x common x acc_funcs x reps x percs x methods.
        plt.subplot(2, 2, subplot_idx)
        plot_main_opt_mix(1, archive['res'], archive['res_opt_mix_ind'],
                          archive['res_opt_mix_aris'], archive['accs_desc'],
                          archive['method_desc'], archive['percs'], archive['genes'],
                          archive['n_src'], archive['n_trg'], archive['mixes'],
                          overlap_setting=overlap_setting,
                          source_label_setting=source_label_setting)

    # Four panels: {complete, incomplete} overlap x {NMF, original} source labels.
    # (Refactored: the four identical load-and-plot sequences are now one helper.)
    fig = plt.figure(figsize=(16,16))
    draw_panel(foo_com_NMF, 1, overlap_setting=0, source_label_setting=0)
    draw_panel(foo_incom_NMF, 2, overlap_setting=1, source_label_setting=0)
    draw_panel(foo_com_orig, 3, overlap_setting=0, source_label_setting=1)
    draw_panel(foo_incom_orig, 4, overlap_setting=1, source_label_setting=1)
    plt.savefig(fname_plot+'.jpg')

    print('Done')
142 |
143 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
# Prefer setuptools; fall back to plain distutils when it is unavailable.
try:
    from setuptools import setup
except ImportError:
    from distutils.core import setup

# Package metadata and build configuration for the scRNA toolbox.
setup(
    name='scRNA',
    description='Single-cell RNA-seq multitask clustering toolbox',
    url='https://github.com/nicococo/scRNA',
    author='Nico Goernitz, Bettina Mieth, Marina Vidovic, Alex Gutteridge',
    author_email='nico.goernitz@tu-berlin.de',
    version='2019.08',
    install_requires=['nose', 'scikit-learn', 'numpy', 'scipy', 'matplotlib', 'pandas'],
    packages=['scRNA'],
    package_dir={'scRNA': 'scRNA'},
    # package_data={'scRNA': ['gene_names.txt']},
    scripts=['bin/scRNA-source.sh', 'bin/scRNA-target.sh', 'bin/scRNA-generate-data.sh'],
    classifiers=['Intended Audience :: Science/Research',
                 'Programming Language :: Python',
                 'Topic :: Scientific/Engineering',
                 'Operating System :: POSIX',
                 'Operating System :: Unix',
                 'Operating System :: MacOS',
                 'Programming Language :: Python :: 3'])
28 |
--------------------------------------------------------------------------------
/tests/test_transfer.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import numpy as np
3 |
4 | from functools import partial
5 |
6 | from scRNA.nmf_clustering import NmfClustering, DaNmfClustering
7 | from scRNA.sc3_clustering_impl import cell_filter, gene_filter
8 |
9 |
class MyTestCase(unittest.TestCase):
    """Tests for NMF source clustering and domain-adaptation transfer.

    Note: ``dtype=np.float`` was replaced by the builtin ``float`` — the
    ``np.float`` alias was deprecated in NumPy 1.20 and removed in 1.24,
    so the original code raises AttributeError on current NumPy.
    """

    def test_source_preprocessing(self):
        """Cell and gene filters should drop the all-zero gene row and the
        all-zero cell column before clustering."""
        src = [
            [1, 2, 0, 0, 0],
            [0, 0, 10, 11, 0],
            [0, 0, 0, 0, 0]
        ]
        # np.float was removed in NumPy 1.24; use the builtin float instead.
        src = np.array(src, dtype=float)
        print(src)

        ids = np.array(['lbl0', 'lbl1', 'lbl2'])
        nmf = NmfClustering(src, ids, 2)
        nmf.add_cell_filter(partial(cell_filter, num_expr_genes=1, non_zero_threshold=1))
        nmf.add_gene_filter(partial(gene_filter, perc_consensus_genes=0.96, non_zero_threshold=1))
        nmf.apply()

        print(nmf.pp_data)
        print(nmf.remain_cell_inds)
        print(nmf.remain_gene_inds)

        print(nmf.cluster_labels)
        # Only the first 2 genes and first 4 cells carry signal and survive
        # preprocessing.
        np.testing.assert_array_equal(nmf.pp_data, src[:2, :4])
        np.testing.assert_array_equal(nmf.remain_gene_inds, np.arange(2))
        np.testing.assert_array_equal(nmf.remain_cell_inds, np.arange(4))

    def test_target(self):
        """Smoke test: transfer a fitted source NMF model onto target data
        with mix=0 and inspect the intermediate factorization."""
        src = [
            [1, 2, 0, 0, 0],
            [0, 0, 10, 11, 0],
            [1, 2, 1, 2, 0],
            [0, 0, 0, 0, 0]
        ]
        src = np.array(src, dtype=float)
        trg = [
            [2, 4, 6, 6, 0],
            [2, 1, 2, 2, 0],
            [1.1, 2.3, 1.2, 2.1, 0],
            [0, 0, 0, 0, 0]
        ]
        trg = np.array(trg, dtype=float)

        ids = np.array(['lbl0', 'lbl1', 'lbl2', 'lbl3'])
        nmf = NmfClustering(src, ids, 2)
        nmf.add_cell_filter(partial(cell_filter, num_expr_genes=1, non_zero_threshold=1))
        nmf.add_gene_filter(partial(gene_filter, perc_consensus_genes=0.96, non_zero_threshold=1))
        nmf.apply()

        # 'lbl20' deliberately has no counterpart in the source gene ids.
        trg_ids = np.array(['lbl0', 'lbl1', 'lbl20', 'lbl3'])
        da_nmf = DaNmfClustering(nmf, trg, trg_ids, 2)
        da_nmf.add_cell_filter(partial(cell_filter, num_expr_genes=1, non_zero_threshold=1))
        da_nmf.add_gene_filter(partial(gene_filter, perc_consensus_genes=0.96, non_zero_threshold=1))
        # mix=0.: the mixed data should be driven by the target alone.
        mixed, _, _ = da_nmf.get_mixed_data(mix=0.)

        print('-------------')
        print((da_nmf.src.pp_data))
        print('-------------')
        print((da_nmf.pp_data))
        print('-------------')
        print(mixed)
        print('-------------')
        # W.dot(H) reconstructs the target data from the source dictionary W.
        W, H, H2 = da_nmf.intermediate_model
        print((W.dot(H)))

        print('-------------')
        print((da_nmf.remain_gene_inds))
        print((da_nmf.src.remain_gene_inds))

        # TODO(review): add concrete assertions; currently this test only
        # verifies that the transfer pipeline runs without raising.
        # np.testing.assert_array_equal(nmf.pp_data, src[:2,:4])
        # np.testing.assert_array_equal(nmf.remain_gene_inds, np.arange(2))
        # np.testing.assert_array_equal(nmf.remain_cell_inds, np.arange(4))
89 |
90 |
# Allow running this test module directly: python tests/test_transfer.py
if __name__ == '__main__':
    unittest.main()
93 |
--------------------------------------------------------------------------------