├── out └── .DS_Store ├── Makefile ├── src ├── sp.cpp ├── correlation.cpp ├── scoup_resume.cpp ├── scoup.cpp ├── sp.h ├── node.h └── ou.h ├── data └── init.txt └── README.md /out/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hmatsu1226/SCOUP/HEAD/out/.DS_Store -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | #Makefile 2 | all: scoup scoup_resume cor sp 3 | scoup: src/scoup.cpp src/ou.h src/node.h 4 | g++ -o scoup src/scoup.cpp 5 | scoup_resume: src/scoup_resume.cpp src/ou.h src/node.h 6 | g++ -o scoup_resume src/scoup_resume.cpp 7 | cor: src/correlation.cpp src/ou.h src/node.h 8 | g++ -o cor src/correlation.cpp 9 | sp: src/sp.cpp src/sp.h 10 | g++ -o sp src/sp.cpp -lblas -llapack -------------------------------------------------------------------------------- /src/sp.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "sp.h" 6 | 7 | using namespace std; 8 | 9 | int main(int argc, char* argv[]){ 10 | FILE *fin_expression, *fin_initial, *fout_pseudo_time, *fout_mst, *fout_pca; 11 | if((fin_expression=fopen(argv[1], "r")) == NULL){ 12 | printf("cannot open expression data\n"); 13 | return 1; 14 | } 15 | if((fin_initial=fopen(argv[2], "r")) == NULL){ 16 | printf("cannot open initial distribution data\n"); 17 | return 1; 18 | } 19 | if((fout_pseudo_time=fopen(argv[3], "w")) == NULL){ 20 | printf("cannot open Output_file1 (pseudo-time)\n"); 21 | return 1; 22 | } 23 | if((fout_pca=fopen(argv[4], "w")) == NULL){ 24 | printf("cannot open Output_file2 (PCA)\n"); 25 | return 1; 26 | } 27 | 28 | int gene_num = atoi(argv[5]); 29 | int cell_num = atoi(argv[6]); 30 | int dim = atoi(argv[7]); 31 | 32 | Pseudo_Time PT(gene_num, cell_num, dim); 33 | 
PT.Set_expression(fin_expression); 34 | PT.Set_initial_parameter(fin_initial); 35 | 36 | //Prim 37 | PT.Prim(fout_pseudo_time, fout_pca); 38 | 39 | return 0; 40 | } -------------------------------------------------------------------------------- /src/correlation.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "ou.h" 7 | 8 | using namespace std; 9 | 10 | int main(int argc, char* argv[]){ 11 | int K = 1; 12 | 13 | int ch; 14 | extern char *optarg; 15 | extern int optind, opterr; 16 | while((ch = getopt(argc, argv, "k:")) != -1){ 17 | switch(ch){ 18 | case 'k': 19 | K = atoi(optarg); 20 | break; 21 | case ':': 22 | printf("invalid option\n"); 23 | return 1; 24 | case '?': 25 | printf("invalid option\n"); 26 | return 1; 27 | } 28 | } 29 | 30 | FILE *fin_expression, *fin_init, *fin_gene_para, *fin_cell_para, *fout_nexp, *fout_cor; 31 | if((fin_expression=fopen(argv[1], "r")) == NULL){ 32 | printf("cannot open expression data\n"); 33 | return 1; 34 | } 35 | if((fin_init=fopen(argv[2], "r")) == NULL){ 36 | printf("cannot open initial distribution data\n"); 37 | return 1; 38 | } 39 | if((fin_gene_para=fopen(argv[3], "r")) == NULL){ 40 | printf("cannot open gene and lineage parameters data\n"); 41 | return 1; 42 | } 43 | if((fin_cell_para=fopen(argv[4], "r")) == NULL){ 44 | printf("cannot open cell parameters data\n"); 45 | return 1; 46 | } 47 | if((fout_nexp=fopen(argv[5], "w")) == NULL){ 48 | printf("cannot open Output_file1 (normalized expression)\n"); 49 | return 1; 50 | } 51 | if((fout_cor=fopen(argv[6], "w")) == NULL){ 52 | printf("cannot open Output_file2 (correlation matrix)\n"); 53 | return 1; 54 | } 55 | 56 | int gene_num = atoi(argv[7]); 57 | int cell_num = atoi(argv[8]); 58 | 59 | Continuous_OU_process OU(gene_num, cell_num, K); 60 | 61 | OU.Set_expression(fin_expression); 62 | if(OU.Set_initial_parameter(fin_init) == 1){ 63 | return 1; 64 | } 65 | 
if(OU.Set_optimized_parameter(fin_gene_para, fin_cell_para) == 1){ 66 | return 1; 67 | } 68 | 69 | //calc normalized expression 70 | OU.Print_correlation(fout_nexp, fout_cor); 71 | 72 | return 0; 73 | } -------------------------------------------------------------------------------- /src/scoup_resume.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "ou.h" 7 | 8 | using namespace std; 9 | 10 | int main(int argc, char* argv[]){ 11 | int K, max_ite1, max_ite2; 12 | double alpha_min, alpha_max, t_min, t_max, sigma_squared_min, thresh; 13 | K = 1; 14 | max_ite1 = 100; 15 | max_ite2 = 100; 16 | alpha_min = 0.1; 17 | alpha_max = 100; 18 | t_min = 0.001; 19 | t_max = 2.0; 20 | sigma_squared_min = 0.1; 21 | thresh = 0.01; 22 | 23 | int ch; 24 | extern char *optarg; 25 | extern int optind, opterr; 26 | while((ch = getopt(argc, argv, "k:m:M:a:A:t:T:s:e:")) != -1){ 27 | switch(ch){ 28 | case 'k': 29 | K = atoi(optarg); 30 | break; 31 | case 'm': 32 | max_ite1 = atoi(optarg); 33 | break; 34 | case 'M': 35 | max_ite2 = atoi(optarg); 36 | break; 37 | case 'a': 38 | alpha_min = atof(optarg); 39 | break; 40 | case 'A': 41 | alpha_max = atof(optarg); 42 | break; 43 | case 't': 44 | t_min = atof(optarg); 45 | break; 46 | case 'T': 47 | t_max = atof(optarg); 48 | break; 49 | case 's': 50 | sigma_squared_min = atof(optarg); 51 | break; 52 | case 'e': 53 | thresh = atof(optarg); 54 | break; 55 | case ':': 56 | printf("invalid option\n"); 57 | return 1; 58 | case '?': 59 | printf("invalid option\n"); 60 | return 1; 61 | } 62 | } 63 | 64 | FILE *fin_expression, *fin_init, *fin_opt_gene_para, *fin_opt_cell_para, *fout_gene_para, *fout_cell_para, *fout_ll; 65 | if((fin_expression=fopen(argv[optind], "r")) == NULL){ 66 | printf("cannot open expression data\n"); 67 | return 1; 68 | } 69 | if((fin_init=fopen(argv[optind+1], "r")) == NULL){ 70 | printf("cannot open initial 
distribution data\n"); 71 | return 1; 72 | } 73 | if((fin_opt_gene_para=fopen(argv[optind+2], "r")) == NULL){ 74 | printf("cannot open semi optimized gene and lineage parameters\n"); 75 | return 1; 76 | } 77 | if((fin_opt_cell_para=fopen(argv[optind+3], "r")) == NULL){ 78 | printf("cannot open semi optimized cell parameters\n"); 79 | return 1; 80 | } 81 | if((fout_gene_para=fopen(argv[optind+4], "w")) == NULL){ 82 | printf("cannot open Output_file1 (parameters related to gene and lineage)\n"); 83 | } 84 | if((fout_cell_para=fopen(argv[optind+5], "w")) == NULL){ 85 | printf("cannot open Output_file2 (parameters related to cell)\n"); 86 | } 87 | if((fout_ll=fopen(argv[optind+6], "w")) == NULL){ 88 | printf("cannot open Output_file3 (log-likelihood)\n"); 89 | } 90 | 91 | int gene_num = atoi(argv[optind+7]); 92 | int cell_num = atoi(argv[optind+8]); 93 | 94 | Continuous_OU_process OU(gene_num, cell_num, K, max_ite1, max_ite2, alpha_min, alpha_max, t_min, t_max, sigma_squared_min, thresh); 95 | 96 | OU.Set_expression(fin_expression); 97 | if(OU.Set_initial_parameter(fin_init) == 1){ 98 | return 1; 99 | } 100 | if(OU.Set_optimized_parameter(fin_opt_gene_para, fin_opt_cell_para) == 1){ 101 | return 1; 102 | } 103 | 104 | OU.EM(); 105 | 106 | OU.Print_cell_parameter(fout_cell_para); 107 | OU.Print_gene_parameter(fout_gene_para); 108 | OU.Print_ll(fout_ll); 109 | 110 | return 0; 111 | } -------------------------------------------------------------------------------- /src/scoup.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "ou.h" 7 | 8 | using namespace std; 9 | 10 | int main(int argc, char* argv[]){ 11 | int K, max_ite1, max_ite2; 12 | double alpha_min, alpha_max, t_min, t_max, sigma_squared_min, thresh; 13 | K = 1; 14 | max_ite1 = 1000; 15 | max_ite2 = 10000; 16 | alpha_min = 0.1; 17 | alpha_max = 100; 18 | t_min = 0.001; 19 | t_max = 2.0; 20 | 
sigma_squared_min = 0.1; 21 | thresh = 0.01; 22 | 23 | int ch; 24 | extern char *optarg; 25 | extern int optind, opterr; 26 | while((ch = getopt(argc, argv, "k:m:M:a:A:t:T:s:e:")) != -1){ 27 | switch(ch){ 28 | case 'k': 29 | K = atoi(optarg); 30 | break; 31 | case 'm': 32 | max_ite1 = atoi(optarg); 33 | break; 34 | case 'M': 35 | max_ite2 = atoi(optarg); 36 | break; 37 | case 'a': 38 | alpha_min = atof(optarg); 39 | break; 40 | case 'A': 41 | alpha_max = atof(optarg); 42 | break; 43 | case 't': 44 | t_min = atof(optarg); 45 | break; 46 | case 'T': 47 | t_max = atof(optarg); 48 | break; 49 | case 's': 50 | sigma_squared_min = atof(optarg); 51 | break; 52 | case 'e': 53 | thresh = atof(optarg); 54 | break; 55 | case ':': 56 | printf("invalid option\n"); 57 | return 1; 58 | case '?': 59 | printf("invalid option\n"); 60 | return 1; 61 | } 62 | } 63 | 64 | FILE *fin_expression, *fin_init, *fin_time, *fout_gene_para, *fout_cell_para, *fout_ll; 65 | if((fin_expression=fopen(argv[optind], "r")) == NULL){ 66 | printf("cannot open expression data\n"); 67 | return 1; 68 | } 69 | if((fin_init=fopen(argv[optind+1], "r")) == NULL){ 70 | printf("cannot open initial distribution data\n"); 71 | return 1; 72 | } 73 | if((fin_time=fopen(argv[optind+2], "r")) == NULL){ 74 | printf("cannot open initial pseudo-time data\n"); 75 | return 1; 76 | } 77 | if((fout_gene_para=fopen(argv[optind+3], "w")) == NULL){ 78 | printf("cannot open Output_file1 (parameters related to gene and lineage)\n"); 79 | } 80 | if((fout_cell_para=fopen(argv[optind+4], "w")) == NULL){ 81 | printf("cannot open Output_file2 (parameters related to cell)\n"); 82 | } 83 | if((fout_ll=fopen(argv[optind+5], "w")) == NULL){ 84 | printf("cannot open Output_file3 (log-likelihood)\n"); 85 | } 86 | 87 | int gene_num = atoi(argv[optind+6]); 88 | int cell_num = atoi(argv[optind+7]); 89 | 90 | Continuous_OU_process OU(gene_num, cell_num, K, max_ite1, max_ite2, alpha_min, alpha_max, t_min, t_max, sigma_squared_min, thresh); 91 | 
92 | OU.Set_expression(fin_expression); 93 | if(OU.Set_initial_parameter(fin_init) == 1){ 94 | return 1; 95 | } 96 | if(OU.Set_time(fin_time) == 1){ 97 | return 1; 98 | } 99 | OU.Set_initial_parameter2(); 100 | 101 | //initialization with K=1 model 102 | for(int i=0; i 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "node.h" 13 | 14 | using namespace std; 15 | 16 | extern "C" int dsyev_(char *jobz, char *uplo, int *n, double *a, int *lda, double *w, double *work, int *lwork, int *info); 17 | 18 | class Pseudo_Time{ 19 | public: 20 | int _gene_num; 21 | int _cell_num; 22 | int _dim; 23 | 24 | Pseudo_Time(int g, int c, int dim){ 25 | _gene_num = g; 26 | _cell_num = c; 27 | _dim = dim; 28 | genes.resize(_gene_num); 29 | cells.resize(_cell_num); 30 | 31 | for(int i=0; i<_gene_num; i++){ 32 | genes[i].Init(i); 33 | } 34 | for(int i=0; i<_cell_num; i++){ 35 | cells[i].Init(_gene_num); 36 | } 37 | } 38 | 39 | double Dist(vector > &pos, int id1, int id2, int dim){ 40 | double ret = 0.0; 41 | for(int i=0; i > normalized_data(data_num, vector(_gene_num, 0)); 52 | vector ave(_gene_num, 0); 53 | vector var(_gene_num, 0); 54 | //average 55 | for(int i=0; i<_gene_num; i++){ 56 | for(int j=0; j<_cell_num; j++){ 57 | ave[i] += cells[j].Get_expression(i); 58 | } 59 | ave[i] /= _cell_num; 60 | } 61 | //variance 62 | for(int i=0; i<_gene_num; i++){ 63 | for(int j=0; j<_cell_num; j++){ 64 | var[i] += (cells[j].Get_expression(i) - ave[i]) * (cells[j].Get_expression(i) - ave[i]); 65 | } 66 | var[i] /= _cell_num; 67 | } 68 | //normalization 69 | for(int i=0; i<_cell_num; i++){ 70 | for(int j=0; j<_gene_num; j++){ 71 | if(var[j] != 0){ 72 | normalized_data[i][j] = (cells[i].Get_expression(j) - ave[j])/sqrtf(var[j]); 73 | } 74 | else{ 75 | normalized_data[i][j] = (cells[i].Get_expression(j) - ave[j]); 76 | } 77 | } 78 | } 79 | //add root cell 80 | for(int i=0; i<_gene_num; i++){ 81 | if(var[i] 
!= 0){ 82 | normalized_data[data_num-1][i] = (genes[i].Initial_expression() - ave[i])/sqrtf(var[i]); 83 | } 84 | else{ 85 | normalized_data[data_num-1][i] = (genes[i].Initial_expression() - ave[i]); 86 | } 87 | } 88 | 89 | //calculate variance-covariance matrix 90 | double tmp; 91 | vector var_cov_matrix(_gene_num*_gene_num, 0); 92 | for(int i=0; i<_gene_num-1; i++){ 93 | for(int j=i; j<_gene_num; j++){ 94 | for(int k=0; k w(_gene_num, 0); 117 | vector work(lwork, 0); 118 | char jobz = 'V', uplo = 'U'; 119 | dsyev_(&jobz, &uplo, &_gene_num, &var_cov_matrix[0], &_gene_num, &w[0], &work[0], &lwork, &info); 120 | 121 | vector > z(data_num, vector(_dim, 0)); 122 | for (int i=0; i > edges_cost(data_num, vector(data_num,0)); 136 | for(int i=0; i > mst(data_num, vector(data_num,0)); 147 | vector checked_node; 148 | vector uncheked_node(data_num-1); 149 | vector pseudo_time(data_num, 0); 150 | for(int i=0; i edges_cost[checked_node[j]][uncheked_node[k]]){ 159 | min = edges_cost[checked_node[j]][uncheked_node[k]]; 160 | min_node_from = checked_node[j]; 161 | min_node_to = uncheked_node[k]; 162 | erase_id = k; 163 | } 164 | } 165 | } 166 | mst[min_node_from][min_node_to] = 1; 167 | checked_node.push_back(min_node_to); 168 | uncheked_node.erase(uncheked_node.begin() + erase_id); 169 | 170 | // 171 | pseudo_time[min_node_to] = pseudo_time[min_node_from] + min; 172 | if(max_pseudo_time < pseudo_time[min_node_to]){ 173 | max_pseudo_time = pseudo_time[min_node_to]; 174 | } 175 | } 176 | 177 | //normalize pseudo time 178 | for(int i=0; i<_cell_num; i++){ 179 | pseudo_time[i] /= max_pseudo_time; 180 | cells[i].Add_time(pseudo_time[i]); 181 | fprintf(ftime, "%d\t%lf\n", i, pseudo_time[i]); 182 | } 183 | 184 | for(int i=0; i 32 | ``` 33 | 34 | * Input_file1 : G x C matrix of expression data 35 | * Input_file2 : Initial distribution data 36 | * Output_file1 : Pseudo-time estimates 37 | * Output_file2 : Coordinates of PCA 38 | * G : The number of genes 39 | * C : The number of cells 40 
| * D : The number of PCA dimensions 41 | 42 | ##### Format of Input_file1 43 | The Input_file1 is the G x C matrix of expression data (separated with 'TAB'). 44 | Each row corresponds to each gene, and each column corresponds to each cell. 45 | 46 | ##### Example of Input_file1 47 | ``` 48 | 0.33 -4.95 -1.37 -4.07 ... 49 | 5.01 4.45 3.82 3.02 ... 50 | . 51 | . 52 | . 53 | ``` 54 | 55 | ##### Format of Input_file2 56 | The Input_file2 contains the mean and variance of the initial normal distribution. 57 | 58 | * Col1 : Index of a gene (0-origin) 59 | * Col2 : Mean of the initial distribution for a gene 60 | * Col3 : Variance of the initial distribution for a gene 61 | 62 | ##### Example of Input_file2 63 | ``` 64 | 0 0.0 1.7 65 | 1 1.0 2.3 66 | 2 -2.0 5.9 67 | ``` 68 | 69 | ##### Format of Output_file1 70 | The Output_file1 contains the pseudo-time estimates. 71 | 72 | * Col1 : Index of a cell (0-origin) 73 | * Col2 : Pseudo-time of a cell 74 | 75 | ##### Example of Output_file1 76 | ``` 77 | 0 0.826988 78 | 1 0.102140 79 | 2 0.758120 80 | ``` 81 | 82 | ##### Format of Output_file2 83 | The Output_file2 contains the coordinates of PCA. 84 | 85 | * Col1 : Index of a cell (0-origin) 86 | * Col2 - Col(D+1) : Coordinates of a cell 87 | 88 | This file contains (C+1) lines, and the last line corresponds to the root cell defined by the mean of the initial distribution. 89 | 90 | ##### Example of Output_file2 91 | ``` 92 | 0 3.04 0.42 93 | 1 -21.21 -1.52 94 | 2 5.76 0.48 95 | ``` 96 | 97 | 98 | ## Running SCOUP 99 | Estimate the parameters of the Mixture Ornstein-Uhlenbeck process. 
100 | 101 | ##### Usage 102 | ``` 103 | ./scoup <Options> <Input_file1> <Input_file2> <Input_file3> <Output_file1> <Output_file2> <Output_file3> <G> <C> 104 | ``` 105 | 106 | * Input_file1 : G x C matrix of expression data 107 | * Input_file2 : Initial distribution data 108 | * Input_file3 : Initial pseudo-time data 109 | * Output_file1 : Optimized parameters related to genes and lineages 110 | * Output_file2 : Optimized parameters related to cells 111 | * Output_file3 : Log-likelihood 112 | * G : The number of genes 113 | * C : The number of cells 114 | 115 | ##### Options 116 | 117 | * -k INT : The number of lineages (default is 1) 118 | * -m INT : Upper bound of EM iteration (without pseudo-time optimization). The detailed explanation is described in the supplementary text. (default is 1,000) 119 | * -M INT : Upper bound of EM iteration (including pseudo-time optimization) (default is 10,000). 120 | * -a DOUBLE : Lower bound of alpha (default is 0.1) 121 | * -A DOUBLE : Upper bound of alpha (default is 100) 122 | * -t DOUBLE : Lower bound of pseudo-time (default is 0.001) 123 | * -T DOUBLE : Upper bound of pseudo-time (default is 2.0) 124 | * -s DOUBLE : Lower bound of sigma squared (default is 0.1) 125 | 126 | ##### Example of running SCOUP 127 | ``` 128 | ./scoup -k 2 data/data.txt data/init.txt out/time_sp.txt out/gpara.txt out/cpara.txt out/ll.txt 500 100 129 | ``` 130 | 131 | ##### Format of Input_file1 132 | This is the expression data matrix and is the same data as the Input_file1 of SP. 133 | 134 | ##### Format of Input_file2 135 | This is the initial distribution and is the same data as the Input_file2 of SP. 136 | 137 | ##### Format of Input_file3 138 | This is the pseudo-time for initialization and is the same as the **Output_file1** of SP. 139 | 140 | ##### Format of Output_file1 141 | The Output_file1 contains the optimized parameters related to genes and lineages. 
142 | 143 | * First line 144 | * Col1 and Col2 : Space 145 | * Col3 - Col(K+2) : The probability of each lineage (pi_k) 146 | * After first line 147 | * Col1 : alpha_g 148 | * Col2 : sigma_g^2 149 | * Col3 - Col(K+2) : theta_{gk} 150 | 151 | ##### Example of Output_file1 152 | ``` 153 | 0.509804 0.490196 154 | 0.501610 2.528400 -6.338714 -2.273163 155 | 0.309094 13.046904 3.545862 0.337260 156 | 0.223226 4.212808 -4.443503 9.629989 157 | 2.707472 14.221109 3.959898 -2.353994 158 | 4.361342 34.646044 1.392565 0.789397 159 | ``` 160 | 161 | ##### Format of Output_file2 162 | 163 | * Col1 : Pseudo-time of a cell 164 | * Col2 - Col(K+1) : Responsibility for each lineage 165 | 166 | 167 | ##### Example of Output_file2 168 | ``` 169 | 0.941979 0.990196 0.009804 170 | 2.000000 0.990196 0.009804 171 | 2.000000 0.990196 0.009804 172 | 1.102146 0.990196 0.009804 173 | 0.839387 0.990196 0.009804 174 | ``` 175 | 176 | ##### Format of Output_file3 177 | The log-likelihood 178 | 179 | ##### Example of Output_file3 180 | ``` 181 | ``` 182 | 183 | 184 | ## Running SCOUP from the middle of the activity 185 | Re-estimate parameters from the middle of the activity. 186 | 187 | ##### Usage 188 | ``` 189 | ./scoup_resume <Options> <Input_file1> <Input_file2> <Input_file3> <Input_file4> <Output_file1> <Output_file2> <Output_file3> <G> <C> 190 | ``` 191 | 192 | * Input_file1 : G x C matrix of expression data 193 | * Input_file2 : Initial distribution data 194 | * Input_file3 : **Semi-optimized gene and lineage parameters (Output_file1 of scoup)** 195 | * Input_file4 : **Semi-optimized cell parameters (Output_file2 of scoup)** 196 | * Output_file1 : Optimized parameters related to genes and lineages 197 | * Output_file2 : Optimized parameters related to cells 198 | * Output_file3 : Log-likelihood 199 | * G : The number of genes 200 | * C : The number of cells 201 | 202 | ##### Options 203 | These are the same as the options of "scoup". 
204 | 205 | ##### Example of running SCOUP 206 | ``` 207 | ./scoup_resume -k 2 -e 0.0001 data/data.txt data/init.txt out/gpara.txt out/cpara.txt out/gpara_2.txt out/cpara_2.txt out/ll_2.txt 500 100 208 | ``` 209 | 210 | ##### Format of Input_file1 211 | This is the same as the Input_file1 of "scoup". 212 | 213 | ##### Format of Input_file2 214 | This is the same as the Input_file2 of "scoup". 215 | 216 | ##### Format of Input_file3 217 | These are the parameters related to genes and lineages and are the same as the **Output_file1** of "scoup". 218 | 219 | ##### Format of Input_file4 220 | These are the parameters related to cells and are the same as the **Output_file2** of "scoup". 221 | 222 | ##### Format of Output_file1, 2, 3 223 | These files are the same as the output files of "scoup". 224 | 225 | 226 | ## Running Correlation analysis 227 | Calculate the correlation between genes after standardization. 228 | 229 | ##### Usage 230 | ``` 231 | ./cor <Input_file1> <Input_file2> <Input_file3> <Input_file4> <Output_file1> <Output_file2> <G> <C> 232 | ``` 233 | 234 | * Input_file1 : G x C matrix of expression data 235 | * Input_file2 : Initial distribution data 236 | * Input_file3 : Optimized gene and lineage parameters (Output_file1 of scoup) 237 | * Input_file4 : Optimized cell parameters (Output_file2 of scoup) 238 | * Output_file1 : Standardized expression matrix 239 | * Output_file2 : G x G correlation matrix 240 | * G : The number of genes 241 | * C : The number of cells 242 | 243 | ##### Options 244 | 245 | ##### Example of running Correlation analysis 246 | ``` 247 | ./cor data/data.txt data/init.txt out/gpara.txt out/cpara.txt out/nexp.txt out/cor.txt 500 100 248 | ``` 249 | 250 | ##### Format of Output_file1 251 | The Output_file1 contains the standardized expression data. 252 | 253 | ##### Format of Output_file2 254 | The Output_file2 contains the correlation for the standardized expression data. 
// node.h - GENE, CELL and LINEAGE records plus the Gaussian helpers they use.
// Copyright (c) 2015 Hirotaka Matsumoto. Released under the MIT license.

#include <algorithm>
#include <cfloat>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

// M_PI is not part of the C++ standard; provide it when <cmath> does not.
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

using namespace std;

class GENE;
class CELL;
class LINEAGE;

// Density of N(mean, sigma_squared) evaluated at x.
double Gaussian(double x, double mean, double sigma_squared){
	return 1.0/sqrt(2 * M_PI * sigma_squared) * exp(-(x-mean)*(x-mean)/(2*sigma_squared));
}

// Natural log of the N(mean, sigma_squared) density evaluated at x.
double Log_Gaussian(double x, double mean, double sigma_squared){
	return -0.5 * log(2 * M_PI * sigma_squared) - (x-mean)*(x-mean)/(2*sigma_squared);
}

// log(exp(x) + exp(y)) computed without overflowing exp().
// When one operand is infinite a diagnostic line is printed and the other
// operand is returned unchanged.
double logsumexp(double x, double y){
	if(std::isinf(x)){
		printf("inf at logsumexp %lf\n", x);
		return y;
	}
	if(std::isinf(y)){
		printf("inf at logsumexp %lf\n", y);
		return x;
	}
	// Factor out the larger operand so the remaining exponent is <= 0.
	if(x > y){
		return (x + log1p(exp(-x+y)));
	}
	return (y + log1p(exp(-y+x)));
}

// One lineage: mixing weight pi and one theta value per gene. The `_new_*`
// members hold pending values that Update_parameter() copies over the
// current ones.
class LINEAGE{
public:
	int _gene_num;
	double _pi;
	double _new_pi;
	vector<double> _theta;
	vector<double> _new_theta;

	// Sizes the per-gene vectors for g genes (zero-filled) and sets pi.
	void Init(int g, double pi){
		_gene_num = g;
		_theta.resize(g, 0);
		_new_theta.resize(g, 0);
		_pi = pi;
	}

	double Pi(){
		return _pi;
	}

	void Add_theta(int gene_id, double theta){
		_theta[gene_id] = theta;
	}

	void Add_new_theta(int gene_id, double theta){
		_new_theta[gene_id] = theta;
	}

	void Add_new_pi(double pi){
		_new_pi = pi;
	}

	// Divides the pending pi by `sum`.
	void Normalize_new_pi(double sum){
		_new_pi /= sum;
	}

	double Theta(int gene_id){
		return _theta[gene_id];
	}

	// Copies the pending pi and thetas over the current ones.
	void Update_parameter(){
		_pi = _new_pi;
		for(int i=0; i<_gene_num; i++){
			_theta[i] = _new_theta[i];
		}
	}

	// Defined out of class, after GENE, because it reads the global `genes`.
	int Convergence(double t, double thresh);
};

// Defined after the class so the vector is never instantiated with an
// incomplete element type.
vector<LINEAGE> lineages;

// Per-gene parameters (alpha, sigma^2), the initial distribution of the gene
// and the null-model theta, with the Gaussian transition densities built on
// them.
class GENE{
public:
	int _gene_id;
	int _K;
	double _alpha;
	double _sigma_squared;
	double _new_alpha;
	double _new_sigma_squared;
	double _tmp_alpha;
	double _tmp_sigma_squared;

	double _initial_expression;
	double _initial_sigma_squared;
	double _new_initial_expression;
	double _new_initial_sigma_squared;

	double _theta_null;
	double _new_theta_null;

	double _sigma_squared_MG;

	void Init(int k, int gene_id, double alpha, double sigma){
		_gene_id = gene_id;
		_K = k;
		_alpha = alpha;
		// NOTE(review): `sigma` is accepted but never used; sigma^2 always
		// starts at 1.0. Left unchanged because callers may rely on it.
		_sigma_squared = 1.0;
	}

	void Init(int gene_id){
		_gene_id = gene_id;
	}

	void Add_initial_expression(double expression){
		_initial_expression = expression;
	}

	void Add_new_initial_expression(double expression){
		_new_initial_expression = expression;
	}

	void Add_initial_dispersion(double dispersion){
		_initial_sigma_squared = dispersion;
	}

	void Add_new_initial_dispersion(double dispersion){
		_new_initial_sigma_squared = dispersion;
	}

	void Add_new_alpha(double alpha){
		_new_alpha = alpha;
	}

	void Add_new_sigma_squared(double sigma){
		_new_sigma_squared = sigma;
	}

	void Add_theta_null(double theta){
		_theta_null = theta;
	}

	void Add_new_theta_null(double theta){
		_new_theta_null = theta;
	}

	void Add_sigma_squared_MG(double sigma){
		_sigma_squared_MG = sigma;
	}

	// Density (not log) of x at time t on lineage k.
	// NOTE(review): unlike the LogOU family this variance has no
	// exp(-2*alpha*t)*_initial_sigma_squared term - confirm that is intended.
	double OU(double x, double t, int k){
		double mean = Transition_mean(_alpha, t, lineages[k].Theta(_gene_id));
		double variance = _sigma_squared*(1-exp(-2*_alpha*t))/(2*_alpha);
		return Gaussian(x, mean, variance);
	}

	// Log density of x at time t on lineage k with the current parameters.
	// No guard against infinite results here.
	double LogOU(double x, double t, int k){
		double mean = Transition_mean(_alpha, t, lineages[k].Theta(_gene_id));
		double variance = Transition_variance(_alpha, _sigma_squared, t);
		return Log_Gaussian(x, mean, variance);
	}

	// As LogOU, but with the pending alpha, sigma^2 and theta. Guarded.
	double LogOU_with_new_parameter(double x, double t, int k){
		double mean = Transition_mean(_new_alpha, t, lineages[k]._new_theta[_gene_id]);
		double variance = Transition_variance(_new_alpha, _new_sigma_squared, t);
		return Guarded_log_density(x, mean, variance);
	}

	//todo
	// As LogOU, but with the pending theta only. Guarded.
	double LogOU_with_new_theta(double x, double t, int k){
		double mean = Transition_mean(_alpha, t, lineages[k]._new_theta[_gene_id]);
		double variance = Transition_variance(_alpha, _sigma_squared, t);
		return Guarded_log_density(x, mean, variance);
	}

	// As LogOU, but with sigma^2 replaced by the candidate `sigma`. Guarded.
	double LogOU_with_new_sigma(double x, double t, int k, double sigma){
		double mean = Transition_mean(_alpha, t, lineages[k].Theta(_gene_id));
		double variance = Transition_variance(_alpha, sigma, t);
		return Guarded_log_density(x, mean, variance);
	}

	// As LogOU, but with alpha replaced by the candidate `alpha`. Guarded.
	double LogOU_with_new_alpha(double x, double t, int k, double alpha){
		double mean = Transition_mean(alpha, t, lineages[k].Theta(_gene_id));
		double variance = Transition_variance(alpha, _sigma_squared, t);
		return Guarded_log_density(x, mean, variance);
	}

	// Log density with _theta_null in place of a lineage theta. Guarded.
	double LogOU_null_model(double x, double t){
		double mean = Transition_mean(_alpha, t, _theta_null);
		double variance = Transition_variance(_alpha, _sigma_squared, t);
		return Guarded_log_density(x, mean, variance);
	}

	// Log density with the initial expression used as theta. Guarded.
	double LogOU_null_model2(double x, double t){
		double mean = Transition_mean(_alpha, t, _initial_expression);
		double variance = Transition_variance(_alpha, _sigma_squared, t);
		return Guarded_log_density(x, mean, variance);
	}

	// Mean at time t on lineage k; `x` is unused and kept for callers.
	double Mean_of_OU(double x, double t, int k){
		return Transition_mean(_alpha, t, lineages[k].Theta(_gene_id));
	}

	double Expectation(double t, int k){
		return Transition_mean(_alpha, t, lineages[k].Theta(_gene_id));
	}

	double Alpha(){
		return _alpha;
	}

	double Sigma_squared(){
		return _sigma_squared;
	}

	double Initial_expression(){
		return _initial_expression;
	}

	double Initial_dispersion(){
		return _initial_sigma_squared;
	}

	double X0(){
		return _initial_expression;
	}

	double Theta_null(){
		return _theta_null;
	}

	void Update_parameter(){
		_alpha = _new_alpha;
		_sigma_squared = _new_sigma_squared;
	}

	void Update_null_parameter(){
		_theta_null = _new_theta_null;
		_alpha = _new_alpha;
		_sigma_squared = _new_sigma_squared;
	}

	void Update_null_parameter2(){
		_theta_null = _new_theta_null;
	}

	void Updata_null_parameter2_2(){
		_alpha = _new_alpha;
		_sigma_squared = _new_sigma_squared;
	}

	// Saves alpha and sigma^2 so Restore_parameter() can put them back.
	void Store_parameter(){
		_tmp_alpha = _alpha;
		_tmp_sigma_squared = _sigma_squared;
	}

	void Restore_parameter(){
		_alpha = _tmp_alpha;
		_sigma_squared = _tmp_sigma_squared;
	}

	// Returns 1 when the current and pending (alpha, sigma^2) give values of
	// sigma^2*(1-exp(-alpha*t))/(2*alpha) within `thresh` of each other,
	// otherwise 0.
	int Convergence(double t, double thresh){
		double tmp1, tmp2;

		tmp1 = (_sigma_squared*(1-exp(-_alpha*t)))/(2*_alpha);
		tmp2 = (_new_sigma_squared*(1-exp(-_new_alpha*t)))/(2*_new_alpha);
		if(fabs(tmp1 - tmp2) > thresh){
			return 0;
		}

		return 1;
	}

private:
	// exp(-alpha*t)*initial_expression + (1 - exp(-alpha*t))*theta
	double Transition_mean(double alpha, double t, double theta){
		return exp(-alpha * t)*_initial_expression + (1 - exp(-alpha*t))*theta;
	}

	// sigma^2*(1-exp(-2*alpha*t))/(2*alpha) + exp(-2*alpha*t)*initial_sigma^2
	double Transition_variance(double alpha, double sigma_squared, double t){
		return sigma_squared*(1-exp(-2*alpha*t))/(2*alpha) + exp(-2*alpha*t)*_initial_sigma_squared;
	}

	// Log_Gaussian with an infinite result replaced by the most negative
	// finite double. The floor is -DBL_MAX: DBL_MIN is the smallest positive
	// double, so -DBL_MIN would report a log density of practically zero.
	double Guarded_log_density(double x, double mean, double variance){
		double ret = Log_Gaussian(x, mean, variance);
		if(std::isinf(ret)){
			//for debug
			printf("-inf at OU %lf %lf %lf\n", x, mean, variance);
			return -DBL_MAX;
		}
		return ret;
	}
};

vector<GENE> genes;

// One cell: expression values with missing flags, pseudo-time, and the
// responsibilities of each lineage (per cell and per gene).
class CELL{
public:
	int _gene_num;
	int _K;
	int _flag_differentiation;

	double _old_time;
	double _time;
	double _new_time;
	vector<double> _expression;
	vector<int> _missing;
	vector<double> _gamma;
	vector<vector<double> > _gamma_gene;
	vector<double> _time_gene;

	// Full initialisation for g genes and k lineages.
	void Init(int g, int k){
		_gene_num = g;
		_K = k;
		_expression.resize(g, 0);
		_missing.resize(g, 0);
		_gamma.resize(k, 0);
		_gamma_gene.resize(_gene_num);
		for(int i=0; i<_gene_num; i++){
			_gamma_gene[i].resize(_K, 0.0);
		}
		_time_gene.resize(g, 0);
	}

	// Reduced initialisation: expression and missing flags only.
	void Init(int g){
		_gene_num = g;
		_expression.resize(g, 0);
		_missing.resize(g, 0);
	}

	// Sets every per-gene time to the cell time.
	void Init_for_gene_time(){
		for(int i=0; i<_gene_num; i++){
			_time_gene[i] = _time;
		}
	}

	// Stores an expression value; a non-zero missing_flag stores 0 and marks
	// the gene as missing.
	void Add_expression(int id, double expression, int missing_flag){
		if(missing_flag == 0){
			_expression[id] = expression;
		}
		else{
			_expression[id] = 0;
			_missing[id] = 1;
		}
	}

	void Add_time(double t){
		_time = t;
	}

	void Add_new_time(double t){
		_new_time = t;
	}

	void Add_new_gene_time(int id, double t){
		_time_gene[id] = t;
	}

	void Add_flag_dif(int flag){
		_flag_differentiation = flag;
	}

	int Flag_dif(){
		return _flag_differentiation;
	}

	double Get_expression(int id){
		return _expression[id];
	}

	double Time(){
		return _time;
	}

	double Gene_time(int id){
		return _time_gene[id];
	}

	double Gamma(int id){
		return _gamma[id];
	}
	double Gamma_gene(int gene, int state){
		return _gamma_gene[gene][state];
	}

	double Xn(int id){
		return _expression[id];
	}

	// Recomputes _gamma: log(pi_k) plus the LogOU terms of all non-missing
	// genes, normalised in log space, then smoothed with a 0.01 pseudocount
	// and renormalised.
	void Calc_responsibility(){
		//calculate in log
		for(int i=0; i<_K; i++){
			_gamma[i] = log(lineages[i].Pi());
			for(int j=0; j<_gene_num; j++){
				//todo
				if(_missing[j] == 0){
					_gamma[i] += genes[j].LogOU(_expression[j], _time, i);
				}
			}
		}

		//logsum version
		long double logsum = _gamma[0];
		for(int i=1; i<_K; i++){
			logsum = logsumexp(logsum, _gamma[i]);
		}

		//todo
		if(std::isinf(logsum)){
			// Debug output. One value per lineage, so K == 1 no longer reads
			// past the end of _gamma, and every argument really is a long
			// double as %LF requires.
			printf("aa %LF", logsum);
			for(int i=0; i<_K; i++){
				printf(" %LF", static_cast<long double>(_gamma[i]));
			}
			printf("\n");

			// Hard-assign to the largest finite log score. The search starts
			// from -DBL_MAX (not -DBL_MIN, which is ~0), so negative scores
			// can win; if none qualifies every gamma becomes 0 and the
			// pseudocount below yields a uniform result.
			int best_id = -1;
			double best = -DBL_MAX;
			for(int i=0; i<_K; i++){
				if(best < _gamma[i]){
					best_id = i;
					best = _gamma[i];
				}
			}
			for(int i=0; i<_K; i++){
				_gamma[i] = (i == best_id) ? 1.0 : 0.0;
			}
		}
		else{
			for(int i=0; i<_K; i++){
				_gamma[i] = std::exp(_gamma[i] - logsum);
			}
		}

		//todo
		//add pseudocount to avoid overfitting
		double sum=0.0;
		for(int i=0; i<_K; i++){
			_gamma[i] += 0.01;
			sum += _gamma[i];
		}
		for(int i=0; i<_K; i++){
			_gamma[i] /= sum;
		}
	}

	/*
	void Calc_missing_value(){
		for(int i=0; i<_gene_num; i++){
			//todo
			if(_missing[i] == 0){
				continue;
			}

			//todo
			double mean = 0;
			for(int j=0; j<_K; j++){
				mean += _gamma[j] * genes[i].Mean_of_OU(_expression[i], _time, j);
			}
			_expression[i] = mean;
		}
	}
	*/

	// Fills _gamma with random weights that sum to 1.
	void Random_responsibility(){
		// Seed once per process. Re-seeding with time(NULL) on every call
		// restarts rand() at the same point for all calls made within the
		// same second, which gave those cells identical responsibilities.
		static bool seeded = false;
		if(!seeded){
			srand(static_cast<unsigned>(time(NULL)));
			seeded = true;
		}

		double sum = 0;
		for(int i=0; i<_K; i++){
			_gamma[i] = ((double)rand())/RAND_MAX;
			sum += _gamma[i];
		}

		for(int i=0; i<_K; i++){
			_gamma[i] /= sum;
		}
	}

	// Per-gene responsibilities: for each gene the LogOU values of the
	// lineages are normalised in log space (a missing gene contributes 0 for
	// every lineage).
	void Calc_responsibility_of_gene(){
		for(int i=0; i<_gene_num; i++){
			//calculate in log
			for(int j=0; j<_K; j++){
				_gamma_gene[i][j] = 0;
				//todo
				if(_missing[i] == 0){
					_gamma_gene[i][j] += genes[i].LogOU(_expression[i], _time, j);
				}
			}

			//logsum version
			double logsum = _gamma_gene[i][0];
			for(int j=1; j<_K; j++){
				logsum = logsumexp(logsum, _gamma_gene[i][j]);
			}

			for(int j=0; j<_K; j++){
				_gamma_gene[i][j] = exp(_gamma_gene[i][j] - logsum);
			}
		}
	}

	// Expected expression of gene g at time t on lineage k given this cell's
	// value at _time. For t > _time it moves from the observed value toward
	// theta; otherwise it is the variance-weighted combination of a term
	// built from the observed value and one built from the initial
	// distribution.
	double Expectation(int g, double t, int k){
		if(t > _time){
			double dt = t - _time;
			double at = genes[g].Alpha() * dt;

			//todo
			return exp(-at)*Xn(g) + (1-exp(-at))*lineages[k].Theta(g);
		}
		else{
			double mean1, mean2, var1, var2;

			double dt = _time - t;
			double at = genes[g].Alpha() * dt;
			mean1 = exp(at)*Xn(g) + (1-exp(at))*lineages[k].Theta(g);
			var1 = exp(2*at)*genes[g].Sigma_squared()*(1-exp(-2*at))/(2*genes[g].Alpha());

			at = genes[g].Alpha() * t;
			mean2 = exp(-at)*genes[g].Initial_expression() + (1-exp(-at))*lineages[k].Theta(g);
			var2 = genes[g].Sigma_squared()*(1-exp(-2*at))/(2*genes[g].Alpha()) + exp(-2*at)*genes[g].Initial_dispersion();

			return (var2*mean1 + var1*mean2)/(var1 + var2);
		}
	}

	// Variance that goes with Expectation(); `k` is not used.
	double Var(int g, double t, int k){
		if(t > _time){
			double dt = t - _time;
			double at = genes[g].Alpha() * dt;

			return genes[g].Sigma_squared()*(1-exp(-2*at))/(2*genes[g].Alpha());
		}
		else{
			double var1, var2;

			double dt = _time - t;
			double at = genes[g].Alpha() * dt;
			var1 = exp(2*at)*genes[g].Sigma_squared()*(1-exp(-2*at))/(2*genes[g].Alpha());

			at = genes[g].Alpha() * t;
			var2 = genes[g].Sigma_squared()*(1-exp(-2*at))/(2*genes[g].Alpha()) + exp(-2*at)*genes[g].Initial_dispersion();

			return (var1*var2)/(var1 + var2);
		}
	}

	// Remembers the current time in _old_time and adopts the pending time.
	void Update_parameter(){
		_old_time = _time;
		_time = _new_time;
	}

	// Returns 1 when _old_time and _new_time differ by at most `thresh`.
	int Convergence(double thresh){
		if(fabs(_old_time - _new_time) > thresh){
			return 0;
		}

		return 1;
	}
};

vector<CELL> cells;
tmp2) > thresh){ 617 | return 0; 618 | } 619 | } 620 | return 1; 621 | } 622 | 623 | -------------------------------------------------------------------------------- /src/ou.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include "node.h" 14 | 15 | using namespace std; 16 | 17 | class Continuous_OU_process{ 18 | public: 19 | int _gene_num; 20 | int _cell_num; 21 | int _K; 22 | 23 | double _max_time; 24 | double _min_time; 25 | double _min_alpha; 26 | double _max_alpha; 27 | double _min_sigma_squared; 28 | 29 | double _init_alpha; 30 | double _init_siqma_squared; 31 | 32 | double _old_ll; 33 | 34 | int _max_ite1; 35 | int _max_ite2; 36 | double _thresh; 37 | 38 | Continuous_OU_process(int g, int c, int k, int max_ite1, int max_ite2, double alpha_min, double alpha_max, double t_min, double t_max, double sigma_squared_min, double thresh){ 39 | _max_ite1 = max_ite1; 40 | _max_ite2 = max_ite2; 41 | _init_alpha = 5.0; 42 | _init_siqma_squared = 1.0; 43 | _old_ll = -DBL_MIN; 44 | _min_time = t_min; 45 | _max_time = t_max; 46 | _min_alpha = alpha_min; 47 | _max_alpha = alpha_max; 48 | _min_sigma_squared = sigma_squared_min; 49 | _thresh = thresh; 50 | 51 | _gene_num = g; 52 | _cell_num = c; 53 | _K = k; 54 | genes.resize(_gene_num); 55 | cells.resize(_cell_num); 56 | lineages.resize(_K); 57 | 58 | for(int i=0; i<_gene_num; i++){ 59 | genes[i].Init(_K, i, _init_alpha, _init_siqma_squared); 60 | } 61 | for(int i=0; i<_cell_num; i++){ 62 | cells[i].Init(_gene_num, _K); 63 | } 64 | for(int i=0; i<_K; i++){ 65 | lineages[i].Init(_gene_num, 1.0/_K); 66 | } 67 | } 68 | 69 | Continuous_OU_process(int g, int c, int k){ 70 | _gene_num = g; 71 | _cell_num = c; 72 | _K = k; 73 | genes.resize(_gene_num); 74 | cells.resize(_cell_num); 75 | lineages.resize(_K); 76 | 77 | for(int i=0; 
i<_gene_num; i++){ 78 | genes[i].Init(_K, i, _init_alpha, _init_siqma_squared); 79 | } 80 | for(int i=0; i<_cell_num; i++){ 81 | cells[i].Init(_gene_num, _K); 82 | } 83 | for(int i=0; i<_K; i++){ 84 | lineages[i].Init(_gene_num, 1.0/_K); 85 | } 86 | } 87 | 88 | void Init_EM(){ 89 | //Initialize with random responsibility 90 | Set_parameter(); 91 | M_step(); 92 | //update parameters 93 | for(int j=0; j<_K; j++){ 94 | lineages[j].Update_parameter(); 95 | } 96 | for(int j=0; j<_gene_num; j++){ 97 | genes[j].Update_parameter(); 98 | } 99 | } 100 | 101 | int EM(){ 102 | double ll=0; 103 | 104 | int step=100, id=1; 105 | for(int i=0; i<_max_ite1; i++){ 106 | E_step(); 107 | M_step(); 108 | 109 | //update parameters 110 | for(int j=0; j<_K; j++){ 111 | lineages[j].Update_parameter(); 112 | } 113 | for(int j=0; j<_gene_num; j++){ 114 | genes[j].Update_parameter(); 115 | } 116 | 117 | ll = Log_likelihood(); 118 | 119 | //debug 120 | if(i%step == 0){ 121 | printf("%d-th iteration in first EM\nlog-likelihood: %lf\n", i, ll); 122 | } 123 | 124 | _old_ll = ll; 125 | } 126 | 127 | for(int i=0; i<_max_ite2; i++){ 128 | E_step(); 129 | Optimize_time(); 130 | M_step(); 131 | 132 | //check parameter convergenece 133 | if(Convergence() == 1){ 134 | break; 135 | } 136 | 137 | if(i == _max_ite2-1){ 138 | break; 139 | } 140 | 141 | //update parameters 142 | for(int j=0; j<_K; j++){ 143 | lineages[j].Update_parameter(); 144 | } 145 | for(int j=0; j<_gene_num; j++){ 146 | genes[j].Update_parameter(); 147 | } 148 | 149 | ll = Log_likelihood(); 150 | 151 | //debug 152 | if(i%step == 0){ 153 | printf("%d-th iteration in second EM\nlog-likelihood: %lf\n", i, ll); 154 | } 155 | 156 | _old_ll = ll; 157 | 158 | } 159 | 160 | return 0; 161 | } 162 | 163 | void E_step(){ 164 | //calculate the responsibility of the latent value Z 165 | for(int i=0; i<_cell_num; i++){ 166 | cells[i].Calc_responsibility(); 167 | } 168 | } 169 | 170 | void M_step(){ 171 | //optimize theta 172 | double new_theta; 173 | 
long double numerator = 0.0; 174 | long double denominator = 0.0; 175 | long double e_minus_at_power; 176 | 177 | for(int i=0; i<_gene_num; i++){ 178 | for(int j=0; j<_K; j++){ 179 | new_theta = lineages[j].Theta(i); 180 | numerator = 0.0; 181 | denominator = 0.0; 182 | for(int k=0; k<_cell_num; k++){ 183 | e_minus_at_power = exp(-genes[i].Alpha()*cells[k].Time()); 184 | numerator += cells[k].Gamma(j) * 2 * (cells[k].Xn(i) - e_minus_at_power*X0(i,k,j) - (1 - e_minus_at_power)*lineages[j].Theta(i)) / (genes[i].Alpha()*(1+e_minus_at_power)); 185 | //numerator += cells[k].Gamma(j) * ((cells[k].Xn(i)+genes[i].X0()-2*lineages[j].Theta(i))*(cosh(genes[i].Alpha()*cells[k].Time())-1)/sinh(genes[i].Alpha()*cells[k].Time()) + cells[k].Xn(i) - genes[i].X0()) / genes[i].Alpha(); 186 | denominator += cells[k].Gamma(j) * cells[k].Time(); 187 | } 188 | new_theta += numerator/denominator; 189 | lineages[j].Add_new_theta(i, new_theta); 190 | } 191 | } 192 | 193 | //optimize alpha 194 | double new_alpha; 195 | double f_alpha; 196 | double at; 197 | for(int i=0; i<_gene_num; i++){ 198 | numerator = 0.0; 199 | denominator = 0.0; 200 | for(int j=0; j<_cell_num; j++){ 201 | for(int k=0; k<_K; k++){ 202 | at = genes[i].Alpha() * cells[j].Time(); 203 | f_alpha = genes[i].Sigma_squared()/(genes[i].Alpha()*genes[i].Alpha()); 204 | f_alpha -= genes[i].Sigma_squared()/genes[i].Alpha() * cells[j].Time()*cosh(at)/sinh(at); 205 | f_alpha += 1.0/(genes[i].Alpha()*sinh(at)) * (-cosh(at) + at/sinh(at)) * (X0_minus_theta_squared(i,j,k) + Xn_minus_theta_squared(i,j,k)); 206 | f_alpha += 2.0/(genes[i].Alpha()*sinh(at)) * (1 - at*cosh(at)/sinh(at)) * X0_minus_theta_times_Xn_minus_theta(i,j,k); 207 | 208 | numerator += cells[j].Gamma(k) * (-cells[j].Time()*genes[i].Sigma_squared() - X0_minus_theta_squared(i, j, k) + Xn_minus_theta_squared(i, j, k)); 209 | denominator += cells[j].Gamma(k) * f_alpha; 210 | } 211 | } 212 | new_alpha = numerator/denominator; 213 | 214 | if(new_alpha < _min_alpha){ 215 | 
new_alpha = _min_alpha; 216 | } 217 | else if(new_alpha > _max_alpha){ 218 | new_alpha = _max_alpha; 219 | } 220 | 221 | genes[i].Add_new_alpha(new_alpha); 222 | } 223 | 224 | //optimize sigma_squared 225 | int effective_cell_num; 226 | double new_sigma_squared; 227 | for(int i=0; i<_gene_num; i++){ 228 | effective_cell_num = 0; 229 | new_sigma_squared = 0.0; 230 | for(int j=0; j<_cell_num; j++){ 231 | effective_cell_num++; 232 | 233 | at = genes[i].Alpha() * cells[j].Time(); 234 | for(int k=0; k<_K; k++){ 235 | new_sigma_squared += cells[j].Gamma(k) * 2 * genes[i].Alpha() / (1-exp(-2*at)) * (Xn_minus_theta_squared(i,j,k) - 2*exp(-at)*X0_minus_theta_times_Xn_minus_theta(i,j,k) + exp(-2*at)*X0_minus_theta_squared(i,j,k) ); 236 | } 237 | } 238 | new_sigma_squared /= effective_cell_num; 239 | 240 | if(new_sigma_squared < _min_sigma_squared){ 241 | new_sigma_squared = _min_sigma_squared; 242 | } 243 | 244 | genes[i].Add_new_sigma_squared(new_sigma_squared); 245 | } 246 | 247 | //optimize pi 248 | double new_pi, sum_pi=0; 249 | for(int i=0; i<_K; i++){ 250 | new_pi = 0.0; 251 | for(int j=0; j<_cell_num; j++){ 252 | new_pi += cells[j].Gamma(i); 253 | } 254 | lineages[i].Add_new_pi(new_pi); 255 | 256 | sum_pi += new_pi; 257 | } 258 | //normalization 259 | for(int i=0; i<_K; i++){ 260 | lineages[i].Normalize_new_pi(sum_pi); 261 | } 262 | } 263 | 264 | void Optimize_time(){ 265 | //optimize t 266 | int max_ite=100; 267 | double at, old_time, new_time, pre_time, E, f1, f2; 268 | for(int i=0; i<_cell_num; i++){ 269 | new_time = cells[i].Time(); 270 | pre_time = cells[i].Time(); 271 | 272 | for(int ite=0; ite _max_time){ 303 | new_time = _min_time; 304 | } 305 | 306 | if(fabs(old_time-new_time) < _thresh){ 307 | break; 308 | } 309 | } 310 | 311 | double ll0, ll1, ll2, ll3; 312 | ll0 = Log_likelihood_of_cell(i, new_time); 313 | ll1 = Log_likelihood_of_cell(i, _min_time); 314 | ll2 = Log_likelihood_of_cell(i, _max_time); 315 | ll3 = Log_likelihood_of_cell(i, pre_time); 316 | 317 
| if(ll3 > ll0 && ll3 > ll1 && ll3 > ll2){ 318 | cells[i].Add_new_time(pre_time); 319 | } 320 | else if(ll0 >= ll1 && ll0 >= ll2){ 321 | cells[i].Add_new_time(new_time); 322 | } 323 | else if(ll1 > ll2){ 324 | cells[i].Add_new_time(_min_time); 325 | } 326 | else{ 327 | cells[i].Add_new_time(_max_time); 328 | } 329 | cells[i].Update_parameter(); 330 | } 331 | } 332 | 333 | void Calc_gene_responsibility(){ 334 | for(int i=0; i<_cell_num; i++){ 335 | cells[i].Calc_responsibility_of_gene(); 336 | } 337 | } 338 | 339 | //for null modek K=1 340 | void EM_for_null_model(int gene_id){ 341 | Set_null_parameter(); 342 | int max_ite = 10; 343 | 344 | for(int i=0; i _max_alpha){ 396 | new_alpha = _max_alpha; 397 | } 398 | genes[i].Add_new_alpha(new_alpha); 399 | 400 | //optimize sigma_squared 401 | int effective_cell_num; 402 | double new_sigma_squared; 403 | effective_cell_num = 0; 404 | new_sigma_squared = 0.0; 405 | for(int j=0; j<_cell_num; j++){ 406 | effective_cell_num++; 407 | 408 | at = genes[i].Alpha() * cells[j].Time(); 409 | new_sigma_squared += 2 * genes[i].Alpha() / (1-exp(-2*at)) * (Xn_minus_theta_squared(i,j) - 2*exp(-at)*X0_minus_theta_times_Xn_minus_theta(i,j) + exp(-2*at)*X0_minus_theta_squared(i,j) ); 410 | } 411 | new_sigma_squared /= effective_cell_num; 412 | genes[i].Add_new_sigma_squared(new_sigma_squared); 413 | } 414 | 415 | void EM_for_null_model2(int gene_id){ 416 | int max_ite = 100; 417 | 418 | genes[gene_id].Add_new_theta_null(genes[gene_id].X0()); 419 | genes[gene_id].Update_null_parameter2(); 420 | 421 | for(int i=0; i _max_alpha){ 459 | new_alpha = _max_alpha; 460 | } 461 | genes[i].Add_new_alpha(new_alpha); 462 | 463 | //optimize sigma_squared 464 | int effective_cell_num = 0; 465 | double new_sigma_squared = 0; 466 | for(int j=0; j<_cell_num; j++){ 467 | effective_cell_num++; 468 | 469 | at = genes[i].Alpha() * cells[j].Time(); 470 | new_sigma_squared += 2 * genes[i].Alpha() / (1-exp(-2*at)) * (Xn_minus_theta_squared(i,j) - 
2*exp(-at)*X0_minus_theta_times_Xn_minus_theta(i,j) + exp(-2*at)*X0_minus_theta_squared(i,j) ); 471 | } 472 | new_sigma_squared /= effective_cell_num; 473 | genes[i].Add_new_sigma_squared(new_sigma_squared); 474 | } 475 | 476 | double X0(int i, int j, int k){ 477 | double at = genes[i].Alpha() * cells[j].Time(); 478 | double var = 1.0/(2*genes[i].Alpha()/(genes[i].Sigma_squared()*(exp(2*at)-1)) + 1.0/genes[i].Initial_dispersion()); 479 | double mu; 480 | if(var < 0.000001){ 481 | mu = exp(at)*cells[j].Xn(i)+(1-exp(at))*lineages[k].Theta(i); 482 | } 483 | else{ 484 | mu = (2*genes[i].Alpha()/(genes[i].Sigma_squared()*(exp(2*at)-1))*(exp(at)*cells[j].Xn(i)+(1-exp(at))*lineages[k].Theta(i)) + genes[i].Initial_expression()/genes[i].Initial_dispersion()) * var; 485 | } 486 | return mu; 487 | } 488 | 489 | //for null model 490 | double X0(int i, int j){ 491 | double at = genes[i].Alpha() * cells[j].Time(); 492 | double var = 1.0/(2*genes[i].Alpha()/(genes[i].Sigma_squared()*(exp(2*at)-1)) + 1.0/genes[i].Initial_dispersion()); 493 | double mu = (2*genes[i].Alpha()/(genes[i].Sigma_squared()*(exp(2*at)-1))*(exp(at)*cells[j].Xn(i)+(1-exp(at))*genes[i].Theta_null()) + genes[i].Initial_expression()/genes[i].Initial_dispersion()) * var; 494 | return mu; 495 | } 496 | 497 | double X0_minus_mu_squared(int i, int j, int k){ 498 | double at = genes[i].Alpha() * cells[j].Time(); 499 | double var = 1.0/(2*genes[i].Alpha()/(genes[i].Sigma_squared()*(exp(2*at)-1)) + 1.0/genes[i].Initial_dispersion()); 500 | double mu; 501 | if(var < 0.000001){ 502 | mu = exp(at)*cells[j].Xn(i)+(1-exp(at))*lineages[k].Theta(i); 503 | } 504 | else{ 505 | mu = (2*genes[i].Alpha()/(genes[i].Sigma_squared()*(exp(2*at)-1))*(exp(at)*cells[j].Xn(i)+(1-exp(at))*lineages[k].Theta(i)) + genes[i].Initial_expression()/genes[i].Initial_dispersion()) * var; 506 | } 507 | double tmp = mu - genes[i].Initial_expression(); 508 | 509 | return (tmp * tmp) + var; 510 | } 511 | 512 | //for null model 513 | double 
X0_minus_mu_squared(int i, int j){ 514 | double at = genes[i].Alpha() * cells[j].Time(); 515 | double var = 1.0/(2*genes[i].Alpha()/(genes[i].Sigma_squared()*(exp(2*at)-1)) + 1.0/genes[i].Initial_dispersion()); 516 | double mu = (2*genes[i].Alpha()/(genes[i].Sigma_squared()*(exp(2*at)-1))*(exp(at)*cells[j].Xn(i)+(1-exp(at))*genes[i].Theta_null()) + genes[i].Initial_expression()/genes[i].Initial_dispersion()) * var; 517 | double tmp = mu - genes[i].Initial_expression(); 518 | 519 | return (tmp * tmp) + var; 520 | } 521 | 522 | double X0_minus_theta_squared(int i, int j, int k){ 523 | double at = genes[i].Alpha() * cells[j].Time(); 524 | double var = 1.0/(2*genes[i].Alpha()/(genes[i].Sigma_squared()*(exp(2*at)-1)) + 1.0/genes[i].Initial_dispersion()); 525 | double mu; 526 | if(var < 0.000001){ 527 | mu = exp(at)*cells[j].Xn(i)+(1-exp(at))*lineages[k].Theta(i); 528 | } 529 | else{ 530 | mu = (2*genes[i].Alpha()/(genes[i].Sigma_squared()*(exp(2*at)-1))*(exp(at)*cells[j].Xn(i)+(1-exp(at))*lineages[k].Theta(i)) + genes[i].Initial_expression()/genes[i].Initial_dispersion()) * var; 531 | } 532 | double tmp = mu - lineages[k].Theta(i); 533 | 534 | return (tmp * tmp) + var; 535 | } 536 | 537 | double X0_minus_theta_squared(int i, int j, int k, double t){ 538 | double at = genes[i].Alpha() * t; 539 | double var = 1.0/(2*genes[i].Alpha()/(genes[i].Sigma_squared()*(exp(2*at)-1)) + 1.0/genes[i].Initial_dispersion()); 540 | double mu; 541 | if(var < 0.000001){ 542 | mu = exp(at)*cells[j].Xn(i)+(1-exp(at))*lineages[k].Theta(i); 543 | } 544 | else{ 545 | mu = (2*genes[i].Alpha()/(genes[i].Sigma_squared()*(exp(2*at)-1))*(exp(at)*cells[j].Xn(i)+(1-exp(at))*lineages[k].Theta(i)) + genes[i].Initial_expression()/genes[i].Initial_dispersion()) * var; 546 | } 547 | double tmp = mu - lineages[k].Theta(i); 548 | 549 | return (tmp * tmp) + var; 550 | } 551 | 552 | //for null model 553 | double X0_minus_theta_squared(int i, int j){ 554 | double at = genes[i].Alpha() * cells[j].Time(); 555 | 
double var = 1.0/(2*genes[i].Alpha()/(genes[i].Sigma_squared()*(exp(2*at)-1)) + 1.0/genes[i].Initial_dispersion()); 556 | double mu = (2*genes[i].Alpha()/(genes[i].Sigma_squared()*(exp(2*at)-1))*(exp(at)*cells[j].Xn(i)+(1-exp(at))*genes[i].Theta_null()) + genes[i].Initial_expression()/genes[i].Initial_dispersion()) * var; 557 | double tmp = mu - genes[i].Theta_null(); 558 | 559 | return (tmp * tmp) + var; 560 | } 561 | 562 | double Xn_minus_theta_squared(int i, int j, int k){ 563 | double tmp = cells[j].Xn(i) - lineages[k].Theta(i); 564 | return tmp * tmp; 565 | } 566 | 567 | //for null model 568 | double Xn_minus_theta_squared(int i, int j){ 569 | double tmp = cells[j].Xn(i) - genes[i].Theta_null(); 570 | return tmp * tmp; 571 | } 572 | 573 | double X0_minus_theta_times_Xn_minus_theta(int i, int j, int k){ 574 | double at = genes[i].Alpha() * cells[j].Time(); 575 | double var = 1.0/(2*genes[i].Alpha()/(genes[i].Sigma_squared()*(exp(2*at)-1)) + 1.0/genes[i].Initial_dispersion()); 576 | double mu; 577 | if(var < 0.000001){ 578 | mu = exp(at)*cells[j].Xn(i)+(1-exp(at))*lineages[k].Theta(i); 579 | } 580 | else{ 581 | mu = (2*genes[i].Alpha()/(genes[i].Sigma_squared()*(exp(2*at)-1))*(exp(at)*cells[j].Xn(i)+(1-exp(at))*lineages[k].Theta(i)) + genes[i].Initial_expression()/genes[i].Initial_dispersion()) * var; 582 | } 583 | double tmp1 = mu - lineages[k].Theta(i); 584 | double tmp2 = cells[j].Xn(i) - lineages[k].Theta(i); 585 | return tmp1 * tmp2; 586 | } 587 | 588 | double X0_minus_theta_times_Xn_minus_theta(int i, int j, int k, double t){ 589 | double at = genes[i].Alpha() * t; 590 | double var = 1.0/(2*genes[i].Alpha()/(genes[i].Sigma_squared()*(exp(2*at)-1)) + 1.0/genes[i].Initial_dispersion()); 591 | double mu; 592 | if(var < 0.000001){ 593 | mu = exp(at)*cells[j].Xn(i)+(1-exp(at))*lineages[k].Theta(i); 594 | } 595 | else{ 596 | mu = (2*genes[i].Alpha()/(genes[i].Sigma_squared()*(exp(2*at)-1))*(exp(at)*cells[j].Xn(i)+(1-exp(at))*lineages[k].Theta(i)) + 
genes[i].Initial_expression()/genes[i].Initial_dispersion()) * var; 597 | } 598 | double tmp1 = mu - lineages[k].Theta(i); 599 | double tmp2 = cells[j].Xn(i) - lineages[k].Theta(i); 600 | return tmp1 * tmp2; 601 | } 602 | 603 | //for null model 604 | double X0_minus_theta_times_Xn_minus_theta(int i, int j){ 605 | double at = genes[i].Alpha() * cells[j].Time(); 606 | double var = 1.0/(2*genes[i].Alpha()/(genes[i].Sigma_squared()*(exp(2*at)-1)) + 1.0/genes[i].Initial_dispersion()); 607 | double mu = (2*genes[i].Alpha()/(genes[i].Sigma_squared()*(exp(2*at)-1))*(exp(at)*cells[j].Xn(i)+(1-exp(at))*genes[i].Theta_null()) + genes[i].Initial_expression()/genes[i].Initial_dispersion()) * var; 608 | double tmp1 = mu - genes[i].Theta_null(); 609 | double tmp2 = cells[j].Xn(i) - genes[i].Theta_null(); 610 | return tmp1 * tmp2; 611 | } 612 | 613 | double Log_likelihood(){ 614 | double ll = 0.0; 615 | double tmp1, tmp2; 616 | for(int i=0; i<_cell_num; i++){ 617 | for(int j=0; j<_K; j++){ 618 | tmp1 = log(lineages[j].Pi()); 619 | for(int k=0; k<_gene_num; k++){ 620 | if(cells[i]._missing[k] != 1){ 621 | tmp1 += genes[k].LogOU(cells[i].Xn(k), cells[i].Time(), j); 622 | } 623 | } 624 | 625 | if(j == 0) 626 | tmp2 = tmp1; 627 | else 628 | tmp2 = logsumexp(tmp2, tmp1); 629 | } 630 | 631 | ll += tmp2; 632 | } 633 | return ll; 634 | } 635 | 636 | double Log_likelihood_null_model(){ 637 | double ll = 0.0; 638 | double tmp1; 639 | for(int i=0; i<_cell_num; i++){ 640 | tmp1 = 0.0; 641 | for(int k=0; k<_gene_num; k++){ 642 | if(cells[i]._missing[k] != 1){ 643 | tmp1 += genes[k].LogOU_null_model(cells[i].Xn(k), cells[i].Time()); 644 | } 645 | } 646 | 647 | ll += tmp1; 648 | } 649 | return ll; 650 | } 651 | 652 | double Log_likelihood_of_cell(int id){ 653 | double tmp1, tmp2; 654 | 655 | for(int j=0; j<_K; j++){ 656 | tmp1 = log(lineages[j].Pi()); 657 | for(int k=0; k<_gene_num; k++){ 658 | if(cells[id]._missing[k] != 1){ 659 | tmp1 += genes[k].LogOU(cells[id].Xn(k), cells[id].Time(), j); 
660 | } 661 | } 662 | 663 | if(j == 0) 664 | tmp2 = tmp1; 665 | else 666 | tmp2 = logsumexp(tmp2, tmp1); 667 | } 668 | 669 | return tmp2; 670 | } 671 | 672 | double Log_likelihood_of_cell(int id, double tmp_time){ 673 | double tmp1, tmp2; 674 | 675 | for(int j=0; j<_K; j++){ 676 | tmp1 = log(lineages[j].Pi()); 677 | for(int k=0; k<_gene_num; k++){ 678 | if(cells[id]._missing[k] != 1){ 679 | tmp1 += genes[k].LogOU(cells[id].Xn(k), tmp_time, j); 680 | } 681 | } 682 | 683 | if(j == 0) 684 | tmp2 = tmp1; 685 | else 686 | tmp2 = logsumexp(tmp2, tmp1); 687 | } 688 | 689 | return tmp2; 690 | } 691 | 692 | int Set_initial_parameter(FILE *fp){ 693 | int count, id; 694 | double expression, dispersion; 695 | for(int i=0; i<_gene_num; i++){ 696 | count = fscanf(fp, "%d\t%lf\t%lf\n", &id, &expression, &dispersion); 697 | if(count == EOF){ 698 | printf("error at reading initial parameter\n"); 699 | return 1; 700 | } 701 | 702 | genes[i].Add_initial_expression(expression); 703 | genes[i].Add_initial_dispersion(dispersion); 704 | } 705 | return 0; 706 | } 707 | 708 | void Set_initial_parameter2(){ 709 | srand((unsigned)time(NULL)); 710 | int sample_size = _cell_num/_K; 711 | double tmp_theta; 712 | 713 | for(int i=0; i<_gene_num; i++){ 714 | for(int j=0; j<_K; j++){ 715 | tmp_theta = 0.0; 716 | for(int k=0; k > expr(_gene_num, vector(_cell_num, 0)); 996 | 997 | for(int i=0; i<_cell_num; i++){ 998 | for(int j=0; j<_gene_num; j++){ 999 | //todo mixture 1000 | id = 0; 1001 | 1002 | at = genes[j].Alpha() * cells[i].Time(); 1003 | mean = exp(-at)*genes[j].Initial_expression() + (1 - exp(-at))*lineages[id].Theta(j); 1004 | variance = genes[j].Sigma_squared()*(1-exp(-2*at))/(2*genes[j].Alpha()) + exp(-2*at)*genes[j].Initial_dispersion(); 1005 | normalized_value = (cells[i].Xn(j) - mean)/(sqrtf(variance)); 1006 | 1007 | expr[j][i] = normalized_value; 1008 | } 1009 | } 1010 | 1011 | //print normalized expression 1012 | for(int g=0; g<_gene_num; g++){ 1013 | for(int c=0; c<_cell_num; c++){ 
1014 | fprintf(fp, "%lf", expr[g][c]); 1015 | if(c != _cell_num-1){ 1016 | fprintf(fp, "\t"); 1017 | } 1018 | else{ 1019 | fprintf(fp, "\n"); 1020 | } 1021 | } 1022 | } 1023 | 1024 | //correlation 1025 | int effective_size; 1026 | double mean1, mean2, cov, var1, var2, cor; 1027 | for(int i=0; i<_gene_num; i++){ 1028 | for(int j=0; j<_gene_num; j++){ 1029 | effective_size = 0; 1030 | mean1 = 0; 1031 | mean2 = 0; 1032 | for(int c=0; c<_cell_num; c++){ 1033 | mean1 += expr[i][c]; 1034 | mean2 += expr[j][c]; 1035 | effective_size++; 1036 | } 1037 | mean1 /= effective_size; 1038 | mean2 /= effective_size; 1039 | 1040 | cov = 0; 1041 | var1 = 0; 1042 | var2 = 0; 1043 | for(int c=0; c<_cell_num; c++){ 1044 | cov += (expr[i][c] - mean1) * (expr[j][c] - mean2); 1045 | var1 += (expr[i][c] - mean1) * (expr[i][c] - mean1); 1046 | var2 += (expr[j][c] - mean2) * (expr[j][c] - mean2); 1047 | } 1048 | 1049 | if(effective_size <= 1){ 1050 | cor = 0; 1051 | } 1052 | else if(i == j){ 1053 | cor = 0; 1054 | } 1055 | else{ 1056 | cor = cov/(sqrtf(var1) * sqrtf(var2)); 1057 | } 1058 | fprintf(fcor, "%lf", cor); 1059 | 1060 | if(j != _gene_num-1){ 1061 | fprintf(fcor, "\t"); 1062 | } 1063 | else{ 1064 | fprintf(fcor, "\n"); 1065 | } 1066 | } 1067 | } 1068 | } 1069 | 1070 | /* 1071 | //for debug 1072 | void Check_parameter_optimization(){ 1073 | srand((unsigned) time(NULL)); 1074 | 1075 | for(int i=0; i<5; i++){ 1076 | string tmp="out/check_time_"; 1077 | tmp += to_string(i); 1078 | tmp += ".txt"; 1079 | FILE *fout = fopen(tmp.c_str(), "w"); 1080 | int cell_id = rand()%100; 1081 | double opt_time = cells[cell_id].Time(); 1082 | fprintf(fout, "%lf\t%lf\n", opt_time, Log_likelihood()); 1083 | for(double t=0.1; t<2.0; t+= 0.1){ 1084 | cells[cell_id].Add_new_time(t); 1085 | cells[cell_id].Update_parameter(); 1086 | fprintf(fout, "%lf\t%lf\n", t, Log_likelihood()); 1087 | } 1088 | 1089 | cells[cell_id].Add_new_time(opt_time); 1090 | cells[cell_id].Update_parameter(); 1091 | fclose(fout); 1092 
| } 1093 | 1094 | for(int i=0; i<5; i++){ 1095 | string tmp="out/check_alpha_theta_"; 1096 | tmp += to_string(i); 1097 | tmp += ".txt"; 1098 | FILE *fout = fopen(tmp.c_str(), "w"); 1099 | int gene_id = rand()%500; 1100 | double opt_alpha = genes[gene_id].Alpha(); 1101 | double opt_theta = lineages[0].Theta(gene_id); 1102 | fprintf(fout, "%lf\t%lf\t%lf\n", opt_alpha, opt_theta, Log_likelihood()); 1103 | for(double a=0.1; a<10; a+= 0.5){ 1104 | for(double theta=-10; theta<11; theta+=1){ 1105 | genes[gene_id]._alpha = a; 1106 | lineages[0]._theta[gene_id] = theta; 1107 | fprintf(fout, "%lf\t%lf\t%lf\n", a, theta, Log_likelihood()); 1108 | } 1109 | } 1110 | 1111 | genes[gene_id]._alpha = opt_alpha; 1112 | lineages[0]._theta[gene_id] = opt_theta; 1113 | } 1114 | 1115 | for(int i=0; i<5; i++){ 1116 | string tmp="out/check_sigma_theta_"; 1117 | tmp += to_string(i); 1118 | tmp += ".txt"; 1119 | FILE *fout = fopen(tmp.c_str(), "w"); 1120 | int gene_id = rand()%500; 1121 | double opt_sigma = genes[gene_id].Sigma_squared(); 1122 | double opt_theta = lineages[0].Theta(gene_id); 1123 | fprintf(fout, "%lf\t%lf\t%lf\n", opt_sigma, opt_theta, Log_likelihood()); 1124 | for(double sigma=0.1; sigma<100; sigma+= 5.0){ 1125 | for(double theta=-10; theta<11; theta+=1){ 1126 | genes[gene_id]._sigma_squared = sigma; 1127 | lineages[0]._theta[gene_id] = theta; 1128 | fprintf(fout, "%lf\t%lf\t%lf\n", sigma, theta, Log_likelihood()); 1129 | } 1130 | } 1131 | 1132 | genes[gene_id]._sigma_squared = opt_sigma; 1133 | lineages[0]._theta[gene_id] = opt_theta; 1134 | } 1135 | 1136 | for(int i=0; i<5; i++){ 1137 | string tmp="out/check_alpha_sigma_"; 1138 | tmp += to_string(i); 1139 | tmp += ".txt"; 1140 | FILE *fout = fopen(tmp.c_str(), "w"); 1141 | int gene_id = rand()%500; 1142 | double opt_alpha = genes[gene_id].Alpha(); 1143 | double opt_sigma = genes[gene_id].Sigma_squared(); 1144 | fprintf(fout, "%lf\t%lf\t%lf\n", opt_alpha, opt_sigma, Log_likelihood()); 1145 | for(double a=0.1; a<10; a+= 
0.5){ 1146 | for(double sigma=0.1; sigma<100; sigma+= 5.0){ 1147 | genes[gene_id]._alpha = a; 1148 | genes[gene_id]._sigma_squared = sigma; 1149 | fprintf(fout, "%lf\t%lf\t%lf\n", a, sigma, Log_likelihood()); 1150 | } 1151 | } 1152 | 1153 | genes[gene_id]._alpha = opt_alpha; 1154 | genes[gene_id]._sigma_squared = opt_sigma; 1155 | } 1156 | } 1157 | */ 1158 | }; 1159 | 1160 | --------------------------------------------------------------------------------