;
112 |
113 | std::vector triplets;
114 | triplets.reserve(nnzero);
115 |
116 | while( hts_getline(htsf, KS_SEP_LINE, &str) ){
117 | if( !str.l ) break;
118 |
119 | offsets = ksplit(&str, 0, &n_fields);
120 |
121 | if( n_fields != 3 ){
122 | std::cerr << "ERROR: Malformed eigen file.\n";
123 | // std::cerr << "Line " << triplets.size() << "\n";
124 | // std::cerr << "No. fields " << n_fields << "\n";
125 | abort();
126 | }
127 |
128 | int ii = std::atoi(str.s + offsets[0]);
129 | int jj = std::atoi(str.s + offsets[1]);
130 | double val = std::atof(str.s + offsets[2]);
131 |
132 | triplets.push_back(td(ii,jj,val));
133 | }
134 |
135 | eig_vec.setFromTriplets(triplets.begin(), triplets.end());
136 |
137 | // -------------------------
138 | // 5) Close input file.
139 |
140 | ks_free(&str);
141 | hts_close(htsf);
142 | }
143 |
144 |
145 | void write_eigen(const std::string& file_prefix, Eigen::SparseMatrix& eig_vec, Eigen::VectorXd& eig_val, const std::vector& kp_ids){
146 |
147 | // std::cerr << "Start write eigen.\n"; // NOTE: DEBUG LINE
148 |
149 | // std::cerr << "Prefix: " << file_prefix << "\n"; // NOTE: DEBUG LINE
150 |
151 | std::string file_name = file_prefix + ".eigen.gz";
152 |
153 |
154 | // std::cerr << "Prefix: " << file_prefix << "\n"; // NOTE: DEBUG LINE
155 |
156 | // file_name = std::regex_replace(file_name, std::regex(".eigen.gz.eigen.gz"), std::string(".eigen.gz"));
157 |
158 | // std::cerr << "Processed file name.\n"; // NOTE: DEBUG LINE
159 |
160 | // std::cerr << file_name << "\n"; // NOTE: DEBUG LINE
161 |
162 | // ----------------------------------------------
163 | // ----------------------------------------------
164 | // Open output file
165 |
166 | BGZF* out_file = bgzf_open(file_name.c_str(), "w");
167 |
168 | // std::cerr << "Opened file.\n"; // NOTE: DEBUG LINE
169 |
170 | // -------------------------
171 | // 1) First line contains number of samples, number of eigenvectors, number non-zero.
172 |
173 | write_to_bgzf(std::to_string( kp_ids.size() ), out_file );
174 | write_to_bgzf("\t", out_file );
175 | write_to_bgzf(std::to_string( eig_val.size() ), out_file );
176 | write_to_bgzf("\t", out_file );
177 | write_to_bgzf(std::to_string( eig_vec.nonZeros() ), out_file );
178 | write_to_bgzf("\n", out_file );
179 |
180 | // std::cerr << "1\n"; // NOTE: DEBUG LINE
181 |
182 | // -------------------------
183 | // 2) Second line contains list of sample IDs
184 |
185 | write_to_bgzf(kp_ids[0], out_file);
186 | for( int i = 1; i < kp_ids.size(); i++ ){
187 | write_to_bgzf("\t", out_file );
188 | write_to_bgzf(kp_ids[i], out_file );
189 | }
190 | write_to_bgzf("\n", out_file );
191 |
192 | // std::cerr << "2\n"; // NOTE: DEBUG LINE
193 |
194 | // -------------------------
195 | // 3) Third line contains eigenvalues.
196 |
197 | write_to_bgzf(std::to_string(eig_val.coeffRef(0)), out_file );
198 | for( int i = 1; i < eig_val.size(); i++ ){
199 | write_to_bgzf("\t", out_file );
200 | write_to_bgzf(std::to_string(eig_val.coeffRef(i)), out_file );
201 | }
202 | write_to_bgzf("\n", out_file);
203 |
204 | // std::cerr << "3\n"; // NOTE: DEBUG LINE
205 |
206 | // -------------------------
207 | // 4) Remaining lines contain eigenvector triplets.
208 |
209 | for (int i = 0; i < eig_vec.outerSize(); i++)
210 | {
211 | for (Eigen::SparseMatrix::InnerIterator trip(eig_vec,i); trip; ++trip)
212 | {
213 | write_to_bgzf(std::to_string( trip.row() ), out_file );
214 | write_to_bgzf("\t", out_file );
215 | write_to_bgzf(std::to_string( trip.col() ), out_file );
216 | write_to_bgzf("\t", out_file );
217 | write_to_bgzf(std::to_string( trip.value() ), out_file );
218 | write_to_bgzf("\n", out_file );
219 | }
220 | }
221 |
222 | // -------------------------
223 | // 5) Close the output file.
224 |
225 | bgzf_close(out_file);
226 |
227 | }
228 |
229 |
230 |
/**
 * Record that elements i and j belong to the same block (cluster).
 *
 * cluster_ids[k] holds the index into `clusters` of the block containing
 * element k, or a negative value if k has not been assigned to a block yet.
 * clusters[c] is the set of elements in block c.
 *
 * Cases handled:
 *   - i == j           : nothing to do;
 *   - neither assigned : a new block {i, j} is appended;
 *   - one assigned     : the unassigned element joins the other's block;
 *   - both assigned to different blocks : the block with the larger index is
 *     merged into the one with the smaller index and removed, and every
 *     block id above the removed index is shifted down by one.
 */
void update_blocks( int i, int j, std::vector<int>& cluster_ids, std::vector<std::set<int>>& clusters){

    // A diagonal element does not affect the block structure.
    if( i == j ) return;

    int id_lo = cluster_ids[i];
    int id_hi = cluster_ids[j];

    // Order the pair so that i carries the smaller block id.
    if( id_hi < id_lo ){
        std::swap(id_lo, id_hi);
        std::swap(i, j);
    }

    // id_lo <= id_hi, so a negative id_hi means both are unassigned.
    if( id_hi < 0 ){
        const int fresh_id = clusters.size();
        cluster_ids[i] = fresh_id;
        cluster_ids[j] = fresh_id;
        clusters.push_back({i, j});
        return;
    }

    // Only i can still be unassigned here: it joins j's block.
    if( id_lo < 0 ){
        cluster_ids[i] = id_hi;
        clusters[id_hi].insert(i);
        return;
    }

    // Already in the same block.
    if( id_lo == id_hi ) return;

    // Merge block id_hi into block id_lo and relabel its members.
    for( const int member : clusters[id_hi] ){
        clusters[id_lo].insert(member);
        cluster_ids[member] = id_lo;
    }

    // Drop the absorbed block; blocks stored after it move down one slot.
    clusters.erase( clusters.begin() + id_hi );
    for( int& id : cluster_ids ){
        if( id > id_hi ) --id;
    }
}
290 |
291 |
292 | void read_sparse_GRM(const std::string& filename, Eigen::SparseMatrix& GRM, const std::vector& kp_ids, const double& r_scale, const int& r_col, std::vector>& related)
293 | {
294 | int n = kp_ids.size();
295 |
296 | GRM = Eigen::SparseMatrix(n,n);
297 |
298 | if( filename == "" ){
299 | GRM.setIdentity();
300 | return;
301 | }
302 |
303 | std::vector cluster_ids;
304 | std::vector> clusters;
305 |
306 | std::unordered_map id_map;
307 | for( int i = 0; i < n; i++ ){
308 | id_map[ kp_ids[i] ] = i;
309 | cluster_ids.push_back(-1);
310 | }
311 |
312 | std::vector id1, id2;
313 | std::vector val;
314 |
315 | data_parser dp;
316 | dp.add_field(id1, 0);
317 | dp.add_field(id2, 1);
318 | dp.add_field(val, r_col - 1);
319 |
320 | int nr = 0;
321 | dp.parse_file(filename, nr);
322 |
323 | using td = Eigen::Triplet;
324 |
325 | std::vector triplets;
326 |
327 | bool add_diag = true;
328 |
329 | for( int i = 0; i < nr; i++ )
330 | {
331 | const auto f1 = id_map.find(id1[i]);
332 | const auto f2 = id_map.find(id2[i]);
333 |
334 | if( add_diag ){
335 | if( id1[i] == id2[i] ){
336 | add_diag = false;
337 | }
338 | }
339 |
340 | if( f1 != id_map.end() && f2 != id_map.end() )
341 | {
342 | int& ii = f1->second;
343 | int& jj = f2->second;
344 |
345 | update_blocks( ii, jj, cluster_ids, clusters);
346 |
347 | triplets.push_back(td(ii,jj,r_scale*val[i]));
348 | if( ii != jj ){
349 | triplets.push_back(td(jj,ii,r_scale*val[i]));
350 | }
351 | }
352 |
353 | }
354 | if( add_diag ){
355 | for(int i = 0; i < n; ++i) triplets.push_back(td(i,i,1.0));
356 | }
357 |
358 | related.clear();
359 | for( const auto& s : clusters ){
360 | related.push_back(std::vector(s.begin(), s.end()));
361 | }
362 |
363 | GRM.setFromTriplets(triplets.begin(), triplets.end());
364 | }
365 |
--------------------------------------------------------------------------------
/src/eigenData.hpp:
--------------------------------------------------------------------------------
1 | #ifndef EIGENDATA_HPP
2 | #define EIGENDATA_HPP
3 |
4 | #include
5 | #include
6 | #include
7 | #include
8 |
9 | #include "setOptions.hpp"
10 | #include "mapID.hpp"
11 | #include "htsWrappers.hpp"
12 | #include "dataParser.hpp"
13 | #include "genotypeData.hpp"
14 |
15 | #include
16 | #include
17 |
18 | void update_blocks( int i, int j, std::vector& cluster_ids, std::vector>& clusters);
19 |
20 | void read_eigen(const std::string& file_name, Eigen::SparseMatrix& eig_vec, Eigen::VectorXd& eig_val, const std::vector& kp_ids);
21 |
22 | void write_eigen(const std::string& file_prefix, Eigen::SparseMatrix& eig_vec, Eigen::VectorXd& eig_val, const std::vector& kp_ids);
23 |
24 | void read_sparse_GRM(const std::string& filename, Eigen::SparseMatrix& GRM, const std::vector& kp_ids, const double& r_scale, const int& r_col, std::vector>& related);
25 |
26 | void read_dense_GRM(const std::string&, Eigen::MatrixXd&, std::vector&);
27 |
28 | #endif
--------------------------------------------------------------------------------
/src/fitUtils.cpp:
--------------------------------------------------------------------------------
1 | /* fitUtils:
2 |
3 | Copyright (C) 2020
4 | Author: Corbin Quick
5 |
6 | This file is a part of APEX.
7 |
8 | APEX is distributed "AS IS" in the hope that it will be
9 | useful, but WITHOUT ANY WARRANTY; without even the implied
10 | warranty of MERCHANTABILITY, NON-INFRINGEMENT, or FITNESS
11 | FOR A PARTICULAR PURPOSE.
12 |
13 | The above copyright notice and disclaimer of warranty must
14 | be included in all copies or substantial portions of APEX.
15 | */
16 |
17 | #include "fitUtils.hpp"
18 |
19 |
20 | Eigen::MatrixXd resid( const Eigen::MatrixXd& Y, const Eigen::MatrixXd& X ){
21 | return Y - X * (X.transpose() * X).ldlt().solve(X.transpose() * Y);
22 | }
23 |
24 | void rank_normalize(Eigen::MatrixXd& Y){
25 | double m = Y.cols();
26 | double n = Y.rows();
27 |
28 | std::vector z((int) n);
29 | std::vector rk((int) n);
30 |
31 | double mu = 0;
32 | double sd = 0;
33 | for(int i = 0; i < n; ++i){
34 | z[i] = qnorm( ((double)i+1.0)/((double)n+1.0) );
35 | mu += z[i];
36 | sd += z[i]*z[i];
37 | }
38 | sd = std::sqrt( sd/(n - 1) - mu*mu/( n*(n - 1.0) ) );
39 | mu = mu/n;
40 | for(int i = 0; i < n; ++i){
41 | z[i] = (z[i] - mu)/sd;
42 | }
43 |
44 | for(int j = 0; j < m; ++j){
45 |
46 | std::vector v(n);
47 | for(int i = 0; i< n; ++i){
48 | v[i] = Y(i,j);
49 | }
50 |
51 | std::vector ranks = rank_vector(v);
52 |
53 | for(int i = 0; i< n; ++i){
54 | Y(i,j) = z[ ranks[i]-1 ];
55 | }
56 | }
57 | }
58 |
59 | void scale_and_center(Eigen::MatrixXd& Y, std::vector& sd_vec){
60 | double m = Y.cols();
61 | double n = Y.rows();
62 |
63 | if( sd_vec.size() != m ) sd_vec = std::vector(m, 0.0);
64 |
65 | for(int j = 0; j < m; ++j){
66 | double mu = 0;
67 | double sd = 0;
68 | for(int i = 0; i < n; ++i){
69 | mu += Y(i,j);
70 | sd += Y(i,j)*Y(i,j);
71 | }
72 | sd = std::sqrt( sd/(n - 1) - mu*mu/( n*(n - 1.0) ) );
73 | sd_vec[j] = sd;
74 | mu = mu/n;
75 | for(int i = 0; i < n; ++i){
76 | Y(i,j) = (Y(i,j) - mu)/sd;
77 | }
78 | }
79 | }
80 |
81 | void scale_and_center(Eigen::MatrixXd& Y){
82 | double m = Y.cols();
83 | double n = Y.rows();
84 |
85 | for(int j = 0; j < m; j++){
86 | double mu = 0;
87 | double sd = 0;
88 | for(int i = 0; i < n; i++){
89 | mu += Y(i,j);
90 | sd += Y(i,j)*Y(i,j);
91 | }
92 | sd = std::sqrt( sd/(n - 1) - mu*mu/( n*(n - 1.0) ) );
93 | mu = mu/n;
94 | for(int i = 0; i < n; i++){
95 | Y(i,j) = (Y(i,j) - mu)/sd;
96 | }
97 | }
98 | }
99 |
100 | Eigen::MatrixXd get_half_hat_matrix(const Eigen::MatrixXd& X){
101 | Eigen::SelfAdjointEigenSolver XtX_es(X.transpose() * X);
102 | Eigen::VectorXd lambda = XtX_es.eigenvalues();
103 | for( auto& a : lambda ){
104 | if( a <= 0.00 ){
105 | std::cerr << "ERROR: Covariate matrix is not full rank. Check input. \n";
106 | abort();
107 | }
108 | a = 1/std::sqrt(a);
109 | }
110 | Eigen::MatrixXd U = X * XtX_es.eigenvectors() * lambda.asDiagonal();
111 | return U;
112 | }
113 |
114 | void make_half_hat_matrix(Eigen::MatrixXd& X){
115 | Eigen::SelfAdjointEigenSolver XtX_es(X.transpose() * X);
116 | Eigen::VectorXd lambda = XtX_es.eigenvalues();
117 | for( auto& a : lambda ){
118 | if( a <= 0.00 ){
119 | std::cerr << "ERROR: Covariate matrix is not full rank. Check input. \n";
120 | abort();
121 | }
122 | a = 1/std::sqrt(a);
123 | }
124 | X *= XtX_es.eigenvectors();
125 | X *= lambda.asDiagonal();
126 | return;
127 | }
128 |
129 | Eigen::MatrixXd resid_from_half_hat( const Eigen::MatrixXd& Y, const Eigen::MatrixXd& C ){
130 | Eigen::MatrixXd CtY = C.transpose() * Y;
131 | return (Y - C * CtY).eval();
132 | }
133 |
134 | void make_resid_from_half_hat( Eigen::MatrixXd& Y, const Eigen::MatrixXd& C ){
135 | Eigen::MatrixXd CtY = C.transpose() * Y;
136 | Y.noalias() -= (C *CtY);
137 | return;
138 | }
139 |
140 | Eigen::VectorXd resid_vec_from_half_hat( const Eigen::VectorXd& Y, const Eigen::MatrixXd& C ){
141 | Eigen::VectorXd CtY = C.transpose() * Y;
142 | Eigen::VectorXd Yhat = C * CtY;
143 | return (Y - Yhat).eval();
144 | }
145 |
146 | void appendInterceptColumn( Eigen::MatrixXd &X ){
147 | X.conservativeResize(Eigen::NoChange, X.cols()+1);
148 | X.col(X.cols()-1) = Eigen::VectorXd::Ones(X.rows());
149 | return;
150 | }
151 |
152 |
153 |
--------------------------------------------------------------------------------
/src/fitUtils.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (C) 2020
3 | Author: Corbin Quick
4 |
5 | This file is a part of APEX.
6 |
7 | APEX is distributed "AS IS" in the hope that it will be
8 | useful, but WITHOUT ANY WARRANTY; without even the implied
9 | warranty of MERCHANTABILITY, NON-INFRINGEMENT, or FITNESS
10 | FOR A PARTICULAR PURPOSE.
11 |
12 | The above copyright notice and disclaimer of warranty must
13 | be included in all copies or substantial portions of APEX.
14 | */
15 |
16 | #ifndef FITUTILS_HPP
17 | #define FITUTILS_HPP
18 |
19 | #include
20 | #include
21 | #include
22 |
23 | #include
24 | #include
25 |
26 | #include "mathStats.hpp"
27 |
28 |
29 | inline void printMeanDiag( const Eigen::SparseMatrix& mat ){
30 | std::cout << "Mean diagonal element = " << mat.diagonal().mean() << "\n";
31 | }
32 |
33 | inline void printMeanDiag( const Eigen::MatrixXd& mat ){
34 | std::cout << "Mean diagonal element = " << mat.diagonal().mean() << "\n";
35 | }
36 |
37 | void appendInterceptColumn(Eigen::MatrixXd&);
38 | Eigen::MatrixXd get_half_hat_matrix(const Eigen::MatrixXd&);
39 | Eigen::MatrixXd resid_from_half_hat(const Eigen::MatrixXd&, const Eigen::MatrixXd&);
40 |
41 | Eigen::VectorXd resid_vec_from_half_hat(const Eigen::VectorXd& Y, const Eigen::MatrixXd&);
42 | void make_half_hat_matrix(Eigen::MatrixXd& X);
43 | void make_resid_from_half_hat( Eigen::MatrixXd& Y, const Eigen::MatrixXd& C );
44 |
45 |
46 | void scale_and_center(Eigen::MatrixXd&, std::vector&);
47 | void scale_and_center(Eigen::MatrixXd&);
48 | void rank_normalize (Eigen::MatrixXd&);
49 |
50 | #endif
51 |
52 |
--------------------------------------------------------------------------------
/src/genotypeData.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (C) 2020
3 | Author: Corbin Quick
4 |
5 | This file is a part of APEX.
6 |
7 | APEX is distributed "AS IS" in the hope that it will be
8 | useful, but WITHOUT ANY WARRANTY; without even the implied
9 | warranty of MERCHANTABILITY, NON-INFRINGEMENT, or FITNESS
10 | FOR A PARTICULAR PURPOSE.
11 |
12 | The above copyright notice and disclaimer of warranty must
13 | be included in all copies or substantial portions of APEX.
14 | */
15 |
16 | #include "genotypeData.hpp"
17 |
18 | bool sp_geno_fmt = true;
19 |
20 | void genotype_data::read_bcf_variants(bcf_srs_t*& sr, bcf_hdr_t*& hdr, int& n_var, bool store_geno, bool scan_geno)
21 | {
22 |
23 | if( store_geno ){
24 | std::cerr << "Processed genotype data for ";
25 | }else{
26 | std::cerr << "Processed variant data for ";
27 | }
28 | std::string iter_cerr_suffix = " variants ... ";
29 |
30 | print_iter_cerr(1, 0, iter_cerr_suffix);
31 | int last = 0;
32 |
33 | initialize_genotypes(n_var);
34 | n_var = 0;
35 | geno_size = 0;
36 |
37 | while( bcf_sr_next_line(sr) )
38 | {
39 | bcf1_t *rec = bcf_sr_get_line(sr,0);
40 | if( process_bcf_variant(rec, hdr, store_geno, scan_geno) ){
41 | n_var++;
42 | if( store_geno ) geno_size++;
43 | }
44 | thinned_iter_cerr(last, n_var, iter_cerr_suffix, 2500);
45 | }
46 | print_iter_cerr(last, n_var, iter_cerr_suffix);
47 |
48 | geno_start = 0;
49 |
50 | get_ld_index();
51 |
52 | return;
53 | }
54 |
55 | int genotype_data::get_ld_index()
56 | {
57 | int bp = global_opts::ld_window_bp;
58 |
59 | int max_entries = 0;
60 |
61 | int ii = 0;
62 | int ii_s = 0;
63 | int ii_e = 0;
64 | while( ii < pos.size() ){
65 | while( pos[ii] - pos[ii_s] > bp && ii_s < pos.size() ){
66 | ii_s++;
67 | }
68 | while( pos[ii_e] - pos[ii] <= bp && ii_e < pos.size() ){
69 | ii_e++;
70 | }
71 |
72 | ld_index_range.push_back( std::make_pair(ii, ii_e) );
73 |
74 | if( ii_e - ii - 1 > max_entries ){
75 | max_entries = ii_e - ii - 1;
76 | }
77 |
78 | ii++;
79 | }
80 | return max_entries;
81 | }
82 |
83 | void genotype_data::clear_genotypes()
84 | {
85 | genotypes.setZero();
86 | genotypes.data().squeeze();
87 | }
88 |
89 | void genotype_data::clear()
90 | {
91 | n_variants = 0;
92 |
93 | chr.clear();
94 | pos.clear();
95 | ref.clear();
96 | alt.clear();
97 |
98 | mean.clear();
99 | var.clear();
100 |
101 | flipped.clear();
102 |
103 | clear_genotypes();
104 | }
105 |
106 | inline bool genotype_data::add_bcf_genotypes(int*& gt_rec, const int& col_n, double& mean_, double& var_, bool& flip_, const bool store_geno)
107 | {
108 | int n = 0;
109 | mean_ = 0;
110 | var_ = 0;
111 | flip_ = false;
112 | sparse_gt gts;
113 | std::vector missing;
114 | if( sp_geno_fmt ){
115 | gts.resize(ids.idx.size());
116 | }else{
117 | if( dense_genotypes.rows() < ids.keep.size() ){
118 | dense_genotypes.conservativeResize(ids.keep.size(), Eigen::NoChange);
119 | }
120 | if( dense_genotypes.cols() < col_n ){
121 | dense_genotypes.conservativeResize(Eigen::NoChange, col_n + chunk_size);
122 | }
123 | }
124 | int n_obs = 0;
125 |
126 | int n_2 = 0;
127 | int n_1 = 0;
128 |
129 | for(const int& i: ids.idx){
130 |
131 | int a1 = bcf_gt_allele(gt_rec[i*2+0]);
132 | int a2 = bcf_gt_allele(gt_rec[i*2+1]);
133 | if(a1 < 0 || a2 < 0)
134 | {
135 | if( global_opts::exclude_missing > 0 ){
136 | return false;
137 | }else{
138 | // a1 = a1 < 0 ? 0 : a1;
139 | // a2 = a2 < 0 ? 0 : a2;
140 | if(store_geno){
141 | if( sp_geno_fmt ){
142 | gts.set_gt(n,-1);
143 | }else{
144 | missing.push_back(i);
145 | }
146 | }
147 | }
148 | }else{
149 |
150 | if( a1 + a2 > 0 ){
151 | a1 += a2;
152 | if( a1 > 1){
153 | n_2++;
154 | }else{
155 | n_1++;
156 | }
157 | if(store_geno){
158 | if( sp_geno_fmt ){
159 | gts.set_gt(n,a1);
160 | }else{
161 | // DO DENSE
162 | // std::cerr << "DO DENSE";
163 | dense_genotypes(n, col_n) = a1;
164 | }
165 | }
166 | mean_ += a1;
167 | var_ += a1*a1;
168 | }
169 | n_obs++;
170 | }
171 | n++;
172 | }
173 |
174 | mean_ = mean_/((double) n_obs);
175 | var_ = (var_ - n_obs*mean_*mean_)/ ( (double) n_obs + 1 );
176 |
177 | if( n_2 > n_obs - n_1 - n_2 ){
178 | flip_ = true;
179 | mean_ = 2.0 - mean_;
180 | if( store_geno && sp_geno_fmt ) gts.flip(n);
181 | }
182 | if( store_geno && sp_geno_fmt ){
183 | // checkResize(col_n);
184 | gts.add_gt_sparsemat(genotypes, col_n);
185 | }
186 | if( store_geno && !sp_geno_fmt && missing.size() > 0 ){
187 | for(const int& i : missing ){
188 | dense_genotypes(i, col_n) = mean_;
189 | }
190 | }
191 |
192 | return true;
193 | }
194 |
195 | inline bool genotype_data::add_bcf_dosages(float*& ds_rec, const int& col_n, double& mean_, double& var_, bool& flip_, const bool store_geno)
196 | {
197 | int n = 0;
198 | int n_obs = 0;
199 | mean_ = 0;
200 | var_ = 0;
201 | flip_ = false;
202 | sparse_ds sp_ds;
203 | std::vector missing;
204 | if( sp_geno_fmt ){
205 | sp_ds.resize(ids.idx.size());
206 | }else{
207 | if( dense_genotypes.rows() < ids.keep.size() ){
208 | dense_genotypes.conservativeResize(ids.keep.size(), Eigen::NoChange);
209 | }
210 | if( dense_genotypes.cols() < col_n ){
211 | dense_genotypes.conservativeResize(Eigen::NoChange, col_n + chunk_size);
212 | }
213 | }
214 |
215 | int n_2 = 0;
216 | int n_0 = 0;
217 |
218 | for(const int& i: ids.idx)
219 | {
220 | float ds = ds_rec[i];
221 | if( ds < 0 ){
222 | if(store_geno){
223 | if( sp_geno_fmt ){
224 | sp_ds.set_ds(n,-1.00);
225 | }else{
226 | missing.push_back(i);
227 | }
228 | }
229 | }else{
230 | if( ds <= global_opts::dosage_thresh ){
231 | ds = 0.00;
232 | n_0++;
233 | }else if( ds >= 2.00 - global_opts::dosage_thresh ){
234 | ds = 2.00;
235 | n_2++;
236 | }
237 | if(store_geno){
238 | if( sp_geno_fmt ){
239 | sp_ds.set_ds(n,ds);
240 | }else{
241 | // std::cerr << "DO DENSE";
242 | dense_genotypes(n, col_n) = ds;
243 | }
244 | }
245 | mean_ += ds;
246 | var_ += ds*ds;
247 | n_obs++;
248 | }
249 | n++;
250 | }
251 |
252 | mean_ = mean_/((double) n_obs);
253 | var_ = (var_ - n_obs*mean_*mean_)/ ( (double) n_obs + 1 );
254 |
255 | if( n_2 > n_0 ){
256 | flip_ = true;
257 | mean_ = 2.0 - mean_;
258 | if( store_geno && sp_geno_fmt ) sp_ds.flip();
259 | }
260 | if( store_geno && sp_geno_fmt ){
261 | // checkResize(col_n);
262 | sp_ds.add_ds_sparsemat(genotypes, col_n);
263 | }
264 | if( store_geno && !sp_geno_fmt && missing.size() > 0 ){
265 | for(const int& i : missing ){
266 | dense_genotypes(i, col_n) = mean_;
267 | }
268 | }
269 |
270 | return true;
271 | }
272 |
273 | void genotype_data::read_bcf_header(bcf_hdr_t* hdr){
274 |
275 | n_samples = bcf_hdr_nsamples(hdr);
276 |
277 | for(int i = 0; i < n_samples; i++){
278 | ids.file.push_back(hdr->samples[i]);
279 | }
280 | if( ids.keep.size() == 0 ){
281 | ids.setKeepIDs(ids.file);
282 | }
283 | }
284 |
285 | void genotype_data::freeze_genotypes(){
286 | genotypes.conservativeResize(ids.keep.size(), geno_size);
287 | genotypes.finalize();
288 | genotypes.makeCompressed();
289 | }
290 |
291 | inline bool genotype_data::process_bcf_variant(bcf1_t*& rec, bcf_hdr_t*& hdr, bool store_geno, bool scan_geno){
292 |
293 | if( rec->n_allele != 2 ){
294 | std::cerr << "failed rec->n_allele" << rec->n_allele << "\n";
295 | return false;
296 | }
297 |
298 | if( n_samples <= 0 ){
299 | if( ids.keep.size() == 0 ){
300 | read_bcf_header(hdr);
301 | }else{
302 | n_samples = ids.keep.size();
303 | }
304 | }
305 |
306 | double mean_ = -1;
307 | double var_ = -1;
308 |
309 | bool flip_ = false;
310 | bool keep_ = true;
311 |
312 | // std::string snp_id(rec->d.id);
313 |
314 | if( scan_geno ){
315 |
316 | if( global_opts::use_dosages ){
317 | int nds_arr = 0;
318 | float *ds = NULL;
319 | bcf_get_format_float(hdr, rec, "DS", &ds, &nds_arr)/n_samples;
320 | keep_ = add_bcf_dosages(ds, n_variants, mean_, var_, flip_, store_geno);
321 | delete ds;
322 | }else{
323 | int ngt_arr = 0;
324 | int *gt = NULL;
325 | bcf_get_format_int32(hdr, rec, "GT", >, &ngt_arr)/n_samples;
326 | keep_ = add_bcf_genotypes(gt, n_variants, mean_, var_, flip_, store_geno);
327 | delete gt;
328 | }
329 |
330 | if( !keep_ ){
331 | //genotypes.conservativeResize(n_variants, Eigen::NoChange);
332 | std::cerr << "Warning: Failed to add genotypes.\n";
333 | return false;
334 | }
335 |
336 | }
337 |
338 | n_variants++;
339 |
340 | chr.push_back(bcf_hdr_id2name(hdr, rec->rid));
341 | pos.push_back(rec->pos + 1);
342 | //rsid.push_back(rsid_);
343 | ref.push_back(rec->d.allele[0]);
344 | alt.push_back(rec->d.allele[1]);
345 |
346 | flipped.push_back(flip_);
347 | keep.push_back(keep_);
348 |
349 | mean.push_back(mean_);
350 | var.push_back(var_);
351 |
352 | return true;
353 | }
354 |
355 | bool genotype_data::record_matches(bcf1_t*& rec, bcf_hdr_t*& hdr, const int& i){
356 | //cerr << bcf_hdr_id2name(hdr, rec->rid) << "\t" << (rec->pos + 1) << "\t"
357 | if( bcf_hdr_id2name(hdr, rec->rid) == chr[i] ){
358 | if( (rec->pos + 1) == pos[i] ){
359 | if( rec->d.allele[0] == ref[i] ){
360 | if( rec->d.allele[1] == alt[i] ){
361 | return true;
362 | }
363 | }
364 | }
365 | }
366 | return false;
367 | }
368 |
369 | void genotype_data::read_genotypes(bcf_srs_t*& sr, bcf_hdr_t*& hdr, const int& i_s, const int& n_s)
370 | {
371 | clear_genotypes();
372 | resize_genotypes(n_s);
373 |
374 | int i_e = i_s + n_s - 1;
375 | if( chr[i_s] != chr[i_e] ){
376 | std::cerr << "Fatal: regional chr mismatch in genotype_data::read_genotypes. \n";
377 | exit(1);
378 | }
379 | bcf_seek(sr, chr[i_s], pos[i_s]);
380 |
381 | int i_i = i_s;
382 |
383 | geno_start = i_s;
384 | geno_size = 0;
385 |
386 | int r_i = 0;
387 |
388 | while( bcf_sr_next_line(sr) && r_i < n_s )
389 | {
390 | bcf1_t *rec = bcf_sr_get_line(sr,0);
391 |
392 | if( record_matches(rec, hdr, i_i) ){
393 |
394 | if( keep[i_i] ){
395 |
396 | bool is_flipped = false;
397 |
398 |
399 | if( global_opts::use_dosages ){
400 | int nds_arr = 0;
401 | float *ds = NULL;
402 | bcf_get_format_float(hdr, rec, "DS", &ds, &nds_arr)/n_samples;
403 | keep[i_i] = add_bcf_dosages(ds, r_i, mean[i_i], var[i_i], is_flipped, true);
404 | delete ds;
405 | }else{
406 | int ngt_arr = 0;
407 | int *gt = NULL;
408 | int n_gts = bcf_get_format_int32(hdr, rec, "GT", >, &ngt_arr)/n_samples;
409 | keep[i_i] = add_bcf_genotypes(gt, r_i, mean[i_i], var[i_i], is_flipped, true);
410 | delete gt;
411 | }
412 |
413 | flipped[i_i] = is_flipped;
414 |
415 |
416 | }
417 |
418 | r_i++;
419 | i_i++;
420 | geno_size++;
421 | }
422 | }
423 | if( i_i < i_e ){
424 | std::cerr << "Fatal: Failed to fill region in genotype_data::read_genotypes. \n";
425 | std::cerr << i_s << ", " << i_i << ", " << i_e << "\n";
426 | exit(1);
427 | }
428 | freeze_genotypes();
429 | }
430 |
--------------------------------------------------------------------------------
/src/genotypeData.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (C) 2020
3 | Author: Corbin Quick
4 |
5 | This file is a part of APEX.
6 |
7 | APEX is distributed "AS IS" in the hope that it will be
8 | useful, but WITHOUT ANY WARRANTY; without even the implied
9 | warranty of MERCHANTABILITY, NON-INFRINGEMENT, or FITNESS
10 | FOR A PARTICULAR PURPOSE.
11 |
12 | The above copyright notice and disclaimer of warranty must
13 | be included in all copies or substantial portions of APEX.
14 | */
15 |
16 |
17 | #ifndef GENOTYPEDATA_HPP
18 | #define GENOTYPEDATA_HPP
19 |
20 | #include
21 | #include
22 |
23 | #include "setOptions.hpp"
24 | #include "mapID.hpp"
25 | #include "htsWrappers.hpp"
26 | #include "dataParser.hpp"
27 | #include "eigenData.hpp"
28 |
29 | #include
30 | #include
31 |
32 | class sparse_gt
33 | {
34 | private:
35 | std::vector> entries;
36 | double sum_gt;
37 | int n_samples;
38 | int n_missing;
39 | int last;
40 | public:
41 | sparse_gt(const int& ns) : sum_gt(0.00), n_samples(ns), n_missing(0), last(-1) {};
42 |
43 | sparse_gt() : sum_gt(0.00), n_samples(0), n_missing(0), last(-1) {};
44 |
45 | void resize(const int& ns){n_samples = ns;};
46 |
47 | int size(){
48 | return entries.size();
49 | };
50 | void set_gt(const int& n, const int& gt){
51 | if( n > last ){
52 | if( gt < 0 ){
53 | entries.push_back(std::make_pair(n,-1));
54 | n_missing++;
55 | }else if( gt > 0 ){
56 | entries.push_back(std::make_pair(n,gt));
57 | sum_gt += gt;
58 | }
59 | last = n;
60 | }else{
61 | std::cerr << "Fatal: Tried to add gt out of order!\n";
62 | exit(2);
63 | }
64 | };
65 | void flip(const int& n){
66 | int j = 0, i = 0;
67 | std::vector> flipped;
68 | while(i < n){
69 | if( j < entries.size() ){
70 | while( i < entries[j].first && i < n ){
71 | flipped.push_back( std::make_pair(i, 2) );
72 | i++;
73 | }
74 | if( i == entries[j].first ){
75 | if( entries[j].second < 0 ){
76 | flipped.push_back(entries[j]);
77 | }else if( entries[j].second == 1 ){
78 | flipped.push_back(entries[j]);
79 | }else if(entries[j].second != 2){
80 | std::cerr <<"\n\n" << entries[j].second << "\n\n";
81 | std::cerr << "entries[j].second != 2\n";
82 | exit(1);
83 | }
84 | i++;
85 | j++;
86 | }else if( j > i ){
87 | std::cerr << "This should never happen!\n";
88 | exit(2);
89 | }
90 | }else{
91 | flipped.push_back( std::make_pair(i, 2) );
92 | i++;
93 | }
94 | }
95 | sum_gt = 2.00 * ( (double) (n_samples - n_missing) ) - sum_gt;
96 | entries = flipped;
97 | };
98 | void add_gt_sparsemat(Eigen::SparseMatrix& smat, const int& col_n){
99 | smat.startVec(col_n);
100 | double missing_val = sum_gt/( (double) (n_samples - n_missing) );
101 | for( const auto& x : entries){
102 | if( x.second > 0 ){
103 | smat.insertBack(x.first, col_n) = x.second;
104 | }else{
105 | smat.insertBack(x.first, col_n) = missing_val;
106 | }
107 | }
108 | };
109 | };
110 |
111 |
112 | class sparse_ds
113 | {
114 | public:
115 | std::vector dosages;
116 | int last;
117 | double sum_dos;
118 | int n_missing;
119 | int n_samples;
120 |
121 | sparse_ds(const int& N) : sum_dos(0.00), n_missing(0), n_samples(N), last(-1) {dosages.resize(N);};
122 |
123 | sparse_ds() : sum_dos(0.00), n_missing(0), n_samples(0), last(-1) {};
124 |
125 | void resize(const int& N){n_samples = N; dosages.resize(N); };
126 |
127 |
128 | void set_ds(const int& n, const float& ds){
129 | if( n > last ){
130 | if( ds < 0 ){
131 | n_missing++;
132 | }else{
133 | sum_dos += ds;
134 | }
135 | dosages[n] = ds;
136 | last = n;
137 | }else{
138 | std::cerr << "Fatal: Tried to add ds out of order!\n";
139 | exit(2);
140 | }
141 | };
142 | void flip(){
143 | for( float& ds: dosages ){
144 | if(ds >= 0){
145 | ds = 2.00 - ds;
146 | }
147 | }
148 | sum_dos = 2.00 * ( (double) (n_samples - n_missing) ) - sum_dos;
149 | };
150 | void add_ds_sparsemat(Eigen::SparseMatrix& smat, const int& col_n){
151 | double missing_val = sum_dos/( (double) (n_samples - n_missing) );
152 | smat.startVec(col_n);
153 | for(int i = 0; i < dosages.size(); i++){
154 | if( dosages[i] < 0.00 ){
155 | if( missing_val > global_opts::dosage_thresh ){
156 | smat.insertBack(i, col_n) = missing_val;
157 | }
158 | }else if( dosages[i] > global_opts::dosage_thresh ){
159 | smat.insertBack(i, col_n) = (double) dosages[i];
160 | }
161 | }
162 | };
163 | };
164 |
165 |
166 | class genotype_data
167 | {
168 | public:
169 | genotype_data(): chunk_size(1000), nz_frac(0.1), n_variants(0), n_samples(0) {};
170 |
171 | int chunk_size;
172 | double nz_frac;
173 |
174 | int n_samples;
175 | int n_variants;
176 |
177 | id_map ids;
178 |
179 | std::vector chr;
180 | std::vector pos;
181 | //std::vector rsid;
182 | std::vector ref;
183 | std::vector alt;
184 |
185 | std::vector> ld_index_range;
186 |
187 | std::vector mean;
188 | std::vector var;
189 |
190 | std::vector flipped;
191 | std::vector keep;
192 |
193 | int geno_start;
194 | int geno_size;
195 |
196 | Eigen::SparseMatrix genotypes;
197 | Eigen::MatrixXd dense_genotypes;
198 |
199 | void read_bcf_variants(bcf_srs_t*&, bcf_hdr_t*&, int&, bool store_geno = true, bool scan_geno = true);
200 |
201 | int get_ld_index();
202 | inline bool process_bcf_variant(bcf1_t*&, bcf_hdr_t*&, bool store_geno = true, bool scan_geno = true);
203 | void read_bcf_header(bcf_hdr_t*);
204 | void freeze_genotypes();
205 | void resize_genotypes(const int& nv){
206 | genotypes.resize(ids.keep.size(), nv);
207 | genotypes.reserve( (int) (nz_frac * genotypes.rows() * genotypes.cols() ) );
208 | };
209 | void initialize_genotypes(const int& nv){
210 | resize_genotypes(nv);
211 | };
212 | void print_genotypes(){
213 | std::cout << genotypes << "\n";
214 | };
215 |
216 | void clear();
217 | void clear_genotypes();
218 | void read_genotypes(bcf_srs_t*&, bcf_hdr_t*&, const int&, const int&);
219 |
220 | bool record_matches(bcf1_t*& rec, bcf_hdr_t*& hdr, const int& i);
221 |
222 | private:
223 | inline bool add_bcf_genotypes(int*&, const int&, double&, double&, bool&, const bool);
224 | inline bool add_bcf_dosages(float*& ds_rec, const int& col_n, double& mean_, double& var_, bool& flip_, const bool store_geno);
225 |
226 | void checkResize(const int& col_n){
227 | if( col_n >= genotypes.cols() ){
228 | genotypes.conservativeResize(ids.keep.size(), col_n + chunk_size);
229 | }
230 | };
231 |
232 | };
233 |
234 |
235 | #endif
236 |
237 |
--------------------------------------------------------------------------------
/src/htsWrappers.cpp:
--------------------------------------------------------------------------------
1 | /* htsWrappers:
2 |
3 | Copyright (C) 2020
4 | Author: Corbin Quick
5 |
6 | This file is a part of APEX.
7 |
8 | APEX is distributed "AS IS" in the hope that it will be
9 | useful, but WITHOUT ANY WARRANTY; without even the implied
10 | warranty of MERCHANTABILITY, NON-INFRINGEMENT, or FITNESS
11 | FOR A PARTICULAR PURPOSE.
12 |
13 | The above copyright notice and disclaimer of warranty must
14 | be included in all copies or substantial portions of APEX.
15 | */
16 |
17 | #include "htsWrappers.hpp"
18 |
19 |
20 | void indexed_hts_file::open(const std::string& prefix, const std::vector& reg)
21 | {
22 | regions = reg;
23 |
24 | htsf = hts_open(prefix.c_str(), "r");
25 | tbx = tbx_index_load(prefix.c_str());
26 |
27 | itr = NULL;
28 |
29 | c_reg = 0;
30 | n_reg = regions.size();
31 |
32 | if( n_reg == 0 ){
33 | const char** chroms = tbx_seqnames(tbx, &n_reg);
34 | for( int i = 0; i < n_reg; ++i ){
35 | regions.push_back(chroms[i]);
36 | }
37 | free(chroms);
38 | }
39 | }
40 |
41 | int indexed_hts_file::next_line(kstring_t& str)
42 | {
43 | if( !itr ){
44 | itr = tbx_itr_querys(tbx, regions[c_reg].c_str() );
45 | }
46 | if( tbx_itr_next(htsf, tbx, itr, &str) < 0 ){
47 | if( c_reg + 1 < n_reg ){
48 | c_reg++;
49 | itr = tbx_itr_querys(tbx, regions[c_reg].c_str() );
50 | return tbx_itr_next(htsf, tbx, itr, &str);
51 | }else{
52 | return -1;
53 | }
54 | }
55 | return 1;
56 | }
57 |
58 | void bcf_seek(bcf_srs_t* sr, const std::string& chr, const int& start, const int& end){
59 |
60 | std::string region = chr;
61 |
62 | if( start > 0 ){
63 | region += ":" + std::to_string(start);
64 | }
65 | if( end > 0 ){
66 | region += "-" + std::to_string(end);
67 | }
68 |
69 | bcf_sr_regions_t *reg = bcf_sr_regions_init(region.c_str(),0,0,1,-2);
70 |
71 | bcf_sr_seek(sr, reg->seq_names[0], reg->start);
72 |
73 | return;
74 | }
75 |
76 | void bgzip_file(std::string file_name, int index){
77 |
78 | // index == 0 => Do not build any index.
79 | // index == 1 => Build gzi byte index.
80 | // index == 2 => Build csi position-based index.
81 |
82 | BGZF *fp;
83 | void *buffer;
84 |
85 | int f_src = open(file_name.c_str(), O_RDONLY);
86 |
87 | std::string file_name_gz = file_name + ".gz";
88 | fp = bgzf_open(file_name_gz.c_str(), "w\0");
89 |
90 | if( index == 1 ) bgzf_index_build_init(fp);
91 |
92 | buffer = malloc(BUFFER_SIZE);
93 |
94 | int c;
95 |
96 | while ((c = read(f_src, buffer, BUFFER_SIZE)) > 0){
97 | if (bgzf_write(fp, buffer, c) < 0){
98 | std::cerr << "Fatal error: Couldn't write to " << file_name << ".gz\n";
99 | exit(1);
100 | }
101 | }
102 |
103 | if( index == 1 ){
104 | if (bgzf_index_dump(fp, file_name.c_str(), ".gz.gzi") < 0){
105 | std::cerr << "Fatal error: Couldn't create index " << file_name << ".gz.gzi\n";
106 | exit(1);
107 | }
108 | }
109 |
110 | if (bgzf_close(fp) < 0){
111 | std::cerr << "Fatal error: Couldn't close " << file_name << ".gz\n";
112 | exit(1);
113 | }
114 |
115 | if( index == 2 ){
116 | if ( tbx_index_build(file_name_gz.c_str(), 14, &tbx_conf_vcf)!=0 )
117 | {
118 | std::cerr << "Fatal error: Couldn't create index " << file_name << ".gz.csi\n";
119 | exit(1);
120 | }
121 | }
122 |
123 | unlink(file_name.c_str());
124 | free(buffer);
125 | close(f_src);
126 | }
127 |
128 | std::vector get_chroms(std::string file_name, std::vector& n_variants){
129 |
130 | // get a list of chromosomes in the file
131 | // this code is directly adapted from bcftools:
132 | // https://github.com/samtools/bcftools/blob/develop/vcfindex.c
133 |
134 | n_variants.clear();
135 | std::vector chroms;
136 |
137 | const char **seq;
138 | int nseq, stats;
139 | tbx_t *tbx = NULL;
140 | hts_idx_t *idx = NULL;
141 |
142 | htsFile *fp = hts_open(file_name.c_str(),"r");
143 | if ( !fp ) { fprintf(stderr,"Could not read %s\n", file_name.c_str()); abort(); }
144 | bcf_hdr_t *hdr = NULL;
145 |
146 | if ( hts_get_format(fp)->format==bcf )
147 | {
148 | hdr = bcf_hdr_read(fp);
149 | if ( !hdr ) { fprintf(stderr,"Could not read the header: %s\n", file_name.c_str()); abort(); }
150 | idx = bcf_index_load(file_name.c_str());
151 | if ( !idx ) { fprintf(stderr,"Could not load index for %s\n", file_name.c_str()); abort(); }
152 | }else{
153 | if( hts_get_format(fp)->format==vcf ){
154 | hdr = bcf_hdr_read(fp);
155 | if ( !hdr ) { fprintf(stderr,"Could not read the header: %s\n", file_name.c_str()); abort(); }
156 | }
157 | tbx = tbx_index_load(file_name.c_str());
158 | if ( !tbx ) { fprintf(stderr,"Could not load index for %s\n", file_name.c_str()); abort(); }
159 | }
160 |
161 | seq = tbx ? tbx_seqnames(tbx, &nseq) : bcf_index_seqnames(idx, hdr, &nseq);
162 | for (int i=0; iidx : idx, i, &records, &v);
166 | if (stats&2 || !records) continue;
167 | chroms.push_back(std::string(seq[i]));
168 | n_variants.push_back( (int) records );
169 | }
170 | free(seq);
171 |
172 | hts_close(fp);
173 | bcf_hdr_destroy(hdr);
174 | if (tbx) tbx_destroy(tbx);
175 | if (idx) hts_idx_destroy(idx);
176 |
177 | return chroms;
178 | }
179 |
--------------------------------------------------------------------------------
/src/htsWrappers.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (C) 2020
3 | Author: Corbin Quick
4 |
5 | This file is a part of APEX.
6 |
7 | APEX is distributed "AS IS" in the hope that it will be
8 | useful, but WITHOUT ANY WARRANTY; without even the implied
9 | warranty of MERCHANTABILITY, NON-INFRINGEMENT, or FITNESS
10 | FOR A PARTICULAR PURPOSE.
11 |
12 | The above copyright notice and disclaimer of warranty must
13 | be included in all copies or substantial portions of APEX.
14 | */
15 |
16 |
17 | /*
18 | Generic wrappers for HTSLIB BCF/VCF file parsing, as well as for BGZF reading/writing.
19 | */
20 |
21 | #ifndef HTSWRAPPERS_HPP
22 | #define HTSWRAPPERS_HPP
23 |
24 | #include
25 | #include
26 |
27 | #include
28 | #include
29 | #include
30 | #include
31 |
32 | #include "htslib/tbx.h"
33 | #include "htslib/hts.h"
34 | #include "htslib/bgzf.h"
35 | #include "htslib/kseq.h"
36 | #include "htslib/kstring.h"
37 | #include "htslib/synced_bcf_reader.h"
38 |
39 | #include
40 |
41 | #include "setOptions.hpp"
42 | #include "miscUtils.hpp"
43 |
44 |
45 | static const int BUFFER_SIZE = 64 * 1024;
46 |
47 | static int MINUS99 = -99;
48 |
49 | static const std::string null_string = "";
50 |
51 | static Eigen::MatrixXd NULL_MATRIX = Eigen::MatrixXd::Zero(0,0);
52 |
53 | static std::vector null_vector_int = std::vector(0);
54 |
55 | class basic_hts_file
56 | {
57 | public:
58 | void open(const std::string& prefix){htsf = hts_open(prefix.c_str(), "r");};
59 | int next_line(kstring_t& str){ return hts_getline(htsf, KS_SEP_LINE, &str); };
60 |
61 | basic_hts_file(){};
62 | basic_hts_file(const std::string& prefix){ open(prefix); };
63 |
64 | void close(){ hts_close(htsf);};
65 |
66 | private:
67 | htsFile *htsf;
68 | };
69 |
70 | class indexed_hts_file
71 | {
72 | public:
73 |
74 | void open(const std::string& prefix, const std::vector& reg);
75 | void open(const std::string& prefix, const std::string& reg = null_string){
76 | std::vector tmp_reg;
77 | if( reg != "" ) tmp_reg.push_back(reg);
78 | open(prefix, tmp_reg);
79 | };
80 |
81 | int next_line(kstring_t& str);
82 |
83 | indexed_hts_file(){};
84 | indexed_hts_file(const std::string& prefix, const std::string& reg = null_string){
85 | open(prefix, reg);
86 | };
87 | indexed_hts_file(const std::string& prefix, const std::vector& reg){
88 | open(prefix, reg);
89 | };
90 |
91 | void close(){
92 | if (itr) hts_itr_destroy(itr);
93 | hts_close(htsf);
94 | };
95 |
96 | private:
97 | int n_reg;
98 | int c_reg;
99 | std::vector regions;
100 |
101 | htsFile *htsf;
102 | tbx_t *tbx;
103 | hts_itr_t *itr;
104 | };
105 |
106 | void bcf_seek(bcf_srs_t*, const std::string&, const int& start = MINUS99, const int& end = MINUS99);
107 |
108 | void bgzip_file(std::string, int);
109 |
110 | inline void write_to_bgzf(const std::string& inp, BGZF *fp){
111 | if( bgzf_write(fp, (const void*) inp.c_str(), inp.length()) < 0 ){
112 | std::cerr << "Error: Could not write to output file\n";
113 | exit(1);
114 | }
115 | };
116 |
117 | inline void write_to_bgzf(const char* inp, BGZF *fp){
118 | if( bgzf_write(fp, (const void*) inp, strlen(inp)) < 0 ){
119 | std::cerr << "Error: Could not write to output file\n";
120 | exit(1);
121 | }
122 | };
123 |
124 | inline void build_tabix_index(std::string file_name, int type = 0){
125 | if(tbx_index_build(file_name.c_str(), 14, type==0? &tbx_conf_vcf : &tbx_conf_bed )!=0)
126 | {
127 | std::cerr << "Fatal error: Couldn't create index for " << file_name << "\n";
128 | exit(1);
129 | }
130 | };
131 |
132 | inline void operator<< (BGZF& fp, const char*& inp)
133 | {
134 | BGZF* fp_pt = &fp;
135 | if( bgzf_write(fp_pt, (const void*) inp, strlen(inp)) < 0 ){
136 | std::cerr << "Error: Could not write to output file\n";
137 | exit(1);
138 | }
139 | return;
140 | }
141 |
142 | inline void operator<< (BGZF& fp, const std::string& inp)
143 | {
144 | BGZF* fp_pt = &fp;
145 | if( bgzf_write(fp_pt, (const void*) inp.c_str(), inp.length()) < 0 ){
146 | std::cerr << "Error: Could not write to output file\n";
147 | exit(1);
148 | }
149 | return;
150 | }
151 |
152 |
153 | std::vector get_chroms(std::string file_name, std::vector& n_variants = null_vector_int);
154 |
155 | #endif
156 |
157 |
--------------------------------------------------------------------------------
/src/mapID.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (C) 2020
3 | Author: Corbin Quick
4 |
5 | This file is a part of APEX.
6 |
7 | APEX is distributed "AS IS" in the hope that it will be
8 | useful, but WITHOUT ANY WARRANTY; without even the implied
9 | warranty of MERCHANTABILITY, NON-INFRINGEMENT, or FITNESS
10 | FOR A PARTICULAR PURPOSE.
11 |
12 | The above copyright notice and disclaimer of warranty must
13 | be included in all copies or substantial portions of APEX.
14 | */
15 |
16 |
17 | /*
18 | mapID source files and the 'id_map' class are for subsetting, merging,
19 | and mapping row and column IDs across multiple files (expression,
20 | covariate, and genotype). These classes help reduce redundancy for
21 | common tasks across file types.
22 | */
23 |
24 | #include "mapID.hpp"
25 |
26 |
// Return the IDs present in both lists, in sorted order.
// Both arguments are taken by value because they are sorted in place
// before the intersection is computed.
std::vector<std::string> intersect_ids(std::vector<std::string> a, std::vector<std::string> b)
{
    std::sort(a.begin(), a.end());
    std::sort(b.begin(), b.end());

    std::vector<std::string> shared;
    std::set_intersection(
        a.begin(), a.end(),
        b.begin(), b.end(),
        std::back_inserter(shared)
    );
    return shared;
}
35 |
36 | int id_map::n()
37 | {
38 | return keep.size();
39 | }
40 |
41 | void id_map::setFileIDs(std::vector& ids)
42 | {
43 | file = ids;
44 |
45 | if( keep.size() > 0 ){
46 | makeIndex();
47 | }
48 | }
49 |
50 | bool id_map::tryKeep(std::string& id)
51 | {
52 | bool should_keep = true;
53 | if( keep.size() > 0 ){
54 | if( keep_set.size() == 0 ){
55 | copy(keep.begin(),keep.end(),inserter(keep_set,keep_set.end()));
56 | }
57 | should_keep = keep_set.find(id) != keep_set.end();
58 | }
59 | if( should_keep ){
60 | file.push_back(id);
61 | }
62 | return should_keep;
63 | }
64 |
65 | void id_map::makeIndex()
66 | {
67 | idx.clear();
68 | idx_f2k.clear();
69 |
70 | std::unordered_map file_id_map;
71 | std::unordered_map keep_id_map;
72 |
73 | int ii = 0;
74 | for( std::string& id : file )
75 | {
76 | file_id_map[id] = ii;
77 | ii++;
78 | }
79 | ii = 0;
80 | std::vector not_in_file;
81 | for( std::string& id : keep )
82 | {
83 | if( file_id_map.find(id) == file_id_map.end() ){
84 | not_in_file.push_back(ii);
85 | }else{
86 | idx.push_back(file_id_map[id]);
87 | }
88 | ii++;
89 | }
90 | for(auto j = not_in_file.rbegin(); j != not_in_file.rend(); ++j)
91 | {
92 | keep.erase(keep.begin() + *j);
93 | }
94 |
95 |
96 | ii = 0;
97 | for( std::string& id : keep )
98 | {
99 | keep_id_map[id] = ii;
100 | ii++;
101 | }
102 | for( std::string& id : file )
103 | {
104 | if( keep_id_map.find(id) == keep_id_map.end() ){
105 | idx_f2k.push_back(-1);
106 | }else{
107 | idx_f2k.push_back(keep_id_map[id]);
108 | }
109 | }
110 | }
111 |
112 | void id_map::setKeepIDs(std::vector& ids)
113 | {
114 | keep = ids;
115 |
116 | // map the ids we want to keep to the ids in the file
117 | if( file.size() > 0 )
118 | {
119 | makeIndex();
120 | }else
121 | {
122 | // if we set 'keep' without setting file ids, we will make
123 | // a hash to check whether each file id should be kept.
124 | copy(keep.begin(),keep.end(),inserter(keep_set,keep_set.end()));
125 | }
126 | }
127 |
128 |
--------------------------------------------------------------------------------
/src/mapID.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (C) 2020
3 | Author: Corbin Quick
4 |
5 | This file is a part of APEX.
6 |
7 | APEX is distributed "AS IS" in the hope that it will be
8 | useful, but WITHOUT ANY WARRANTY; without even the implied
9 | warranty of MERCHANTABILITY, NON-INFRINGEMENT, or FITNESS
10 | FOR A PARTICULAR PURPOSE.
11 |
12 | The above copyright notice and disclaimer of warranty must
13 | be included in all copies or substantial portions of APEX.
14 | */
15 |
16 | /*
17 | mapID source files and the 'id_map' class are for subsetting, merging,
18 | and mapping row and column IDs in across multiple files (expression,
19 | covariate, and genotype). These classes help reduce redundancy for
20 | common tasks across file types.
21 | */
22 |
23 | #ifndef MAPID_HPP
24 | #define MAPID_HPP
25 |
26 | #include
27 | #include
28 | #include
29 |
30 | #include "miscUtils.hpp"
31 | #include "setOptions.hpp"
32 |
33 |
34 | std::vector intersect_ids(std::vector, std::vector);
35 |
// Bookkeeping for the IDs found in a file versus the IDs to retain.
class id_map
{
    public:
        // Hash set of the keep IDs, used for membership tests.
        std::unordered_set<std::string> keep_set;

        // IDs as they appear in the file.
        std::vector<std::string> file;
        // IDs to retain.
        std::vector<std::string> keep;
        // For each keep ID, its position in `file`.
        std::vector<int> idx;
        // For each file ID, its position in `keep`, or -1 if not kept.
        std::vector<int> idx_f2k;

        // Number of keep IDs.
        int n();

        void setFileIDs(std::vector<std::string>&);
        void setKeepIDs(std::vector<std::string>&);
        bool tryKeep(std::string&);
        void makeIndex();
};
53 |
54 | #endif
55 |
56 |
57 |
58 |
59 |
--------------------------------------------------------------------------------
/src/mathStats.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (C) 2020
3 | Author: Corbin Quick
4 |
5 | This file is a part of APEX.
6 |
7 | APEX is distributed "AS IS" in the hope that it will be
8 | useful, but WITHOUT ANY WARRANTY; without even the implied
9 | warranty of MERCHANTABILITY, NON-INFRINGEMENT, or FITNESS
10 | FOR A PARTICULAR PURPOSE.
11 |
12 | The above copyright notice and disclaimer of warranty must
13 | be included in all copies or substantial portions of APEX.
14 | */
15 |
16 |
17 | #include "mathStats.hpp"
18 |
19 |
20 | double p_bd = 1e-300;
21 | double q_bd = 3e+299;
22 |
23 | double qnorm(double p, bool lower){
24 | boost::math::normal N01(0.0, 1.0);
25 | if( lower ) return boost::math::quantile(boost::math::complement(N01, p));
26 | return boost::math::quantile(N01, p);
27 | }
28 |
29 | double pnorm(double x, bool lower){
30 | boost::math::normal N01(0.0, 1.0);
31 | if( lower ) return boost::math::cdf(boost::math::complement(N01, x));
32 | return boost::math::cdf(N01, x);
33 | }
34 |
35 | double qcauchy(double p, bool lower){
36 | p = p > p_bd ? p : p_bd;
37 | p = p < 1 - p_bd ? p : 1 - p_bd;
38 |
39 | boost::math::cauchy C01(0.0, 1.0);
40 | if( lower ) return boost::math::quantile(boost::math::complement(C01, p));
41 | return boost::math::quantile(C01, p);
42 | }
43 |
44 | double pcauchy(double x, bool lower){
45 | x = x < q_bd ? x : q_bd;
46 | x = x > -q_bd ? x : -q_bd;
47 |
48 | boost::math::cauchy C01(0.0, 1.0);
49 | if( lower ) return boost::math::cdf(boost::math::complement(C01, x));
50 | return boost::math::cdf(C01, x);
51 | }
52 |
53 | double qt(double p, double df, bool lower){
54 | boost::math::students_t TDIST(df);
55 | if( lower ) return boost::math::quantile(boost::math::complement(TDIST, p));
56 | return boost::math::quantile(TDIST, p);
57 | }
58 |
59 | double pt(double x, double df, bool lower){
60 | boost::math::students_t TDIST(df);
61 | if( lower ) return boost::math::cdf(boost::math::complement(TDIST, x));
62 | return boost::math::cdf(TDIST, x);
63 | }
64 |
65 | double qf(double p, double df1, double df2, bool lower){
66 | boost::math::fisher_f FDIST(df1, df2);
67 | if( lower ) return boost::math::quantile(boost::math::complement(FDIST, p));
68 | return boost::math::cdf(FDIST, p);
69 | }
70 |
71 | double pf(double x, double df1, double df2, bool lower){
72 | boost::math::fisher_f FDIST(df1, df2);
73 | if( lower ) return boost::math::cdf(boost::math::complement(FDIST, x));
74 | return boost::math::cdf(FDIST, x);
75 | }
76 |
77 | double qchisq(double p, double df, bool lower){
78 | boost::math::chi_squared CHISQ(df);
79 | if( lower ) return boost::math::quantile(boost::math::complement(CHISQ, p));
80 | return boost::math::quantile(CHISQ, p);
81 | }
82 |
83 | double pchisq(double x, double df, bool lower){
84 | boost::math::chi_squared CHISQ(df);
85 | if( lower ) return boost::math::cdf(boost::math::complement(CHISQ, x));
86 | return boost::math::cdf(CHISQ, x);
87 | }
88 |
89 | double ACAT(const std::vector& pvals){
90 | long double sum_c = 0.0;
91 | double n = pvals.size();
92 | for( const double& p: pvals ){
93 | if( p >= 1 ){
94 | sum_c += (qcauchy(1 - 1/n, true)/n);
95 | }else if( p <= 0 ){
96 | std::cerr << "ACAT failed; input pval <= 0. \n";
97 | exit(1);
98 | }else{
99 | sum_c += (qcauchy(p, true)/n);
100 | }
101 | }
102 | return pcauchy(sum_c, true);
103 | }
104 |
105 | double ACAT(const std::vector& pvals,const std::vector& weights){
106 | long double sum_c = 0.0;
107 | long double denom = 0.0;
108 | double n = pvals.size();
109 | int i = 0;
110 | for( const double& w: weights ){
111 | denom += weights[i];
112 | }
113 | if( denom <= 0){
114 | return -99;
115 | }
116 | for( const double& p: pvals ){
117 | if( p >= 1 ){
118 | sum_c += (weights[i] * qcauchy(1 - 1/n, true) / denom);
119 | }else if( p <= 0 ){
120 | std::cerr << "ACAT failed; input pval <= 0. \n";
121 | exit(1);
122 | }else{
123 | sum_c += (weights[i] * qcauchy(p, true) / denom);
124 | }
125 | i++;
126 | }
127 | return pcauchy(sum_c, true);
128 | }
129 |
// Return, in their original order, the elements of p that are strictly
// greater than thresh. Elements that are <= thresh, and NaNs, are dropped.
std::vector<double> filter_lt( const std::vector<double>& p, double thresh){
    std::vector<double> kept;
    for( std::vector<double>::const_iterator it = p.begin(); it != p.end(); ++it ){
        if( !(*it > thresh) ){
            continue;
        }
        kept.push_back(*it);
    }
    return kept;
}
137 |
138 | double ACAT_non_missing( const std::vector& pvals, const std::vector& dist ){
139 | if( dist.size() == 0 || global_opts::exp_weight_val <= 0 ){
140 | long double sum_c = 0.0;
141 | double n = 0;
142 | double n_p1 = 0;
143 | double max_p = 0;
144 | for( const double& p: pvals ){
145 | if( !std::isnan(p) ){
146 | if( p >= 1 ){
147 | n_p1 += 1;
148 | n += 1;
149 | }else if( p > 0 ){
150 | sum_c += qcauchy(p, true);
151 | n += 1;
152 | if( p > max_p ){
153 | max_p = p;
154 | }
155 | }
156 | }
157 | }
158 | if( n_p1 > 0 ){
159 | max_p = 0.5 + 0.5 * max_p;
160 | sum_c += n_p1 * qcauchy(max_p, true);
161 | }else if(n < 1){
162 | n = 1;
163 | }
164 | return pcauchy(sum_c/n, true);
165 | }else{
166 | long double sum_c = 0.0;
167 | double denom = 0;
168 | double n_p1 = 0;
169 | double w_p1 = 0;
170 | double max_p = 0;
171 | for( int i = 0; i < pvals.size(); i++ ){
172 | const double& p = pvals[i];
173 | double ww = std::exp(-std::abs(global_opts::exp_weight_val*dist[i]));
174 | if( !std::isnan(p) ){
175 | if( p >= 1 ){
176 | denom += ww;
177 | }else if( p > 0 ){
178 | sum_c += ww*qcauchy(p, true);
179 | denom += ww;
180 | if( max_p < p ){
181 | max_p = p;
182 | }
183 | }
184 | }
185 | }
186 | if( w_p1 > 0 ){
187 | max_p = 0.5 + 0.5 * max_p;
188 | sum_c += w_p1 * qcauchy(max_p, true);
189 | }else if( denom <= 0 ){
190 | denom = 1;
191 | }
192 | return pcauchy(sum_c/denom, true);
193 | }
194 | }
195 |
196 |
// Compute 1-based ranks of the values in v; tied values all receive the
// average of the ranks they span.
//
// This code is adapted from stackoverflow:
// https://stackoverflow.com/questions/30822729/create-ranking-for-vector-of-double
std::vector<double> rank_vector(const std::vector<double>& v)
{
    const size_t total = v.size();

    // order[k] is the index in v of the k-th smallest value.
    std::vector<size_t> order(total);
    std::iota(order.begin(), order.end(), 0);
    std::sort(order.begin(), order.end(),
        [&v](size_t lhs, size_t rhs) { return v[lhs] < v[rhs]; });

    std::vector<double> ranks(total);
    size_t first = 0;
    while( first < total )
    {
        // [first, last) is a run of equal values in sorted order.
        size_t last = first + 1;
        while( last < total && v[order[first]] == v[order[last]] ) ++last;

        const size_t ties = last - first;
        const double shared_rank = first + (ties + 1) / 2.0; // average rank of the tied values
        for( size_t k = first; k < last; ++k )
        {
            ranks[order[k]] = shared_rank;
        }
        first = last;
    }
    return ranks;
}
222 |
--------------------------------------------------------------------------------
/src/mathStats.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (C) 2020
3 | Author: Corbin Quick
4 |
5 | This file is a part of APEX.
6 |
7 | APEX is distributed "AS IS" in the hope that it will be
8 | useful, but WITHOUT ANY WARRANTY; without even the implied
9 | warranty of MERCHANTABILITY, NON-INFRINGEMENT, or FITNESS
10 | FOR A PARTICULAR PURPOSE.
11 |
12 | The above copyright notice and disclaimer of warranty must
13 | be included in all copies or substantial portions of APEX.
14 | */
15 |
16 |
17 | #ifndef MATHSTATS_HPP
18 | #define MATHSTATS_HPP
19 |
20 | #include
21 | #include
22 |
23 | #include "setOptions.hpp"
24 |
25 | #include
26 | #include
27 | #include
28 | #include
29 | #include
30 |
31 | #include
32 | #include
33 | #include
34 | #include
35 |
36 | #include
37 |
38 | // ------------------------------------
39 | // Eigen matrix printing formats
40 | // ------------------------------------
41 | const static Eigen::IOFormat EigenCSV(Eigen::StreamPrecision, Eigen::DontAlignCols, ",", "\n");
42 | const static Eigen::IOFormat EigenTSV(Eigen::StreamPrecision, Eigen::DontAlignCols, "\t", "\n");
43 |
44 | // ------------------------------------
45 | // R-like pdf and cdf (TODO: switch to libRmath)
46 | // ------------------------------------
47 | double qnorm(double, bool lower = false);
48 | double pnorm(double, bool lower = false);
49 | double qt(double, double, bool lower = false);
50 | double pt(double, double, bool lower = false);
51 | double qf(double, double, double, bool lower = false);
52 | double pf(double, double, double, bool lower = false);
53 | double qchisq(double, double, bool lower = false);
54 | double pchisq(double, double, bool lower = false);
55 | double qcauchy(double, bool lower = false);
56 | double pcauchy(double, bool lower = false);
57 |
58 | double ACAT(const std::vector&);
59 | double ACAT(const std::vector&,const std::vector&);
60 |
61 | static const std::vector v0(0);
62 |
63 | std::vector filter_lt( const std::vector&, double);
64 | double ACAT_non_missing( const std::vector& pvals, const std::vector& dist = v0);
65 |
66 | std::vector rank_vector(const std::vector&);
67 |
68 | #endif
69 |
70 |
--------------------------------------------------------------------------------
/src/metaAnalysis.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (C) 2020
3 | Authors: Corbin Quick
4 | Li Guan
5 |
6 | This file is a part of APEX.
7 |
8 | APEX is distributed "AS IS" in the hope that it will be
9 | useful, but WITHOUT ANY WARRANTY; without even the implied
10 | warranty of MERCHANTABILITY, NON-INFRINGEMENT, or FITNESS
11 | FOR A PARTICULAR PURPOSE.
12 |
13 | The above copyright notice and disclaimer of warranty must
14 | be included in all copies or substantial portions of APEX.
15 | */
16 |
17 |
18 | #ifndef METAANALYSIS_HPP
19 | #define METAANALYSIS_HPP
20 |
21 | #include
22 | #include
23 |
24 | #include "setOptions.hpp"
25 | #include "processVCOV.hpp"
26 | #include "miscUtils.hpp"
27 | #include "fitModels.hpp"
28 | #include "dataParser.hpp"
29 |
30 | #include
31 | #include
32 |
33 |
34 | class cis_sumstat_data
35 | {
36 | public:
37 | std::string file_prefix;
38 | std::string region;
39 |
40 | lindex ln;
41 |
42 | std::vector chr;
43 | std::vector start;
44 | std::vector end;
45 | std::vector gene_id;
46 |
47 | std::vector | |