├── Data └── TestData │ ├── 1000G │ ├── All.chr22.27022019.GRCh38.phased.gds │ └── LOG.txt │ └── Input │ ├── ._FAVOR.T2210k.gds │ ├── FAVOR.T2210k.gds │ └── FAVOR.T2210k.vcf ├── Docs ├── .DS_Store └── Tutorial │ ├── .DS_Store │ ├── Demos │ ├── FASRC.md │ ├── UKBB200KWESpreprocessVCF.md │ └── preprocessVCF.md │ ├── Detailed-Explanation │ └── FAVORFullDB.xlsx │ ├── Figures │ ├── FASRC1.jpg │ ├── FAVORannotatorOnTerra.png │ ├── Figure2A.png │ ├── Figure2B.png │ ├── Figure2C.png │ ├── HarvardDataVerse.png │ ├── LiveDemo.png │ ├── createDBinstance.png │ ├── figure1.png │ ├── figure4.png │ ├── postgreSQLdb.png │ ├── runningInstance.png │ ├── versions.png │ └── versions1.png │ └── Tables │ ├── table 1.png │ └── table1.png ├── README.md └── Scripts ├── CSV ├── Dockerfile.txt ├── FAVORannotatorCSVEssentialDB.R ├── FAVORannotatorCSVFullDB.R ├── FAVORannotatorv2aGDS.r ├── config.R ├── convertVCFtoGDS.r ├── subBatchJobs.sh ├── subBatchJobs.txt └── submitJobs.sh ├── Cloud ├── .DS_Store ├── ._.DS_Store ├── DNAnexus │ ├── ._FAVORannotatorDev.R │ ├── ._code.sh │ ├── ._favorannotator.R │ ├── FAVORannotatorDev.R │ ├── code.sh │ └── favorannotator.R └── Terra │ ├── .DS_Store │ ├── .Rhistory │ ├── FAVORannotatorEssentialDB.wdl │ ├── FAVORannotatorFullDB.wdl │ ├── FAVORannotatorTerra.r │ ├── FAVORannotatorTerraEssentialDB.R │ ├── FAVORannotatorTerraFullDB.R │ ├── convertVCFtoGDS.R │ ├── headercolumn.txt │ └── test.R ├── Dockerize ├── Dockerfile.txt ├── ExampleDockerFiles.txt └── install_packages.R ├── SQL ├── FAVORannotatorv2aGDS.r ├── FAVORdatabase_chrsplit.csv ├── config.R ├── convertVCFtoGDS.r ├── convertVCFtoNullGenotypeGDS.r ├── importCommands.sql └── submitJobs.sh └── UTL ├── FAVORannotatorAddIn.R ├── convBCF2GDS.r ├── convertVCFtoGDS.r ├── convertVCFtoNullGenotypeGDS.r ├── convertaGDStoVCF.r └── preProcessingVCF.sh /Data/TestData/1000G/All.chr22.27022019.GRCh38.phased.gds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Data/TestData/1000G/All.chr22.27022019.GRCh38.phased.gds -------------------------------------------------------------------------------- /Data/TestData/1000G/LOG.txt: -------------------------------------------------------------------------------- 1 | wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chr22.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz 2 | 3 | cd ../../../Scripts/UTL 4 | 5 | Rscript convertVCFtoGDS.r ../../Data/TestData/1000G/ALL.chr22.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz ../../Data/TestData/1000G/All.chr22.27022019.GRCh38.phased.gds 6 | 7 | 8 | Rscript convertVCFtoGDS.r ../../Data/TestData/Input/ALL.chr22.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz ../../Data/1000G/All.chr22.27022019.GRCh38.phased.gds 9 | Tue Sep 13 09:41:36 2022 10 | Variant Call Format (VCF) Import: 11 | file(s): 12 | ALL.chr22.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz (176.9M) 13 | file format: VCFv4.3 14 | the number of sets of chromosomes (ploidy): 2 15 | the number of samples: 2,548 16 | genotype storage: bit2 17 | compression method: LZMA_RA 18 | # of samples: 2548 19 | Output: 20 | ../../Data/1000G/All.chr22.27022019.GRCh38.phased.gds 21 | Parsing 'ALL.chr22.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz': 22 | + genotype/data { Bit2 2x2548x1059079 LZMA_ra(1.98%), 25.5M } 23 | 
Digests: 24 | sample.id [md5: cc8afb576aed4d02012126932df7cad6] 25 | variant.id [md5: 7c017d53094de68d314b6ad6d5731cee] 26 | position [md5: 661ae4bc37d222bc242b379ac5b4103c] 27 | chromosome [md5: 0f71906ff5f7af239ab447459e0fd340] 28 | allele [md5: e03733491972cf350905736dc3ba7897] 29 | genotype [md5: 310a491df81e5e5d015cfd8b0534c343] 30 | phase [md5: feef32f42a2bebbf7e8aca22a385acef] 31 | annotation/id [md5: af0e6be931baefc61425e7d80e8a7d6c] 32 | annotation/qual [md5: de3d57a832d4552c0b92a592f0c30ab3] 33 | annotation/filter [md5: 12aa343d303c14e0e724b2c3ac634d59] 34 | annotation/info/AF [md5: 08ba51bd9a4fe4c8d65124d906d651be] 35 | annotation/info/AC [md5: f50cf8580f617f21755b775c998a79a7] 36 | annotation/info/NS [md5: 3f8d2c2fe9b610e0407b63069cdcca19] 37 | annotation/info/AN [md5: 66dc16416504683004b60bf1259370d3] 38 | annotation/info/EAS_AF [md5: 6268475df4da4ecfe85ff45a31985bf2] 39 | annotation/info/EUR_AF [md5: 11f69a8880a343f916f428d428ee0e3e] 40 | annotation/info/AFR_AF [md5: cde11169e2c527e079563326ec5eb603] 41 | annotation/info/AMR_AF [md5: d85787dac3642db9f70cb05a9f22248a] 42 | annotation/info/SAS_AF [md5: 70bb72b5bf8b850a68da314769c6b09d] 43 | annotation/info/VT [md5: f7172d73a09bf45b641029eb2bde879e] 44 | annotation/info/EX_TARGET [md5: 401261c4071060a74aa7994bdce29065] 45 | annotation/info/DP [md5: 47cd81d4a60b61552a300cb09fa0a2cf] 46 | Done. 47 | Tue Sep 13 09:44:39 2022 48 | Optimize the access efficiency ... 49 | Clean up the fragments of GDS file: 50 | open the file '../../Data/1000G/All.chr22.27022019.GRCh38.phased.gds' (31.9M) 51 | # of fragments: 795 52 | save to '../../Data/1000G/All.chr22.27022019.GRCh38.phased.gds.tmp' 53 | rename '../../Data/1000G/All.chr22.27022019.GRCh38.phased.gds.tmp' (31.9M, reduced: 8.2K) 54 | # of fragments: 92 55 | Object of class "SeqVarGDSClass" 56 | File: ./Data/1000G/All.chr22.27022019.GRCh38.phased.gds (31.9M) 57 | + [ ] * 58 | |--+ description [ ] * 59 | |--+ sample.id { Str8 2548 LZMA_ra(7.84%), 1.6K } * 60 | |--+ variant.id { Int32 1059079 LZMA_ra(6.20%), 256.6K } * 61 | |--+ position { Int32 1059079 LZMA_ra(27.0%), 1.1M } * 62 | |--+ chromosome { Str8 1059079 LZMA_ra(0.02%), 617B } * 63 | |--+ allele { Str8 1059079 LZMA_ra(15.4%), 665.6K } * 64 | |--+ genotype [ ] * 65 | | |--+ data { Bit2 2x2548x1059079 LZMA_ra(1.98%), 25.5M } * 66 | | |--+ extra.index { Int32 3x0 LZMA_ra, 18B } * 67 | | \--+ extra { Int16 0 LZMA_ra, 18B } 68 | |--+ phase [ ] 69 | | |--+ data { Bit1 2548x1059079 LZMA_ra(0.01%), 48.1K } * 70 | | |--+ extra.index { Int32 3x0 LZMA_ra, 18B } * 71 | | \--+ extra { Bit1 0 LZMA_ra, 18B } 72 | |--+ annotation [ ] 73 | | |--+ id { Str8 1059079 LZMA_ra(0.03%), 305B } * 74 | | |--+ qual { Float32 1059079 LZMA_ra(0.02%), 777B } * 75 | | |--+ filter { Int32,factor 1059079 LZMA_ra(0.02%), 777B } * 76 | | |--+ info [ ] 77 | | | |--+ AF { Float32 1059079 LZMA_ra(7.72%), 319.6K } * 78 | | | |--+ AC { Int32 1059079 LZMA_ra(19.0%), 788.0K } * 79 | | | |--+ NS { Int32 1059079 LZMA_ra(0.02%), 777B } * 80 | | | |--+ AN { Int32 1059079 LZMA_ra(0.02%), 777B } * 81 | | | |--+ EAS_AF { Float32 1059079 LZMA_ra(5.73%), 237.2K } * 82 | | | |--+ EUR_AF { Float32 1059079 LZMA_ra(6.18%), 255.7K } * 83 | | | |--+ AFR_AF { Float32 1059079 LZMA_ra(8.56%), 354.1K } * 84 | | | |--+ AMR_AF { Float32 1059079 LZMA_ra(6.70%), 277.2K } * 85 | | | |--+ SAS_AF { Float32 1059079 LZMA_ra(6.45%), 266.8K } * 86 | | | |--+ VT { Str8 1059079 LZMA_ra(2.06%), 88.0K } * 87 | | | |--+ EX_TARGET { Bit1 1059079 LZMA_ra(6.62%), 8.6K } * 88 | | | \--+ DP { Int32 1059079 
LZMA_ra(45.0%), 1.8M } * 89 | | \--+ format [ ] 90 | \--+ sample.annotation [ ] 91 | 92 | 93 | 94 | 95 | 96 | 97 | Rscript convertVCFtoGDS.r ../../Data/TestData/1000G/ALL.chr1.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz ../../Data/TestData/1000G/All.chr1.27022019.GRCh38.phased.gds 98 | Wed Sep 14 15:34:06 2022 99 | Variant Call Format (VCF) Import: 100 | file(s): 101 | ALL.chr1.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz (992.0M) 102 | file format: VCFv4.3 103 | the number of sets of chromosomes (ploidy): 2 104 | the number of samples: 2,548 105 | genotype storage: bit2 106 | compression method: LZMA_RA 107 | # of samples: 2548 108 | Output: 109 | ../../Data/TestData/1000G/All.chr1.27022019.GRCh38.phased.gds 110 | Parsing 'ALL.chr1.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz': 111 | + genotype/data { Bit2 2x2548x6191833 LZMA_ra(1.66%), 124.8M } 112 | Digests: 113 | sample.id [md5: cc8afb576aed4d02012126932df7cad6] 114 | variant.id [md5: 279f387b3778a8c9d445cfb30d10a171] 115 | position [md5: a6c28615b3f1ee1c947a1b05edee371c] 116 | chromosome [md5: 54f1983c4511a2c4390a5d5df7caa405] 117 | allele [md5: 05ba696424c744191d4fa03bbbb513da] 118 | genotype [md5: 1cc8fb9f7c258ad52077d9fba0cd0b28] 119 | phase [md5: 65e7447bf92f4d3f01ed41bef75a7909] 120 | annotation/id [md5: dbfffce3ef30f3f18399e274c1310a1b] 121 | annotation/qual [md5: fce0966ea8b5452c728dedad5bc274e9] 122 | annotation/filter [md5: 2816dfe618d22aecb05a7c7ee4dd0b15] 123 | annotation/info/AF [md5: 11a4bf622d63d108bb353ea9fc1401df] 124 | annotation/info/AC [md5: dee598f41dd55a2e53df96d7ac639e23] 125 | annotation/info/NS [md5: 03af0bc2c44c6ca6bef0e93a68260dce] 126 | annotation/info/AN [md5: 5cf4736661784e0f51f33d5c7514a963] 127 | annotation/info/EAS_AF [md5: 297f3fa3a884a7f4e8cf1a977fce980c] 128 | annotation/info/EUR_AF [md5: 7ad89550530d1118871c8a4013b6a04f] 129 | annotation/info/AFR_AF [md5: 24f627bbc19e5e300ed57ee265e67904] 130 | annotation/info/AMR_AF [md5: 5a84744b92f10161982d1143896a19ec] 131 | annotation/info/SAS_AF [md5: dfaa87f4f0e77c5375c657652f9c0fd8] 132 | annotation/info/VT [md5: afbc4e4d62b4497a42d627dd938ae5ac] 133 | annotation/info/EX_TARGET [md5: dbf69680949973e984135aac3ab2d290] 134 | annotation/info/DP [md5: a64abd5bc289ef2b94d8807371ef601c] 135 | Done. 136 | Wed Sep 14 15:54:28 2022 137 | Optimize the access efficiency ... 
138 | Clean up the fragments of GDS file: 139 | open the file '../../Data/TestData/1000G/All.chr1.27022019.GRCh38.phased.gds' (161.1M) 140 | # of fragments: 3534 141 | save to '../../Data/TestData/1000G/All.chr1.27022019.GRCh38.phased.gds.tmp' 142 | rename '../../Data/TestData/1000G/All.chr1.27022019.GRCh38.phased.gds.tmp' (161.0M, reduced: 40.3K) 143 | # of fragments: 92 144 | Wed Sep 14 15:54:29 2022 145 | [1] "GDS built" 146 | Object of class "SeqVarGDSClass" 147 | File: ./Data/TestData/1000G/All.chr1.27022019.GRCh38.phased.gds (161.0M) 148 | + [ ] * 149 | |--+ description [ ] * 150 | |--+ sample.id { Str8 2548 LZMA_ra(7.84%), 1.6K } * 151 | |--+ variant.id { Int32 6191833 LZMA_ra(2.92%), 706.4K } * 152 | |--+ position { Int32 6191833 LZMA_ra(27.9%), 6.6M } * 153 | |--+ chromosome { Str8 6191833 LZMA_ra(0.02%), 1.9K } * 154 | |--+ allele { Str8 6191833 LZMA_ra(15.7%), 3.9M } * 155 | |--+ genotype [ ] * 156 | | |--+ data { Bit2 2x2548x6191833 LZMA_ra(1.66%), 124.8M } * 157 | | |--+ extra.index { Int32 3x0 LZMA_ra, 18B } * 158 | | \--+ extra { Int16 0 LZMA_ra, 18B } 159 | |--+ phase [ ] 160 | | |--+ data { Bit1 2548x6191833 LZMA_ra(0.01%), 280.4K } * 161 | | |--+ extra.index { Int32 3x0 LZMA_ra, 18B } * 162 | | \--+ extra { Bit1 0 LZMA_ra, 18B } 163 | |--+ annotation [ ] 164 | | |--+ id { Str8 6191833 LZMA_ra(0.02%), 1.0K } * 165 | | |--+ qual { Float32 6191833 LZMA_ra(0.02%), 3.7K } * 166 | | |--+ filter { Int32,factor 6191833 LZMA_ra(0.02%), 3.7K } * 167 | | |--+ info [ ] 168 | | | |--+ AF { Float32 6191833 LZMA_ra(7.39%), 1.7M } * 169 | | | |--+ AC { Int32 6191833 LZMA_ra(18.5%), 4.4M } * 170 | | | |--+ NS { Int32 6191833 LZMA_ra(0.02%), 3.7K } * 171 | | | |--+ AN { Int32 6191833 LZMA_ra(0.02%), 3.7K } * 172 | | | |--+ EAS_AF { Float32 6191833 LZMA_ra(5.54%), 1.3M } * 173 | | | |--+ EUR_AF { Float32 6191833 LZMA_ra(5.90%), 1.4M } * 174 | | | |--+ AFR_AF { Float32 6191833 LZMA_ra(8.20%), 1.9M } * 175 | | | |--+ AMR_AF { Float32 6191833 LZMA_ra(6.52%), 1.5M } * 176 | | | |--+ SAS_AF { Float32 6191833 LZMA_ra(6.17%), 1.5M } * 177 | | | |--+ VT { Str8 6191833 LZMA_ra(2.09%), 522.7K } * 178 | | | |--+ EX_TARGET { Bit1 6191833 LZMA_ra(4.82%), 36.4K } * 179 | | | \--+ DP { Int32 6191833 LZMA_ra(44.0%), 10.4M } * 180 | | \--+ format [ ] 181 | \--+ sample.annotation [ ] 182 | 183 | 184 | zhou@M1 Full % cp ../../FAVORannotator/Scripts/CSV/FAVORannotatorCSVFullDB.R . 185 | zhou@M1 Full % Rscript FAVORannotatorCSVFullDB.R All.chr22.27022019.GRCh38.phased.gds 22 186 | [1] "gds.file: All.chr22.27022019.GRCh38.phased.gds" 187 | [1] "chr: 22" 188 | [1] "use_compression: Yes" 189 | --2022-09-14 16:39:31-- https://dataverse.harvard.edu/api/access/datafile/6358299 190 | Resolving dataverse.harvard.edu (dataverse.harvard.edu)... 3.219.100.164, 54.211.138.37, 3.226.192.24 191 | Connecting to dataverse.harvard.edu (dataverse.harvard.edu)|3.219.100.164|:443... connected. 192 | HTTP request sent, awaiting response... 
303 See Other 193 | Location: https://dvn-cloud.s3.amazonaws.com/10.7910/DVN/KFUBKG/181abe84e49-d907a5916c0e?response-content-disposition=attachment%3B%20filename%2A%3DUTF-8%27%27FAVOR.FullDB.Chr22.tar.gz&response-content-type=application%2Fgzip&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20220914T203932Z&X-Amz-SignedHeaders=host&X-Amz-Expires=3600&X-Amz-Credential=AKIAIEJ3NV7UYCSRJC7A%2F20220914%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Signature=705ebe6c586f25f965028b6008bcafc810a4afeec25a6aaca5eca23c11ff87a2 [following] 194 | --2022-09-14 16:39:32-- https://dvn-cloud.s3.amazonaws.com/10.7910/DVN/KFUBKG/181abe84e49-d907a5916c0e?response-content-disposition=attachment%3B%20filename%2A%3DUTF-8%27%27FAVOR.FullDB.Chr22.tar.gz&response-content-type=application%2Fgzip&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20220914T203932Z&X-Amz-SignedHeaders=host&X-Amz-Expires=3600&X-Amz-Credential=AKIAIEJ3NV7UYCSRJC7A%2F20220914%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Signature=705ebe6c586f25f965028b6008bcafc810a4afeec25a6aaca5eca23c11ff87a2 195 | Resolving dvn-cloud.s3.amazonaws.com (dvn-cloud.s3.amazonaws.com)... 3.5.8.193 196 | Connecting to dvn-cloud.s3.amazonaws.com (dvn-cloud.s3.amazonaws.com)|3.5.8.193|:443... connected. 197 | HTTP request sent, awaiting response... 200 OK 198 | Length: 9565974525 (8.9G) [application/gzip] 199 | Saving to: ‘6358299’ 200 | 201 | 6358299 88%[===================================================================================================> 6358299 6358299 100%[================================================================================================================>] 8.91G 3.71MB/s in 44m 54s 202 | 203 | 2022-09-14 17:24:26 (3.39 MB/s) - ‘6358299’ saved [9565974525/9565974525] 204 | 205 | x chr22_1.csv 206 | x chr22_2.csv 207 | x chr22_3.csv 208 | Object of class "SeqVarGDSClass" 209 | File: /Users/zhou/Storage/Research/Projects/Test/Full/All.chr22.27022019.GRCh38.phased.gds (31.9M) 210 | + [ ] * 211 | |--+ description [ ] * 212 | |--+ sample.id { Str8 2548 LZMA_ra(7.84%), 1.6K } * 213 | |--+ variant.id { Int32 1059079 LZMA_ra(6.20%), 256.6K } * 214 | |--+ position { Int32 1059079 LZMA_ra(27.0%), 1.1M } * 215 | |--+ chromosome { Str8 1059079 LZMA_ra(0.02%), 617B } * 216 | |--+ allele { Str8 1059079 LZMA_ra(15.4%), 665.6K } * 217 | |--+ genotype [ ] * 218 | | |--+ data { Bit2 2x2548x1059079 LZMA_ra(1.98%), 25.5M } * 219 | | |--+ extra.index { Int32 3x0 LZMA_ra, 18B } * 220 | | \--+ extra { Int16 0 LZMA_ra, 18B } 221 | |--+ phase [ ] 222 | | |--+ data { Bit1 2548x1059079 LZMA_ra(0.01%), 48.1K } * 223 | | |--+ extra.index { Int32 3x0 LZMA_ra, 18B } * 224 | | \--+ extra { Bit1 0 LZMA_ra, 18B } 225 | |--+ annotation [ ] 226 | | |--+ id { Str8 1059079 LZMA_ra(0.03%), 305B } * 227 | | |--+ qual { Float32 1059079 LZMA_ra(0.02%), 777B } * 228 | | |--+ filter { Int32,factor 1059079 LZMA_ra(0.02%), 777B } * 229 | | |--+ info [ ] 230 | | | |--+ AF { Float32 1059079 LZMA_ra(7.72%), 319.6K } * 231 | | | |--+ AC { Int32 1059079 LZMA_ra(19.0%), 788.0K } * 232 | | | |--+ NS { Int32 1059079 LZMA_ra(0.02%), 777B } * 233 | | | |--+ AN { Int32 1059079 LZMA_ra(0.02%), 777B } * 234 | | | |--+ EAS_AF { Float32 1059079 LZMA_ra(5.73%), 237.2K } * 235 | | | |--+ EUR_AF { Float32 1059079 LZMA_ra(6.18%), 255.7K } * 236 | | | |--+ AFR_AF { Float32 1059079 LZMA_ra(8.56%), 354.1K } * 237 | | | |--+ AMR_AF { Float32 1059079 LZMA_ra(6.70%), 277.2K } * 238 | | | |--+ SAS_AF { Float32 1059079 LZMA_ra(6.45%), 266.8K } * 239 | | | |--+ VT { Str8 1059079 LZMA_ra(2.06%), 88.0K } * 240 | | | |--+ 
EX_TARGET { Bit1 1059079 LZMA_ra(6.62%), 8.6K } * 241 | | | \--+ DP { Int32 1059079 LZMA_ra(45.0%), 1.8M } * 242 | | \--+ format [ ] 243 | \--+ sample.annotation [ ] 244 | [1] 1 245 | [1] 2 246 | [1] 3 247 | [1] 1 248 | [1] 2 249 | [1] 3 250 | ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── 251 | cols( 252 | .default = col_double(), 253 | VarInfo = col_character(), 254 | variant_vcf = col_character(), 255 | variant_annovar = col_character(), 256 | ref_annovar = col_character(), 257 | alt_annovar = col_character(), 258 | ref_vcf = col_character(), 259 | alt_vcf = col_character(), 260 | aloft_value = col_logical(), 261 | aloft_description = col_logical(), 262 | filter_status = col_character(), 263 | cage_enhancer = col_logical(), 264 | cage_promoter = col_logical(), 265 | cage_tc = col_character(), 266 | clnsig = col_logical(), 267 | clnsigincl = col_logical(), 268 | clndn = col_logical(), 269 | clndnincl = col_logical(), 270 | clnrevstat = col_logical(), 271 | origin = col_logical(), 272 | clndisdb = col_logical() 273 | # ... with 38 more columns 274 | ) 275 | ℹ Use `spec()` for the full column specifications. 276 | 277 | Warning: 7150897 parsing failures. 278 | row col expected actual file 279 | 1421 cage_promoter 1/0/T/F/TRUE/FALSE chr22:10729410..10729413,- './chr22/Anno_chr22.csv' 280 | 1466 cage_promoter 1/0/T/F/TRUE/FALSE chr22:10730403..10730417,- './chr22/Anno_chr22.csv' 281 | 1467 cage_promoter 1/0/T/F/TRUE/FALSE chr22:10730403..10730417,- './chr22/Anno_chr22.csv' 282 | 1468 cage_promoter 1/0/T/F/TRUE/FALSE chr22:10730403..10730417,- './chr22/Anno_chr22.csv' 283 | 1469 cage_promoter 1/0/T/F/TRUE/FALSE chr22:10730403..10730417,- './chr22/Anno_chr22.csv' 284 | .... ............. .................. .......................... ........................ 285 | See problems(...) for more details. 
286 | 287 | [1] 1059079 190 288 | There were 14 warnings (use warnings() to see them) 289 | Object of class "SeqVarGDSClass" 290 | File: /Users/zhou/Storage/Research/Projects/Test/Full/All.chr22.27022019.GRCh38.phased.gds (226.1M) 291 | + [ ] * 292 | |--+ description [ ] * 293 | |--+ sample.id { Str8 2548 LZMA_ra(7.84%), 1.6K } * 294 | |--+ variant.id { Int32 1059079 LZMA_ra(6.20%), 256.6K } * 295 | |--+ position { Int32 1059079 LZMA_ra(27.0%), 1.1M } * 296 | |--+ chromosome { Str8 1059079 LZMA_ra(0.02%), 617B } * 297 | |--+ allele { Str8 1059079 LZMA_ra(15.4%), 665.6K } * 298 | |--+ genotype [ ] * 299 | | |--+ data { Bit2 2x2548x1059079 LZMA_ra(1.98%), 25.5M } * 300 | | |--+ extra.index { Int32 3x0 LZMA_ra, 18B } * 301 | | \--+ extra { Int16 0 LZMA_ra, 18B } 302 | |--+ phase [ ] 303 | | |--+ data { Bit1 2548x1059079 LZMA_ra(0.01%), 48.1K } * 304 | | |--+ extra.index { Int32 3x0 LZMA_ra, 18B } * 305 | | \--+ extra { Bit1 0 LZMA_ra, 18B } 306 | |--+ annotation [ ] 307 | | |--+ id { Str8 1059079 LZMA_ra(0.03%), 305B } * 308 | | |--+ qual { Float32 1059079 LZMA_ra(0.02%), 777B } * 309 | | |--+ filter { Int32,factor 1059079 LZMA_ra(0.02%), 777B } * 310 | | |--+ info [ ] 311 | | | |--+ AF { Float32 1059079 LZMA_ra(7.72%), 319.6K } * 312 | | | |--+ AC { Int32 1059079 LZMA_ra(19.0%), 788.0K } * 313 | | | |--+ NS { Int32 1059079 LZMA_ra(0.02%), 777B } * 314 | | | |--+ AN { Int32 1059079 LZMA_ra(0.02%), 777B } * 315 | | | |--+ EAS_AF { Float32 1059079 LZMA_ra(5.73%), 237.2K } * 316 | | | |--+ EUR_AF { Float32 1059079 LZMA_ra(6.18%), 255.7K } * 317 | | | |--+ AFR_AF { Float32 1059079 LZMA_ra(8.56%), 354.1K } * 318 | | | |--+ AMR_AF { Float32 1059079 LZMA_ra(6.70%), 277.2K } * 319 | | | |--+ SAS_AF { Float32 1059079 LZMA_ra(6.45%), 266.8K } * 320 | | | |--+ VT { Str8 1059079 LZMA_ra(2.06%), 88.0K } * 321 | | | |--+ EX_TARGET { Bit1 1059079 LZMA_ra(6.62%), 8.6K } * 322 | | | |--+ DP { Int32 1059079 LZMA_ra(45.0%), 1.8M } * 323 | | | \--+ FAVORFullDBAug1st2022 [ spec_tbl_df,tbl_df,tbl,data.frame,list ] * 324 | | | |--+ VarInfo { Str8 1059079 LZMA_ra(15.6%), 2.5M } 325 | | | |--+ vid { Float64 1059079 LZMA_ra(21.8%), 1.8M } 326 | | | |--+ variant_vcf { Str8 1059079 LZMA_ra(15.5%), 2.5M } 327 | | | |--+ variant_annovar { Str8 1059079 LZMA_ra(11.7%), 2.9M } 328 | | | |--+ chromosome { Float64 1059079 LZMA_ra(0.35%), 29.4K } 329 | | | |--+ start_position { Float64 1059079 LZMA_ra(18.4%), 1.5M } 330 | | | |--+ end_position { Float64 1059079 LZMA_ra(18.4%), 1.5M } 331 | | | |--+ ref_annovar { Str8 1059079 LZMA_ra(17.9%), 383.0K } 332 | | | |--+ alt_annovar { Str8 1059079 LZMA_ra(16.9%), 351.1K } 333 | | | |--+ position { Float64 1059079 LZMA_ra(18.4%), 1.5M } 334 | | | |--+ ref_vcf { Str8 1059079 LZMA_ra(18.4%), 399.5K } 335 | | | |--+ alt_vcf { Str8 1059079 LZMA_ra(16.6%), 347.3K } 336 | | | |--+ aloft_value { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 337 | | | |--+ aloft_description { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 338 | | | |--+ apc_conservation { Float64 1059079 LZMA_ra(87.0%), 7.0M } 339 | | | |--+ apc_conservation_v2 { Float64 1059079 LZMA_ra(86.9%), 7.0M } 340 | | | |--+ apc_epigenetics_active { Float64 1059079 LZMA_ra(79.4%), 6.4M } 341 | | | |--+ apc_epigenetics { Float64 1059079 LZMA_ra(86.2%), 7.0M } 342 | | | |--+ apc_epigenetics_repressed { Float64 1059079 LZMA_ra(48.9%), 3.9M } 343 | | | |--+ apc_epigenetics_transcription { Float64 1059079 LZMA_ra(48.5%), 3.9M } 344 | | | |--+ apc_local_nucleotide_diversity { Float64 1059079 LZMA_ra(1.92%), 158.5K } 345 | | | |--+ 
apc_local_nucleotide_diversity_v2 { Float64 1059079 LZMA_ra(83.5%), 6.8M } 346 | | | |--+ apc_local_nucleotide_diversity_v3 { Float64 1059079 LZMA_ra(84.1%), 6.8M } 347 | | | |--+ apc_mappability { Float64 1059079 LZMA_ra(31.4%), 2.5M } 348 | | | |--+ apc_micro_rna { Float64 1059079 LZMA_ra(2.91%), 241.2K } 349 | | | |--+ apc_mutation_density { Float64 1059079 LZMA_ra(83.5%), 6.7M } 350 | | | |--+ apc_protein_function { Float64 1059079 LZMA_ra(2.57%), 212.7K } 351 | | | |--+ apc_protein_function_v2 { Float64 1059079 LZMA_ra(2.59%), 214.4K } 352 | | | |--+ apc_protein_function_v3 { Float64 1059079 LZMA_ra(2.58%), 213.5K } 353 | | | |--+ apc_proximity_to_coding { Float64 1059079 LZMA_ra(14.1%), 1.1M } 354 | | | |--+ apc_proximity_to_coding_v2 { Float64 1059079 LZMA_ra(6.53%), 540.0K } 355 | | | |--+ apc_proximity_to_tsstes { Float64 1059079 LZMA_ra(74.6%), 6.0M } 356 | | | |--+ apc_transcription_factor { Float64 1059079 LZMA_ra(8.40%), 694.7K } 357 | | | |--+ bravo_an { Float64 1059079 LZMA_ra(1.68%), 138.6K } 358 | | | |--+ bravo_af { Float64 1059079 LZMA_ra(26.3%), 2.1M } 359 | | | |--+ filter_status { Str8 1059079 LZMA_ra(4.39%), 205.2K } 360 | | | |--+ cage_enhancer { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 361 | | | |--+ cage_promoter { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 362 | | | |--+ cage_tc { Str8 1059079 LZMA_ra(5.61%), 98.1K } 363 | | | |--+ clnsig { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 364 | | | |--+ clnsigincl { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 365 | | | |--+ clndn { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 366 | | | |--+ clndnincl { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 367 | | | |--+ clnrevstat { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 368 | | | |--+ origin { Int32,logical 1059079 LZMA_ra(0.22%), 9.3K } * 369 | | | |--+ clndisdb { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 370 | | | |--+ clndisdbincl { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 371 | | | |--+ geneinfo { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 372 | | | |--+ polyphen2_hdiv_score { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 373 | | | |--+ polyphen2_hvar_score { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 374 | | | |--+ mutation_taster_score { Int32,logical 1059079 LZMA_ra(0.27%), 11.0K } * 375 | | | |--+ mutation_assessor_score { Int32,logical 1059079 LZMA_ra(0.05%), 2.2K } * 376 | | | |--+ metasvm_pred { Int32,logical 1059079 LZMA_ra(0.33%), 13.7K } * 377 | | | |--+ fathmm_xf { Float64 1059079 LZMA_ra(54.4%), 4.4M } 378 | | | |--+ funseq_value { Int32,logical 1059079 LZMA_ra(0.38%), 15.7K } * 379 | | | |--+ funseq_description { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 380 | | | |--+ genecode_comprehensive_category { Str8 1059079 LZMA_ra(0.64%), 66.9K } 381 | | | |--+ genecode_comprehensive_info { Str8 1059079 LZMA_ra(5.62%), 1.1M } 382 | | | |--+ genecode_comprehensive_exonic_category { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 383 | | | |--+ genecode_comprehensive_exonic_info { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 384 | | | |--+ genehancer { Str8 1059079 LZMA_ra(0.24%), 171.9K } 385 | | | |--+ af_total { Float64 1059079 LZMA_ra(56.3%), 4.6M } 386 | | | |--+ af_asj_female { Float64 1059079 LZMA_ra(9.65%), 798.1K } 387 | | | |--+ af_eas_female { Float64 1059079 LZMA_ra(9.85%), 815.3K } 388 | | | |--+ af_afr_male { Float64 1059079 LZMA_ra(34.1%), 2.8M } 389 | | | |--+ af_female { Float64 1059079 LZMA_ra(47.3%), 3.8M } 390 | | | |--+ af_fin_male { Float64 1059079 LZMA_ra(16.1%), 1.3M } 391 | | | |--+ 
af_oth_female { Float64 1059079 LZMA_ra(12.6%), 1.0M } 392 | | | |--+ af_ami { Float64 1059079 LZMA_ra(7.24%), 599.2K } 393 | | | |--+ af_oth { Float64 1059079 LZMA_ra(16.2%), 1.3M } 394 | | | |--+ af_male { Float64 1059079 LZMA_ra(48.7%), 3.9M } 395 | | | |--+ af_ami_female { Float64 1059079 LZMA_ra(6.35%), 525.5K } 396 | | | |--+ af_afr { Float64 1059079 LZMA_ra(42.0%), 3.4M } 397 | | | |--+ af_eas_male { Float64 1059079 LZMA_ra(10.3%), 855.4K } 398 | | | |--+ af_sas { Float64 1059079 LZMA_ra(16.0%), 1.3M } 399 | | | |--+ af_nfe_female { Float64 1059079 LZMA_ra(26.0%), 2.1M } 400 | | | |--+ af_asj_male { Float64 1059079 LZMA_ra(9.37%), 775.5K } 401 | | | |--+ af_raw { Float64 1059079 LZMA_ra(49.4%), 4.0M } 402 | | | |--+ af_oth_male { Float64 1059079 LZMA_ra(12.6%), 1.0M } 403 | | | |--+ af_nfe_male { Float64 1059079 LZMA_ra(24.4%), 2.0M } 404 | | | |--+ af_asj { Float64 1059079 LZMA_ra(11.5%), 947.8K } 405 | | | |--+ af_amr_male { Float64 1059079 LZMA_ra(22.3%), 1.8M } 406 | | | |--+ af_amr_female { Float64 1059079 LZMA_ra(21.0%), 1.7M } 407 | | | |--+ af_sas_female { Float64 1059079 LZMA_ra(8.97%), 742.4K } 408 | | | |--+ af_fin { Float64 1059079 LZMA_ra(16.9%), 1.4M } 409 | | | |--+ af_afr_female { Float64 1059079 LZMA_ra(35.8%), 2.9M } 410 | | | |--+ af_sas_male { Float64 1059079 LZMA_ra(14.8%), 1.2M } 411 | | | |--+ af_amr { Float64 1059079 LZMA_ra(26.8%), 2.2M } 412 | | | |--+ af_nfe { Float64 1059079 LZMA_ra(30.3%), 2.4M } 413 | | | |--+ af_eas { Float64 1059079 LZMA_ra(12.6%), 1.0M } 414 | | | |--+ af_ami_male { Float64 1059079 LZMA_ra(6.20%), 512.7K } 415 | | | |--+ af_fin_female { Float64 1059079 LZMA_ra(11.3%), 934.2K } 416 | | | |--+ linsight { Float64 1059079 LZMA_ra(25.8%), 2.1M } 417 | | | |--+ gc { Float64 1059079 LZMA_ra(9.14%), 756.2K } 418 | | | |--+ cpg { Float64 1059079 LZMA_ra(4.39%), 363.1K } 419 | | | |--+ min_dist_tss { Float64 1059079 LZMA_ra(18.5%), 1.5M } 420 | | | |--+ min_dist_tse { Float64 1059079 LZMA_ra(18.5%), 1.5M } 421 | | | |--+ sift_cat { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 422 | | | |--+ sift_val { Int32,logical 1059079 LZMA_ra(0.21%), 8.9K } * 423 | | | |--+ polyphen_cat { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 424 | | | |--+ polyphen_val { Int32,logical 1059079 LZMA_ra(0.15%), 6.1K } * 425 | | | |--+ priphcons { Float64 1059079 LZMA_ra(14.0%), 1.1M } 426 | | | |--+ mamphcons { Float64 1059079 LZMA_ra(9.34%), 773.0K } 427 | | | |--+ verphcons { Float64 1059079 LZMA_ra(9.14%), 755.9K } 428 | | | |--+ priphylop { Float64 1059079 LZMA_ra(14.7%), 1.2M } 429 | | | |--+ mamphylop { Float64 1059079 LZMA_ra(20.7%), 1.7M } 430 | | | |--+ verphylop { Float64 1059079 LZMA_ra(21.0%), 1.7M } 431 | | | |--+ bstatistic { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 432 | | | |--+ chmm_e1 { Float64 1059079 LZMA_ra(0.70%), 58.1K } 433 | | | |--+ chmm_e2 { Float64 1059079 LZMA_ra(0.63%), 52.2K } 434 | | | |--+ chmm_e3 { Float64 1059079 LZMA_ra(0.84%), 69.2K } 435 | | | |--+ chmm_e4 { Float64 1059079 LZMA_ra(0.98%), 81.3K } 436 | | | |--+ chmm_e5 { Float64 1059079 LZMA_ra(0.78%), 64.3K } 437 | | | |--+ chmm_e6 { Float64 1059079 LZMA_ra(0.65%), 53.7K } 438 | | | |--+ chmm_e7 { Float64 1059079 LZMA_ra(1.33%), 110.3K } 439 | | | |--+ chmm_e8 { Float64 1059079 LZMA_ra(0.97%), 80.0K } 440 | | | |--+ chmm_e9 { Float64 1059079 LZMA_ra(0.57%), 47.5K } 441 | | | |--+ chmm_e10 { Float64 1059079 LZMA_ra(0.69%), 57.4K } 442 | | | |--+ chmm_e11 { Float64 1059079 LZMA_ra(0.87%), 71.8K } 443 | | | |--+ chmm_e12 { Float64 1059079 LZMA_ra(1.37%), 113.7K } 444 | | 
| |--+ chmm_e13 { Float64 1059079 LZMA_ra(1.00%), 82.9K } 445 | | | |--+ chmm_e14 { Float64 1059079 LZMA_ra(0.99%), 81.6K } 446 | | | |--+ chmm_e15 { Float64 1059079 LZMA_ra(2.18%), 180.8K } 447 | | | |--+ chmm_e16 { Float64 1059079 LZMA_ra(0.55%), 45.6K } 448 | | | |--+ chmm_e17 { Float64 1059079 LZMA_ra(0.70%), 58.3K } 449 | | | |--+ chmm_e18 { Float64 1059079 LZMA_ra(0.69%), 57.5K } 450 | | | |--+ chmm_e19 { Float64 1059079 LZMA_ra(0.65%), 53.4K } 451 | | | |--+ chmm_e20 { Float64 1059079 LZMA_ra(0.63%), 52.4K } 452 | | | |--+ chmm_e21 { Float64 1059079 LZMA_ra(1.33%), 109.8K } 453 | | | |--+ chmm_e22 { Float64 1059079 LZMA_ra(1.03%), 85.6K } 454 | | | |--+ chmm_e23 { Float64 1059079 LZMA_ra(0.79%), 65.6K } 455 | | | |--+ chmm_e24 { Float64 1059079 LZMA_ra(1.31%), 108.6K } 456 | | | |--+ chmm_e25 { Float64 1059079 LZMA_ra(0.83%), 68.7K } 457 | | | |--+ gerp_rs { Float64 1059079 LZMA_ra(1.00%), 82.8K } 458 | | | |--+ gerp_rs_pval { Float64 1059079 LZMA_ra(1.41%), 117.1K } 459 | | | |--+ gerp_n { Float64 1059079 LZMA_ra(14.0%), 1.1M } 460 | | | |--+ gerp_s { Float64 1059079 LZMA_ra(18.9%), 1.5M } 461 | | | |--+ encodeh3k4me1_sum { Float64 1059079 LZMA_ra(18.2%), 1.5M } 462 | | | |--+ encodeh3k4me2_sum { Float64 1059079 LZMA_ra(17.4%), 1.4M } 463 | | | |--+ encodeh3k4me3_sum { Float64 1059079 LZMA_ra(17.4%), 1.4M } 464 | | | |--+ encodeh3k9ac_sum { Float64 1059079 LZMA_ra(17.5%), 1.4M } 465 | | | |--+ encodeh3k9me3_sum { Float64 1059079 LZMA_ra(17.7%), 1.4M } 466 | | | |--+ encodeh3k27ac_sum { Float64 1059079 LZMA_ra(17.9%), 1.4M } 467 | | | |--+ encodeh3k27me3_sum { Float64 1059079 LZMA_ra(18.5%), 1.5M } 468 | | | |--+ encodeh3k36me3_sum { Float64 1059079 LZMA_ra(17.4%), 1.4M } 469 | | | |--+ encodeh3k79me2_sum { Float64 1059079 LZMA_ra(17.6%), 1.4M } 470 | | | |--+ encodeh4k20me1_sum { Float64 1059079 LZMA_ra(18.0%), 1.5M } 471 | | | |--+ encodeh2afz_sum { Float64 1059079 LZMA_ra(17.9%), 1.4M } 472 | | | |--+ encode_dnase_sum { Float64 1059079 LZMA_ra(9.53%), 788.3K } 473 | | | |--+ encodetotal_rna_sum { Float64 1059079 LZMA_ra(5.74%), 474.7K } 474 | | | |--+ grantham { Int32,logical 1059079 LZMA_ra(0.04%), 1.8K } * 475 | | | |--+ freq100bp { Float64 1059079 LZMA_ra(1.96%), 162.4K } 476 | | | |--+ rare100bp { Float64 1059079 LZMA_ra(3.02%), 249.5K } 477 | | | |--+ sngl100bp { Float64 1059079 LZMA_ra(6.39%), 528.8K } 478 | | | |--+ freq1000bp { Float64 1059079 LZMA_ra(2.28%), 188.6K } 479 | | | |--+ rare1000bp { Float64 1059079 LZMA_ra(3.46%), 286.1K } 480 | | | |--+ sngl1000bp { Float64 1059079 LZMA_ra(7.47%), 617.8K } 481 | | | |--+ freq10000bp { Float64 1059079 LZMA_ra(2.64%), 218.1K } 482 | | | |--+ rare10000bp { Float64 1059079 LZMA_ra(4.01%), 332.0K } 483 | | | |--+ sngl10000bp { Float64 1059079 LZMA_ra(7.93%), 655.9K } 484 | | | |--+ remap_overlap_tf { Float64 1059079 LZMA_ra(4.28%), 354.0K } 485 | | | |--+ remap_overlap_cl { Float64 1059079 LZMA_ra(4.72%), 390.2K } 486 | | | |--+ cadd_rawscore { Float64 1059079 LZMA_ra(46.0%), 3.7M } 487 | | | |--+ cadd_phred { Float64 1059079 LZMA_ra(23.8%), 1.9M } 488 | | | |--+ k24_bismap { Float64 1059079 LZMA_ra(7.19%), 594.8K } 489 | | | |--+ k24_umap { Float64 1059079 LZMA_ra(3.94%), 325.6K } 490 | | | |--+ k36_bismap { Float64 1059079 LZMA_ra(4.25%), 351.3K } 491 | | | |--+ k36_umap { Float64 1059079 LZMA_ra(3.44%), 284.5K } 492 | | | |--+ k50_bismap { Float64 1059079 LZMA_ra(3.75%), 310.0K } 493 | | | |--+ k50_umap { Float64 1059079 LZMA_ra(2.69%), 222.6K } 494 | | | |--+ k100_bismap { Float64 1059079 LZMA_ra(1.97%), 162.7K } 495 | | | 
|--+ k100_umap { Float64 1059079 LZMA_ra(0.68%), 56.6K } 496 | | | |--+ nucdiv { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 497 | | | |--+ rdhs { Str8 1059079 LZMA_ra(2.52%), 112.2K } 498 | | | |--+ recombination_rate { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 499 | | | |--+ refseq_category { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 500 | | | |--+ refseq_info { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 501 | | | |--+ refseq_exonic_category { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 502 | | | |--+ refseq_exonic_info { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 503 | | | |--+ super_enhancer { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 504 | | | |--+ tg_afr { Int32,logical 1059079 LZMA_ra(0.08%), 3.2K } * 505 | | | |--+ tg_all { Int32,logical 1059079 LZMA_ra(0.05%), 2.3K } * 506 | | | |--+ tg_amr { Int32,logical 1059079 LZMA_ra(0.07%), 3.0K } * 507 | | | |--+ tg_eas { Int32,logical 1059079 LZMA_ra(0.13%), 5.5K } * 508 | | | |--+ tg_eur { Int32,logical 1059079 LZMA_ra(0.07%), 3.0K } * 509 | | | |--+ tg_sas { Int32,logical 1059079 LZMA_ra(0.08%), 3.4K } * 510 | | | |--+ ucsc_category { Str8 1059079 LZMA_ra(0.62%), 74.5K } 511 | | | |--+ ucsc_info { Str8 1059079 LZMA_ra(2.24%), 1.1M } 512 | | | |--+ ucsc_exonic_category { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 513 | | | \--+ ucsc_exonic_info { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 514 | | \--+ format [ ] 515 | \--+ sample.annotation [ ] 516 | [1] "time" 517 | Time difference of 1.045753 hours 518 | 519 | 520 | 521 | 522 | 523 | 524 | zhou@M1 Test % Rscript FAVORannotatorCSVEssentialDB.R All.chr22.27022019.GRCh38.phased.gds 22 525 | [1] "gds.file: All.chr22.27022019.GRCh38.phased.gds" 526 | [1] "chr: 22" 527 | [1] "use_compression: Yes" 528 | --2022-09-14 16:42:28-- https://dataverse.harvard.edu/api/access/datafile/6170504 529 | Resolving dataverse.harvard.edu (dataverse.harvard.edu)... 3.219.100.164, 3.226.192.24, 54.211.138.37 530 | Connecting to dataverse.harvard.edu (dataverse.harvard.edu)|3.219.100.164|:443... connected. 531 | HTTP request sent, awaiting response... 303 See Other 532 | Location: https://dvn-cloud.s3.amazonaws.com/10.7910/DVN/1VGTJI/17fe155b1d0-76967428f313?response-content-disposition=attachment%3B%20filename%2A%3DUTF-8%27%27chr22.tar.gz&response-content-type=application%2Fx-gzip&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20220914T204228Z&X-Amz-SignedHeaders=host&X-Amz-Expires=3600&X-Amz-Credential=AKIAIEJ3NV7UYCSRJC7A%2F20220914%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Signature=6be2b2b646fbb584c3a51af135b1558a0cb7d63bbffa6457da4ecde93573b489 [following] 533 | --2022-09-14 16:42:28-- https://dvn-cloud.s3.amazonaws.com/10.7910/DVN/1VGTJI/17fe155b1d0-76967428f313?response-content-disposition=attachment%3B%20filename%2A%3DUTF-8%27%27chr22.tar.gz&response-content-type=application%2Fx-gzip&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20220914T204228Z&X-Amz-SignedHeaders=host&X-Amz-Expires=3600&X-Amz-Credential=AKIAIEJ3NV7UYCSRJC7A%2F20220914%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Signature=6be2b2b646fbb584c3a51af135b1558a0cb7d63bbffa6457da4ecde93573b489 534 | Resolving dvn-cloud.s3.amazonaws.com (dvn-cloud.s3.amazonaws.com)... 52.217.171.57 535 | Connecting to dvn-cloud.s3.amazonaws.com (dvn-cloud.s3.amazonaws.com)|52.217.171.57|:443... connected. 536 | HTTP request sent, awaiting response... 
200 OK 537 | Length: 5574054308 (5.2G) [application/x-gzip] 538 | Saving to: ‘6170504’ 539 | 540 | 6170504 100%[================================================================================================================>] 5.19G 2.59MB/s in 32m 38s 541 | 542 | 2022-09-14 17:15:07 (2.72 MB/s) - ‘6170504’ saved [5574054308/5574054308] 543 | 544 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr22_1.csv 545 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr22_1.csv.idx 546 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr22_2.csv 547 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr22_2.csv.idx 548 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr22_3.csv 549 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr22_3.csv.idx 550 | Object of class "SeqVarGDSClass" 551 | File: /Users/zhou/Storage/Research/Projects/Test/All.chr22.27022019.GRCh38.phased.gds (87.4M) 552 | + [ ] * 553 | |--+ description [ ] * 554 | |--+ sample.id { Str8 2548 LZMA_ra(7.84%), 1.6K } * 555 | |--+ variant.id { Int32 1059079 LZMA_ra(6.20%), 256.6K } * 556 | |--+ position { Int32 1059079 LZMA_ra(27.0%), 1.1M } * 557 | |--+ chromosome { Str8 1059079 LZMA_ra(0.02%), 617B } * 558 | |--+ allele { Str8 1059079 LZMA_ra(15.4%), 665.6K } * 559 | |--+ genotype [ ] * 560 | | |--+ data { Bit2 2x2548x1059079 LZMA_ra(1.98%), 25.5M } * 561 | | |--+ extra.index { Int32 3x0 LZMA_ra, 18B } * 562 | | \--+ extra { Int16 0 LZMA_ra, 18B } 563 | |--+ phase [ ] 564 | | |--+ data { Bit1 2548x1059079 LZMA_ra(0.01%), 48.1K } * 565 | | |--+ extra.index { Int32 3x0 LZMA_ra, 18B } * 566 | | \--+ extra { Bit1 0 LZMA_ra, 18B } 567 | |--+ annotation [ ] 568 | | |--+ id { Str8 1059079 LZMA_ra(0.03%), 305B } * 569 | | |--+ qual { Float32 1059079 LZMA_ra(0.02%), 777B } * 570 | | |--+ filter { Int32,factor 1059079 LZMA_ra(0.02%), 777B } * 571 | | |--+ info [ ] 572 | | | |--+ AF { Float32 1059079 LZMA_ra(7.72%), 319.6K } * 573 | | | |--+ AC { Int32 1059079 LZMA_ra(19.0%), 788.0K } * 574 | | | |--+ NS { Int32 1059079 LZMA_ra(0.02%), 777B } * 575 | | | |--+ AN { Int32 1059079 LZMA_ra(0.02%), 777B } * 576 | | | |--+ EAS_AF { Float32 1059079 LZMA_ra(5.73%), 237.2K } * 577 | | | |--+ EUR_AF { Float32 1059079 LZMA_ra(6.18%), 255.7K } * 578 | | | |--+ AFR_AF { Float32 1059079 LZMA_ra(8.56%), 354.1K } * 579 | | | |--+ AMR_AF { Float32 1059079 LZMA_ra(6.70%), 277.2K } * 580 | | | |--+ SAS_AF { Float32 1059079 LZMA_ra(6.45%), 266.8K } * 581 | | | |--+ VT { Str8 1059079 LZMA_ra(2.06%), 88.0K } * 582 | | | |--+ EX_TARGET { Bit1 1059079 LZMA_ra(6.62%), 8.6K } * 583 | | | |--+ DP { Int32 1059079 LZMA_ra(45.0%), 1.8M } * 584 | | | |--+ FunctionalAnnotationJun1st2022 [ tbl_df,tbl,data.frame,list ] * 585 | | | \--+ FunctionalAnnotationAug1st2022 [ spec_tbl_df,tbl_df,tbl,data.frame,list ] * 586 | | | |--+ VarInfo { Str8 1059079 LZMA_ra(15.6%), 2.5M } 587 | | | |--+ apc_conservation { Float64 1059079 LZMA_ra(86.9%), 7.0M } 588 | | | |--+ apc_epigenetics { Float64 1059079 LZMA_ra(86.2%), 7.0M } 589 | | | |--+ apc_epigenetics_active { Float64 1059079 LZMA_ra(79.4%), 6.4M } 590 | | | |--+ apc_epigenetics_repressed { Float64 1059079 LZMA_ra(48.9%), 3.9M } 591 | | | |--+ apc_epigenetics_transcription { Float64 1059079 LZMA_ra(48.5%), 3.9M } 592 | | | |--+ apc_local_nucleotide_diversity { Float64 1059079 LZMA_ra(84.1%), 6.8M } 593 | | | |--+ apc_mappability { Float64 1059079 LZMA_ra(31.4%), 2.5M } 594 | | | |--+ apc_protein_function { Float64 1059079 LZMA_ra(2.58%), 213.5K } 595 | | | |--+ apc_transcription_factor { Float64 1059079 LZMA_ra(8.40%), 
694.7K } 596 | | | |--+ cage_tc { Str8 1059079 LZMA_ra(5.61%), 98.1K } 597 | | | |--+ metasvm_pred { Str8 1059079 LZMA_ra(1.25%), 13.1K } 598 | | | |--+ rsid { Str8 1059079 LZMA_ra(35.6%), 4.1M } 599 | | | |--+ fathmm_xf { Float64 1059079 LZMA_ra(54.4%), 4.4M } 600 | | | |--+ genecode_comprehensive_category { Str8 1059079 LZMA_ra(0.64%), 66.9K } 601 | | | |--+ genecode_comprehensive_info { Str8 1059079 LZMA_ra(5.62%), 1.1M } 602 | | | |--+ genecode_comprehensive_exonic_category { Str8 1059079 LZMA_ra(1.51%), 21.7K } 603 | | | |--+ genecode_comprehensive_exonic_info { Str8 1059079 LZMA_ra(7.62%), 330.8K } 604 | | | |--+ genehancer { Str8 1059079 LZMA_ra(0.24%), 171.9K } 605 | | | |--+ linsight { Float64 1059079 LZMA_ra(25.8%), 2.1M } 606 | | | |--+ cadd_phred { Float64 1059079 LZMA_ra(23.8%), 1.9M } 607 | | | \--+ rdhs { Str8 1059079 LZMA_ra(2.52%), 112.2K } 608 | | \--+ format [ ] 609 | \--+ sample.annotation [ ] 610 | [1] 1 611 | [1] 2 612 | [1] 3 613 | [1] 1 614 | [1] 2 615 | [1] 3 616 | [1] 1059079 22 617 | Warning messages: 618 | 1: In add.gdsn(ans, nm[i], val[[i]], compress = compress, closezip = closezip, : 619 | Missing characters are converted to "". 620 | 2: In add.gdsn(ans, nm[i], val[[i]], compress = compress, closezip = closezip, : 621 | Missing characters are converted to "". 622 | 3: In add.gdsn(ans, nm[i], val[[i]], compress = compress, closezip = closezip, : 623 | Missing characters are converted to "". 624 | 4: In add.gdsn(ans, nm[i], val[[i]], compress = compress, closezip = closezip, : 625 | Missing characters are converted to "". 626 | 5: In add.gdsn(ans, nm[i], val[[i]], compress = compress, closezip = closezip, : 627 | Missing characters are converted to "". 628 | 6: In add.gdsn(ans, nm[i], val[[i]], compress = compress, closezip = closezip, : 629 | Missing characters are converted to "". 630 | 7: In add.gdsn(ans, nm[i], val[[i]], compress = compress, closezip = closezip, : 631 | Missing characters are converted to "". 632 | 8: In add.gdsn(ans, nm[i], val[[i]], compress = compress, closezip = closezip, : 633 | Missing characters are converted to "". 634 | 9: In add.gdsn(ans, nm[i], val[[i]], compress = compress, closezip = closezip, : 635 | Missing characters are converted to "". 636 | [1] "time" 637 | Time difference of 14.08326 mins 638 | 639 | 640 | 641 | 642 | 643 | zhou@M1 Essential % Rscript FAVORannotatorCSVEssentialDB.R All.chr1.27022019.GRCh38.phased.gds 1 644 | [1] "gds.file: All.chr1.27022019.GRCh38.phased.gds" 645 | [1] "chr: 1" 646 | [1] "use_compression: Yes" 647 | --2022-09-14 16:24:39-- https://dataverse.harvard.edu/api/access/datafile/6170506 648 | Resolving dataverse.harvard.edu (dataverse.harvard.edu)... 54.211.138.37, 3.219.100.164, 3.226.192.24 649 | Connecting to dataverse.harvard.edu (dataverse.harvard.edu)|54.211.138.37|:443... connected. 650 | HTTP request sent, awaiting response... 
303 See Other 651 | Location: https://dvn-cloud.s3.amazonaws.com/10.7910/DVN/1VGTJI/17fe5944e75-2c901ebf815d?response-content-disposition=attachment%3B%20filename%2A%3DUTF-8%27%27chr1.tar.gz&response-content-type=application%2Fx-gzip&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20220914T202439Z&X-Amz-SignedHeaders=host&X-Amz-Expires=3600&X-Amz-Credential=AKIAIEJ3NV7UYCSRJC7A%2F20220914%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Signature=52a60668962768b84f03bc5e6f2084ddddb776848455215b7d7cfd9074e02fb8 [following] 652 | --2022-09-14 16:24:39-- https://dvn-cloud.s3.amazonaws.com/10.7910/DVN/1VGTJI/17fe5944e75-2c901ebf815d?response-content-disposition=attachment%3B%20filename%2A%3DUTF-8%27%27chr1.tar.gz&response-content-type=application%2Fx-gzip&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20220914T202439Z&X-Amz-SignedHeaders=host&X-Amz-Expires=3600&X-Amz-Credential=AKIAIEJ3NV7UYCSRJC7A%2F20220914%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Signature=52a60668962768b84f03bc5e6f2084ddddb776848455215b7d7cfd9074e02fb8 653 | Resolving dvn-cloud.s3.amazonaws.com (dvn-cloud.s3.amazonaws.com)... 52.217.133.185 654 | Connecting to dvn-cloud.s3.amazonaws.com (dvn-cloud.s3.amazonaws.com)|52.217.133.185|:443... connected. 655 | HTTP request sent, awaiting response... 200 OK 656 | Length: 33455185130 (31G) [application/x-gzip] 657 | Saving to: ‘6170506’ 658 | 659 | 6170506 100%[================================================================================================================>] 31.16G 5.23MB/s in 2h 2m 660 | 661 | 2022-09-14 18:27:07 (4.34 MB/s) - ‘6170506’ saved [33455185130/33455185130] 662 | 663 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_10.csv 664 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_10.csv.idx 665 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_11.csv 666 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_11.csv.idx 667 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_12.csv 668 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_12.csv.idx 669 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_13.csv 670 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_13.csv.idx 671 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_14.csv 672 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_14.csv.idx 673 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_1.csv 674 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_1.csv.idx 675 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_2.csv 676 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_2.csv.idx 677 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_3.csv 678 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_3.csv.idx 679 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_4.csv 680 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_4.csv.idx 681 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_5.csv 682 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_5.csv.idx 683 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_6.csv 684 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_6.csv.idx 685 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_7.csv 686 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_7.csv.idx 687 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_8.csv 688 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_8.csv.idx 689 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_9.csv 690 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_9.csv.idx 691 | 
Object of class "SeqVarGDSClass" 692 | File: /Users/zhou/Storage/Research/Projects/Test/Essential/All.chr1.27022019.GRCh38.phased.gds (161.0M) 693 | + [ ] * 694 | |--+ description [ ] * 695 | |--+ sample.id { Str8 2548 LZMA_ra(7.84%), 1.6K } * 696 | |--+ variant.id { Int32 6191833 LZMA_ra(2.92%), 706.4K } * 697 | |--+ position { Int32 6191833 LZMA_ra(27.9%), 6.6M } * 698 | |--+ chromosome { Str8 6191833 LZMA_ra(0.02%), 1.9K } * 699 | |--+ allele { Str8 6191833 LZMA_ra(15.7%), 3.9M } * 700 | |--+ genotype [ ] * 701 | | |--+ data { Bit2 2x2548x6191833 LZMA_ra(1.66%), 124.8M } * 702 | | |--+ extra.index { Int32 3x0 LZMA_ra, 18B } * 703 | | \--+ extra { Int16 0 LZMA_ra, 18B } 704 | |--+ phase [ ] 705 | | |--+ data { Bit1 2548x6191833 LZMA_ra(0.01%), 280.4K } * 706 | | |--+ extra.index { Int32 3x0 LZMA_ra, 18B } * 707 | | \--+ extra { Bit1 0 LZMA_ra, 18B } 708 | |--+ annotation [ ] 709 | | |--+ id { Str8 6191833 LZMA_ra(0.02%), 1.0K } * 710 | | |--+ qual { Float32 6191833 LZMA_ra(0.02%), 3.7K } * 711 | | |--+ filter { Int32,factor 6191833 LZMA_ra(0.02%), 3.7K } * 712 | | |--+ info [ ] 713 | | | |--+ AF { Float32 6191833 LZMA_ra(7.39%), 1.7M } * 714 | | | |--+ AC { Int32 6191833 LZMA_ra(18.5%), 4.4M } * 715 | | | |--+ NS { Int32 6191833 LZMA_ra(0.02%), 3.7K } * 716 | | | |--+ AN { Int32 6191833 LZMA_ra(0.02%), 3.7K } * 717 | | | |--+ EAS_AF { Float32 6191833 LZMA_ra(5.54%), 1.3M } * 718 | | | |--+ EUR_AF { Float32 6191833 LZMA_ra(5.90%), 1.4M } * 719 | | | |--+ AFR_AF { Float32 6191833 LZMA_ra(8.20%), 1.9M } * 720 | | | |--+ AMR_AF { Float32 6191833 LZMA_ra(6.52%), 1.5M } * 721 | | | |--+ SAS_AF { Float32 6191833 LZMA_ra(6.17%), 1.5M } * 722 | | | |--+ VT { Str8 6191833 LZMA_ra(2.09%), 522.7K } * 723 | | | |--+ EX_TARGET { Bit1 6191833 LZMA_ra(4.82%), 36.4K } * 724 | | | \--+ DP { Int32 6191833 LZMA_ra(44.0%), 10.4M } * 725 | | \--+ format [ ] 726 | \--+ sample.annotation [ ] 727 | [1] 1 728 | [1] 2 729 | [1] 3 730 | [1] 4 731 | [1] 5 732 | [1] 6 733 | [1] 7 734 | [1] 8 735 | [1] 9 736 | [1] 10 737 | [1] 11 738 | [1] 12 739 | [1] 13 740 | [1] 14 741 | [1] 1 742 | 743 | [1] 2 744 | [1] 3 745 | [1] 4 746 | [1] 5 747 | [1] 6 748 | [1] 7 749 | [1] 8 750 | [1] 9 751 | [1] 10 752 | [1] 11 753 | [1] 12 754 | [1] 13 755 | [1] 14 756 | [1] 6191833 22 757 | Warning messages: 758 | 1: In add.gdsn(ans, nm[i], val[[i]], compress = compress, closezip = closezip, : 759 | Missing characters are converted to "". 760 | 2: In add.gdsn(ans, nm[i], val[[i]], compress = compress, closezip = closezip, : 761 | Missing characters are converted to "". 762 | 3: In add.gdsn(ans, nm[i], val[[i]], compress = compress, closezip = closezip, : 763 | Missing characters are converted to "". 764 | 4: In add.gdsn(ans, nm[i], val[[i]], compress = compress, closezip = closezip, : 765 | Missing characters are converted to "". 766 | 5: In add.gdsn(ans, nm[i], val[[i]], compress = compress, closezip = closezip, : 767 | Missing characters are converted to "". 768 | 6: In add.gdsn(ans, nm[i], val[[i]], compress = compress, closezip = closezip, : 769 | Missing characters are converted to "". 770 | 7: In add.gdsn(ans, nm[i], val[[i]], compress = compress, closezip = closezip, : 771 | Missing characters are converted to "". 772 | 8: In add.gdsn(ans, nm[i], val[[i]], compress = compress, closezip = closezip, : 773 | Missing characters are converted to "". 774 | 9: In add.gdsn(ans, nm[i], val[[i]], compress = compress, closezip = closezip, : 775 | Missing characters are converted to "". 
776 | Object of class "SeqVarGDSClass" 777 | File: /Users/zhou/Storage/Research/Projects/Test/Essential/All.chr1.27022019.GRCh38.phased.gds (485.6M) 778 | + [ ] * 779 | |--+ description [ ] * 780 | |--+ sample.id { Str8 2548 LZMA_ra(7.84%), 1.6K } * 781 | |--+ variant.id { Int32 6191833 LZMA_ra(2.92%), 706.4K } * 782 | |--+ position { Int32 6191833 LZMA_ra(27.9%), 6.6M } * 783 | |--+ chromosome { Str8 6191833 LZMA_ra(0.02%), 1.9K } * 784 | |--+ allele { Str8 6191833 LZMA_ra(15.7%), 3.9M } * 785 | |--+ genotype [ ] * 786 | | |--+ data { Bit2 2x2548x6191833 LZMA_ra(1.66%), 124.8M } * 787 | | |--+ extra.index { Int32 3x0 LZMA_ra, 18B } * 788 | | \--+ extra { Int16 0 LZMA_ra, 18B } 789 | |--+ phase [ ] 790 | | |--+ data { Bit1 2548x6191833 LZMA_ra(0.01%), 280.4K } * 791 | | |--+ extra.index { Int32 3x0 LZMA_ra, 18B } * 792 | | \--+ extra { Bit1 0 LZMA_ra, 18B } 793 | |--+ annotation [ ] 794 | | |--+ id { Str8 6191833 LZMA_ra(0.02%), 1.0K } * 795 | | |--+ qual { Float32 6191833 LZMA_ra(0.02%), 3.7K } * 796 | | |--+ filter { Int32,factor 6191833 LZMA_ra(0.02%), 3.7K } * 797 | | |--+ info [ ] 798 | | | |--+ AF { Float32 6191833 LZMA_ra(7.39%), 1.7M } * 799 | | | |--+ AC { Int32 6191833 LZMA_ra(18.5%), 4.4M } * 800 | | | |--+ NS { Int32 6191833 LZMA_ra(0.02%), 3.7K } * 801 | | | |--+ AN { Int32 6191833 LZMA_ra(0.02%), 3.7K } * 802 | | | |--+ EAS_AF { Float32 6191833 LZMA_ra(5.54%), 1.3M } * 803 | | | |--+ EUR_AF { Float32 6191833 LZMA_ra(5.90%), 1.4M } * 804 | | | |--+ AFR_AF { Float32 6191833 LZMA_ra(8.20%), 1.9M } * 805 | | | |--+ AMR_AF { Float32 6191833 LZMA_ra(6.52%), 1.5M } * 806 | | | |--+ SAS_AF { Float32 6191833 LZMA_ra(6.17%), 1.5M } * 807 | | | |--+ VT { Str8 6191833 LZMA_ra(2.09%), 522.7K } * 808 | | | |--+ EX_TARGET { Bit1 6191833 LZMA_ra(4.82%), 36.4K } * 809 | | | |--+ DP { Int32 6191833 LZMA_ra(44.0%), 10.4M } * 810 | | | \--+ FunctionalAnnotationJun1st2022 [ spec_tbl_df,tbl_df,tbl,data.frame,list ] * 811 | | | |--+ VarInfo { Str8 6191833 LZMA_ra(16.4%), 15.2M } 812 | | | |--+ apc_conservation { Float64 6191833 LZMA_ra(86.9%), 41.1M } 813 | | | |--+ apc_epigenetics { Float64 6191833 LZMA_ra(86.4%), 40.8M } 814 | | | |--+ apc_epigenetics_active { Float64 6191833 LZMA_ra(80.7%), 38.1M } 815 | | | |--+ apc_epigenetics_repressed { Float64 6191833 LZMA_ra(52.7%), 24.9M } 816 | | | |--+ apc_epigenetics_transcription { Float64 6191833 LZMA_ra(48.1%), 22.7M } 817 | | | |--+ apc_local_nucleotide_diversity { Float64 6191833 LZMA_ra(83.6%), 39.5M } 818 | | | |--+ apc_mappability { Float64 6191833 LZMA_ra(29.0%), 13.7M } 819 | | | |--+ apc_protein_function { Float64 6191833 LZMA_ra(2.17%), 1.0M } 820 | | | |--+ apc_transcription_factor { Float64 6191833 LZMA_ra(7.31%), 3.5M } 821 | | | |--+ cage_tc { Str8 6191833 LZMA_ra(4.99%), 451.4K } 822 | | | |--+ metasvm_pred { Str8 6191833 LZMA_ra(0.94%), 57.5K } 823 | | | |--+ rsid { Str8 6191833 LZMA_ra(35.7%), 24.2M } 824 | | | |--+ fathmm_xf { Float64 6191833 LZMA_ra(57.2%), 27.0M } 825 | | | |--+ genecode_comprehensive_category { Str8 6191833 LZMA_ra(0.57%), 360.1K } 826 | | | |--+ genecode_comprehensive_info { Str8 6191833 LZMA_ra(5.77%), 7.2M } 827 | | | |--+ genecode_comprehensive_exonic_category { Str8 6191833 LZMA_ra(1.17%), 89.4K } 828 | | | |--+ genecode_comprehensive_exonic_info { Str8 6191833 LZMA_ra(7.27%), 1.3M } 829 | | | |--+ genehancer { Str8 6191833 LZMA_ra(0.27%), 707.1K } 830 | | | |--+ linsight { Float64 6191833 LZMA_ra(22.9%), 10.8M } 831 | | | |--+ cadd_phred { Float64 6191833 LZMA_ra(23.8%), 11.2M } 832 | | | \--+ rdhs { Str8 
6191833 LZMA_ra(2.76%), 622.1K } 833 | | \--+ format [ ] 834 | \--+ sample.annotation [ ] 835 | 836 | -------------------------------------------------------------------------------- /Data/TestData/Input/._FAVOR.T2210k.gds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Data/TestData/Input/._FAVOR.T2210k.gds -------------------------------------------------------------------------------- /Data/TestData/Input/FAVOR.T2210k.gds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Data/TestData/Input/FAVOR.T2210k.gds -------------------------------------------------------------------------------- /Docs/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Docs/.DS_Store -------------------------------------------------------------------------------- /Docs/Tutorial/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Docs/Tutorial/.DS_Store -------------------------------------------------------------------------------- /Docs/Tutorial/Demos/FASRC.md: -------------------------------------------------------------------------------- 1 | # **Step-by-step tutorial of running FAVORannotator on FASRC Slurm Cluster** 2 | 3 | ## 4 | 5 | ## FAVORannotator runs smoothly on the FASRC Slurm cluster. Like many other Slurm clusters, FASRC has PostgreSQL installed, and PostgreSQL instances can be booted up quickly on different nodes, which vastly boosts performance and enables parallel computing. 6 | 7 | ## 1. Download the FAVORannotator data file from the FAVOR website: [http://favor.genohub.org](http://favor.genohub.org/). 8 | 9 | ## 2. Download the FAVORannotator data file, either the **whole genome** version (download [URL](https://drive.google.com/file/d/1izzKJliuouG2pCJ6MkcXd_oxoEwzx5RQ/view?usp=sharing)) or the **by chromosome** version (download [URL](https://drive.google.com/file/d/1Ccep9hmeWpIT_OH9IqS6p1MZbEonjG2z/view?usp=sharing)), or get it from the FAVOR website: [http://favor.genohub.org](http://favor.genohub.org/) 10 | ## 3. Set up the database on the Slurm cluster. 11 | 12 | ## 4. Install the FASRC VPN ([https://docs.rc.fas.harvard.edu/kb/vpn-setup/](https://docs.rc.fas.harvard.edu/kb/vpn-setup/)). To connect to the VPN, the Cisco AnyConnect client can be installed from the VPN portal ([https://downloads.rc.fas.harvard.edu](https://downloads.rc.fas.harvard.edu)). Note that you need to add @fasrc after your username in order to log in. 13 | 14 | 15 | ## 5. Once the VPN is connected, access the FASRC VDI ([https://docs.rc.fas.harvard.edu/kb/virtual-desktop/](https://docs.rc.fas.harvard.edu/kb/virtual-desktop/)). The following figure shows what the VDI interface looks like. 16 | 17 | ![VDI Interface](https://github.com/zhouhufeng/FAVORannotator/blob/main/Docs/Tutorial/Figures/FASRC1.jpg) 18 | 19 | _Figure 1. VDI Interface._ 20 | 21 | ## 7. Create a folder on FASRC where you would like to store the database ($ _mkdir /Directory/FAVORannotatorDataBase/_) 22 | 23 | ## 8. Then create a database server: 1. Click “My Interactive Sessions” at the top. 2. Click “Postgresql db” on the left. 3. 
Configure the server. 24 | ![My Interactive Sessions of postgreSQL](https://github.com/zhouhufeng/FAVORannotator/blob/main/Docs/Tutorial/Figures/postgreSQLdb.png) 25 | 26 | _Figure 2. My Interactive Sessions of postgreSQL._ 27 | 28 | ## 9. The configuration of the postgreSQL database server is shown in the following figure. In the example shown in Figure 3, we enter the folder directory in which we want postgreSQL to store the database, as well as the database name. 29 | ![postgreSQL configuration on VDI](https://github.com/zhouhufeng/FAVORannotator/blob/main/Docs/Tutorial/Figures/createDBinstance.png) 30 | 31 | _Figure 3. postgreSQL configuration on VDI._ 32 | 33 | ## 10. The postgreSQL database server is up and running a few minutes after its creation, as shown in the following figure. On that page you will find the assigned **host name** and **port number**. This information is needed for the FAVORannotator R program to find the database instance. In the example shown in Figure 4, the host name is holy7c04301 and the port number is 9011. 34 | 35 | ![Active running postgreSQL database](https://github.com/zhouhufeng/FAVORannotator/blob/main/Docs/Tutorial/Figures/runningInstance.png) 36 | 37 | _Figure 4. Active running postgreSQL database._ 38 | 39 | ## 11. Having configured and booted up postgreSQL through the VDI, we now know the following information about the running backend database: the DBName from step 9, the Host and Port from step 10, and the User and Password, which are your FASRC user name and password. This information is entered in the config.R file. 40 | 41 | 42 | ## 12. Once the config.R file is updated, FAVORannotator is ready to run if the database has already been imported. If this is the first time the database instance is booted up, import the database with the following commands. 43 | 44 | 45 | 46 | ## **Import Database into PostgreSQL and Run FAVORannotator** 47 | 48 | Once the PostgreSQL database is booted up and running, the backend database can be imported and FAVORannotator can be executed as follows. 49 | 50 | ### 1. Once the server is running, set up the database: 51 | 52 | 1) Load the postgres module 53 | 54 | i. On FASRC, the command is: _module load postgresql/12.2-fasrc01_ 55 | 56 | 2) Log into the database: psql -h hostname -p port -d databasename 57 | 58 | ii. e.g.: _psql -h holy2c14409 -p 8462 -d favor_ 59 | 60 | 3) Create the table 61 | 62 | iii. 
_CREATE TABLE MAIN( 63 | variant_vcf text, 64 | chromosome text, 65 | position integer, 66 | ref_vcf text, 67 | alt_vcf text, 68 | apc_conservation numeric, 69 | apc_conservation_v2 numeric, 70 | apc_epigenetics numeric, 71 | apc_epigenetics_active numeric, 72 | apc_epigenetics_repressed numeric, 73 | apc_epigenetics_transcription numeric, 74 | apc_local_nucleotide_diversity numeric, 75 | apc_local_nucleotide_diversity_v2 numeric, 76 | apc_local_nucleotide_diversity_v3 numeric, 77 | apc_mappability numeric, 78 | apc_micro_rna numeric, 79 | apc_mutation_density numeric, 80 | apc_protein_function numeric, 81 | apc_proximity_to_coding numeric, 82 | apc_proximity_to_coding_v2 numeric, 83 | apc_proximity_to_tsstes numeric, 84 | apc_transcription_factor numeric, 85 | cadd_phred numeric, 86 | cage text, 87 | fathmm_xf numeric, 88 | genecode_comprehensive_category text, 89 | genecode_comprehensive_info text, 90 | genecode_comprehensive_exonic_info text, 91 | genecode_comprehensive_exonic_category text, 92 | genehancer text, 93 | linsight numeric, 94 | metasvm_pred text, 95 | rdhs text, 96 | rsid text);_ 97 | 98 | iv. Load the data: _COPY main FROM '/path/to/offlineData.csv' CSV HEADER;_ (replace the path with the location of the downloaded CSV file). This command can take several hours to complete, up to a day. 99 | 100 | v. Create the index: _CREATE INDEX ON main USING HASH(variant\_vcf);_ This command can take several hours to complete, up to a day. 101 | 102 | vi. Create the view: _CREATE VIEW offline\_view AS SELECT \* FROM main_; 103 | 104 | ### 2. The PostgreSQL instance hosting the FAVORannotator backend database is now up and running, listening for queries from the FAVORannotator R program. 105 | ### 3. Update the config.R file with the PostgreSQL instance information (database name, port, host, user, password): 106 | 107 | • USER_G <- 'userID' 108 | • PASSWORD_G <- 'secretPassWord' 109 | • vcf.fn<-"/n/location/input.vcf" 110 | • gds.fn<-"/n/location/output.gds" 111 | • DBNAME_G <- 'favor' 112 | • HOST_G <- 'holy2c14409' 113 | • PORT_G <- 8462 114 | 115 | ### 4. First, create a GDS file from the input VCF file: 116 | • $ Rscript convertVCFtoGDS.r 117 | 118 | ### 5. Now FAVORannotator is ready to run using the following command: 119 | • $ Rscript FAVORannotatorGDS.r 120 | 121 | ### If using the FAVORannotator by chromosome version, import the database in the same way and run FAVORannotator exactly as above. The only difference is that config.R contains the instance information for all 22 chromosomes (VCF file, GDS file, database name, port, host, user, password). For many clusters, we also provide a submission script (submitJobs.sh) for submitting all 22 jobs to the cluster at the same time. For the by chromosome version, the R scripts take the chromosome number as an argument, and the commands above become the following. 122 | • $ Rscript convertVCFtoGDS.r 22 123 | • $ Rscript FAVORannotatorGDS.r 22 124 | ### To simplify the parallel computing process, we also provide an example submission script here ([submission.sh](https://github.com/zhouhufeng/FAVORannotator/blob/main/Scripts/ByChromosome/submitJobs.sh)). 125 | 126 | ## If you are interested in learning more about how to run FAVORannotator on the FASRC Slurm cluster, we have also prepared a recorded live demonstration here.
127 | [![Recorded Live Demo](https://github.com/zhouhufeng/FAVORannotator/blob/main/Docs/Tutorial/Figures/LiveDemo.png)](https://youtu.be/_FRQLsFY4qI) 128 | -------------------------------------------------------------------------------- /Docs/Tutorial/Demos/UKBB200KWESpreprocessVCF.md: -------------------------------------------------------------------------------- 1 | # **Step-by-step tutorial of turning raw VCFs into well-organized aGDS files** 2 | This is a tutorial for (1) preprocessing VCFs (2) generate GDS file from the preprocessed VCFs, with high priority in computing performance and speed. 3 | 4 | 5 | ### Preprocessing file using BCFtools. 6 | #### Prerequisites: 7 | **BCFTools** (version 1.16) Please install the **BCFTools**. 8 | 9 | #### Step 0: Check up for errors and inconsistencies. 10 | The following steps are important for the successful execution of BCFTools, the raw VCF files needs strictly follows the VCF format standard v4.2. 11 | 1. Fixed Headers [make sure all fields are defined in header].  12 | 2. Remove Duplicated VCFs [Make sure there is no duplicated VCF files]. Otherwise duplicated entries will cause issues for the following steps. 13 | 14 | Note: Most of the raw VCFs has issues with the header files that needs to be fixed, without this step BCFTools will not be able to process these VCF files. 15 | 16 | #### Step 1: Remove other FORMAT variables but only keep GT [multi-core]. 17 | ##### Script: 18 | - ```$ bcftools annotate -x ^FORMAT/GT ukb23156_c19_c12.vcf.gz -Oz -o ./CVCF/ukb23156_c19_c12.vcf.gz ``` 19 | ##### Input: All the raw VCF files ** ukb23156_c19_b0_v1.vcf.gz, ukb23156_c19_b2_v1.vcf.gz,..., ukb23156_c19_b64_v1.vcf.gz** 20 | ##### Output: The cleaned VCF files in the folder **./CVCF/** that within which has the same file name, but the FORMAT fields only contain GT. 21 | 22 | Note: This is computationally intensive, each smaller file is one multi-core instance, and multiple instances can be run in parallel to speed up the process. 23 | 24 | Finish 65 VCF processing in 12 core parallel, 130 mins 25 | 26 | Finish 65 VCF processing in 32 core parallel,49 mins 27 | 28 | #### Step 2: Break the multi-allelic sites into multiple rows of all the VCFs of each study. 29 | ##### Script: 30 | - ```$ bcftools norm -m -any ./ConcatVCF/ukb23156_c19_c12.vcf.gz -Oz -o ./ConcatVCF/ukb23156_c19_c12.bk.vcf.gz``` 31 | ##### Input: The VCF file contains multi-allelic sites ** ukb23156_c19_c12.vcf.gz ** 32 | ##### Output: The VCF file has multi-allelic sites break into multiple lines ** ukb23156_c19_c12.bk.vcf.gz**. 33 | 34 | Note: multi-allelic sites cause issues for the following analysis, we usually break them into multiple rows in the preprocessing steps. 35 | 36 | Finish 65 VCF processing in 12 core parallel, 33 mins 37 | 38 | Finish 65 VCF processing in 32 core parallel, 12 mins 39 | 40 | #### Step 3: Concat the smaller VCFs (sliced by variants) within each study into one VCF file. [Benchmark in UKBB 200k WES 24 mins] 41 | ##### Script: 42 | - ```$ bcftools concat --threads 12 ./CVCF/ukb23156_c19_b*_v1.vcf.gz -Oz -o ./ConcatVCF/ukb23156_c19_c12.vcf.gz``` 43 | ##### Input: All the cleaned VCF files in the folder **./CVCF/** with the VCF files name: ** ukb23156_c19_b0_v1.vcf.gz, ukb23156_c19_b2_v1.vcf.gz,..., ukb23156_c19_b64_v1.vcf.gz** 44 | ##### Output: The concatenated VCF files in the folder **./ConcatVCF/ukb23156_c19_c12.vcf.gz** that is the results of all the input VCFs concatenated by rows into one big VCF that has the same columns. 
45 | 46 | Note: This is computationally intensive, multi-core function enabled to speed up the process. Concat is only for VCFs has same samples [columns] just need to concat the variants [rows], if VCF is sliced by samples, you should refer to the following steps using the merge function. 47 | 48 | Finish concat 65 VCF processing in 12 core parallel,36 mins 49 | 50 | Finish concat 65 VCF processing in 32 core parallel,24 mins 51 | 52 | 53 | 54 | #### Step 4: Convert the merged VCFs per chromosomes into GDSs (per chromosome) [Benchmarked using UKBB 200k WES chr19 VCF takes 72 mins]. 55 | ##### Script: 56 | - ```$ Rscripts ./convertVCFtoGDS.r ./MergedVCF/ukbb.merged.bk.nm.vcf.gz ./MergedGDS/ukbb.merged.bk.nm.gds``` 57 | Script: **convertVCFtoGDS.r** 58 | ##### Input: The preprocessed VCF file,**ukbb.merged.bk.nm.vcf.gz**. 59 | ##### Output: The generated GDS file **ukbb.merged.bk.nm.gds**. 60 | 61 | Note: This is computationally intensive multi-core option enabled, by default it is 12 core,parallel=10, users can modify based on computing platforms. Since this multi-core convertVCFtoGDS.r involes 3 steps: (1)count variants, (2)generate smaller GDS intermediate files, (3)merge intermediate files into one GDS. Only Step (2) is running in parallel R sessions. Therefore, small VCF file (<10GB) will not see significant computing time reduce with too many cores(>10 cores). We recommend, small VCF file (<10GB) parallel=6, medium VCF file (10GB~50GB) parallel=12,large VCF file (>50GB) parallel=32. 62 | 63 | Finish VCF to GDS processing in 12 core parallel, 90 mins 64 | 65 | Finish VCF to GDS processing in 32 core parallel, 72 mins 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | #### Step additional: Index VCFs. 74 | ##### Script: 75 | - ```$ bcftools index ./ConcatVCF/ukb23156_c19_c12.bk.nm.vcf.gz``` 76 | ##### Input: The VCF file needs index ** ukb23156_c19_c12.bk.nm.vcf.gz ** 77 | ##### Output: The VCF file index file ** ukb23156_c19_c12.bk.nm.vcf.gz.csi ** 78 | 79 | Note: Many processes needs indexed VCFs, e.g. view range, merge, etc. 80 | 81 | #### Step additional: Normalize (left) the broken multi-allelic VCFs. 82 | ##### Script: 83 | - ```$ bcftools norm -f --threads 12 hg38.p13.fa ./ConcatVCF/ukb23156_c19_c12.bk.vcf.gz -Oz -o ./ConcatVCF/ukb23156_c19_c12.bk.nm.vcf.gz``` 84 | ##### Input: The VCF file has multi-allelic sites break into multiple lines ** ukb23156_c19_c12.bk.vcf.gz**, and the reference genome fasta file **hg38.p13.fa**. 85 | ##### Output: The left-normalized VCF file has multi-allelic sites break into multiple lines ** ukb23156_c19_c12.bk.nm.vcf.gz**. 86 | 87 | Note: left-normalization is critical for the indels to have the correct and most commonly accepted formats and representation. 88 | 89 | 90 | -------------------------------------------------------------------------------- /Docs/Tutorial/Demos/preprocessVCF.md: -------------------------------------------------------------------------------- 1 | # **Step-by-step tutorial of turning raw VCFs into well-organized aGDS files** 2 | This is a tutorial for (1) preprocessing VCFs (2) generate GDS file from the preprocessed VCFs, with high priority in computing performance and speed. 3 | 4 | 5 | ### Preprocessing file using BCFtools. 6 | #### Prerequisites: 7 | **BCFTools** (version 1.16) Please install the **BCFTools**. 8 | 9 | #### Step 0: Check up for errors and inconsistencies. 10 | The following steps are important for the successful execution of BCFTools, the raw VCF files needs strictly follows the VCF format standard v4.2. 11 | 1. 
Fixed Headers [make sure all fields are defined in the header]. 12 | 2. Remove Duplicated VCFs [make sure there are no duplicated VCF files]. Otherwise, duplicated entries will cause issues in the following steps. 13 | 14 | Note: Most raw VCFs have header issues that need to be fixed; without this step, BCFTools will not be able to process these VCF files. 15 | 16 | #### Step 1: Remove all FORMAT fields except GT [multi-core]. 17 | ##### Script: 18 | - ```$ for fl in ukb23156_c19_b*_v1.vcf.gz; do bcftools annotate -x ^FORMAT/GT $fl --threads 12 -Oz -o ./CVCF/$fl & done``` 19 | ##### Input: All the raw VCF files **ukb23156_c19_b0_v1.vcf.gz, ukb23156_c19_b2_v1.vcf.gz, ..., ukb23156_c19_b64_v1.vcf.gz** 20 | ##### Output: The cleaned VCF files in the folder **./CVCF/**, with the same file names but with GT as the only FORMAT field. 21 | 22 | Note: This is computationally intensive; each smaller file is one multi-core instance, and multiple instances can be run in parallel to speed up the process. 23 | 24 | #### Step 2: Concatenate the smaller VCFs (sliced by variants) within each study into one VCF file. [Benchmark in UKBB 200k WES: 24 mins] 25 | ##### Script: 26 | - ```$ bcftools concat --threads 12 ./CVCF/ukb23156_c19_b*_v1.vcf.gz -Oz -o ./ConcatVCF/ukb23156_c19_c12.vcf.gz``` 27 | ##### Input: All the cleaned VCF files in the folder **./CVCF/**, named **ukb23156_c19_b0_v1.vcf.gz, ukb23156_c19_b2_v1.vcf.gz, ..., ukb23156_c19_b64_v1.vcf.gz** 28 | ##### Output: The concatenated VCF file **./ConcatVCF/ukb23156_c19_c12.vcf.gz**, which is the result of concatenating all the input VCFs by rows into one big VCF with the same columns. 29 | 30 | Note: This is computationally intensive; the multi-core option is enabled to speed up the process. Concat is only for VCFs that share the same samples [columns] and just need their variants [rows] concatenated; if the VCF is sliced by samples, refer to the later step using the merge function. 31 | 32 | #### Step 3: Break the multi-allelic sites into multiple rows in all the VCFs of each study. 33 | ##### Script: 34 | - ```$ bcftools norm -m -any --threads 12 ./ConcatVCF/ukb23156_c19_c12.vcf.gz -Oz -o ./ConcatVCF/ukb23156_c19_c12.bk.vcf.gz``` 35 | ##### Input: The VCF file containing multi-allelic sites **ukb23156_c19_c12.vcf.gz** 36 | ##### Output: The VCF file with multi-allelic sites broken into multiple lines **ukb23156_c19_c12.bk.vcf.gz**. 37 | 38 | Note: Multi-allelic sites cause issues in downstream analysis, so we usually break them into multiple rows during preprocessing. 39 | 40 | #### Step 4: Left-normalize the broken multi-allelic VCFs. 41 | ##### Script: 42 | - ```$ bcftools norm -f hg38.p13.fa --threads 12 ./ConcatVCF/ukb23156_c19_c12.bk.vcf.gz -Oz -o ./ConcatVCF/ukb23156_c19_c12.bk.nm.vcf.gz``` 43 | ##### Input: The VCF file with multi-allelic sites broken into multiple lines **ukb23156_c19_c12.bk.vcf.gz**, and the reference genome FASTA file **hg38.p13.fa**. 44 | ##### Output: The left-normalized VCF file with multi-allelic sites broken into multiple lines **ukb23156_c19_c12.bk.nm.vcf.gz**. 45 | 46 | Note: Left-normalization is critical for indels to have the correct and most commonly accepted format and representation. 47 | 48 | #### Step 5: Index VCFs.
49 | ##### Script: 50 | - ```$ bcftools index ./ConcatVCF/ukb23156_c19_c12.bk.nm.vcf.gz``` 51 | ##### Input: The VCF file to be indexed **ukb23156_c19_c12.bk.nm.vcf.gz** 52 | ##### Output: The VCF index file **ukb23156_c19_c12.bk.nm.vcf.gz.csi** 53 | 54 | Note: Many operations need indexed VCFs, e.g. viewing a range, merging, etc. 55 | 56 | #### Step 6: Slice the normalized VCFs by chromosome [if needed]. 57 | ##### Script: 58 | - ```$ bcftools view -r chr19 ./ConcatVCF/ukb23156_c12.bk.nm.vcf.gz -Oz -o ./ConcatVCF/ukb23156_c19_c12.bk.nm.vcf.gz``` 59 | ##### Input: The VCF file containing all chromosomes **ukb23156_c12.bk.nm.vcf.gz** 60 | ##### Output: The VCF file containing only chr19 (the chosen chromosome or range) **ukb23156_c19_c12.bk.nm.vcf.gz**. 61 | 62 | Note: Sliced VCFs have many computing performance advantages. 63 | 64 | #### Step 7: Merge the normalized VCFs (sliced by different samples) of each study into one VCF (per chromosome). 65 | ##### Script: 66 | - ```$ bcftools merge -m all --threads 6 ./DifferentStudies/ukbb*.bk.nm.vcf.gz -Oz -o ./MergedVCF/ukbb.merged.bk.nm.vcf.gz``` 67 | ##### Input: VCF files that have the same set of variants (rows) but different samples (columns) **/DifferentStudies/ukbb*.bk.nm.vcf.gz** 68 | ##### Output: One big VCF file with the same set of variants (rows) and all the samples (columns) **./MergedVCF/ukbb.merged.bk.nm.vcf.gz**. 69 | 70 | Note: This is computationally intensive; the multi-core option is enabled. The merge function is only for VCFs with the same set of variants (rows), merging different samples (columns) together. 71 | 72 | #### Step 8: Convert the merged VCFs into GDS files (per chromosome) [Benchmark: the UKBB 200k WES chr19 VCF takes 72 mins]. 73 | ##### Script: 74 | - ```$ Rscript ./convertVCFtoGDS.r ./MergedVCF/ukbb.merged.bk.nm.vcf.gz ./MergedGDS/ukbb.merged.bk.nm.gds``` 75 | Script: **convertVCFtoGDS.r** 76 | ##### Input: The preprocessed VCF file, **ukbb.merged.bk.nm.vcf.gz**. 77 | ##### Output: The generated GDS file **ukbb.merged.bk.nm.gds**. 78 | 79 | Note: This is computationally intensive; the multi-core option is enabled, and by default it uses 12 cores (parallel=10); users can modify this based on their computing platform. This multi-core convertVCFtoGDS.r involves 3 steps: (1) count variants, (2) generate smaller intermediate GDS files, (3) merge the intermediate files into one GDS. Only step (2) runs in parallel R sessions, so a small VCF file (<10GB) will not see a significant reduction in computing time from using many cores (>10). We recommend parallel=6 for small VCF files (<10GB), parallel=12 for medium VCF files (10GB~50GB), and parallel=32 for large VCF files (>50GB). A combined shell sketch chaining the preprocessing steps above is shown below.
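For convenience, the per-step commands above can be chained into a single shell script per chromosome. The sketch below is illustrative only: it assumes the example file names, thread counts, and the hg38.p13.fa reference used in the steps above, and it is not a script shipped with FAVORannotator; adapt the paths and settings to your own data layout.

```bash
#!/bin/bash
# Illustrative sketch: chains Steps 2-5 and Step 8 above for one chromosome.
# File names, thread counts, and the reference FASTA are example placeholders.
set -euo pipefail

THREADS=12
REF=hg38.p13.fa            # reference FASTA used for left-normalization (Step 4)
PREFIX=ukb23156_c19_c12    # example per-chromosome file prefix from the steps above

mkdir -p ./ConcatVCF ./MergedGDS

# Step 2: concatenate per-batch VCFs (same samples, different variant slices)
bcftools concat --threads "$THREADS" ./CVCF/ukb23156_c19_b*_v1.vcf.gz -Oz -o ./ConcatVCF/"$PREFIX".vcf.gz

# Step 3: split multi-allelic sites into separate rows
bcftools norm -m -any --threads "$THREADS" ./ConcatVCF/"$PREFIX".vcf.gz -Oz -o ./ConcatVCF/"$PREFIX".bk.vcf.gz

# Step 4: left-normalize indels against the reference genome
bcftools norm -f "$REF" --threads "$THREADS" ./ConcatVCF/"$PREFIX".bk.vcf.gz -Oz -o ./ConcatVCF/"$PREFIX".bk.nm.vcf.gz

# Step 5: index the normalized VCF
bcftools index ./ConcatVCF/"$PREFIX".bk.nm.vcf.gz

# Step 8: convert the final VCF into a GDS file
Rscript ./convertVCFtoGDS.r ./ConcatVCF/"$PREFIX".bk.nm.vcf.gz ./MergedGDS/"$PREFIX".gds
```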
81 | 82 | -------------------------------------------------------------------------------- /Docs/Tutorial/Detailed-Explanation/FAVORFullDB.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Docs/Tutorial/Detailed-Explanation/FAVORFullDB.xlsx -------------------------------------------------------------------------------- /Docs/Tutorial/Figures/FASRC1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Docs/Tutorial/Figures/FASRC1.jpg -------------------------------------------------------------------------------- /Docs/Tutorial/Figures/FAVORannotatorOnTerra.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Docs/Tutorial/Figures/FAVORannotatorOnTerra.png -------------------------------------------------------------------------------- /Docs/Tutorial/Figures/Figure2A.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Docs/Tutorial/Figures/Figure2A.png -------------------------------------------------------------------------------- /Docs/Tutorial/Figures/Figure2B.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Docs/Tutorial/Figures/Figure2B.png -------------------------------------------------------------------------------- /Docs/Tutorial/Figures/Figure2C.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Docs/Tutorial/Figures/Figure2C.png -------------------------------------------------------------------------------- /Docs/Tutorial/Figures/HarvardDataVerse.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Docs/Tutorial/Figures/HarvardDataVerse.png -------------------------------------------------------------------------------- /Docs/Tutorial/Figures/LiveDemo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Docs/Tutorial/Figures/LiveDemo.png -------------------------------------------------------------------------------- /Docs/Tutorial/Figures/createDBinstance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Docs/Tutorial/Figures/createDBinstance.png -------------------------------------------------------------------------------- /Docs/Tutorial/Figures/figure1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Docs/Tutorial/Figures/figure1.png -------------------------------------------------------------------------------- 
/Docs/Tutorial/Figures/figure4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Docs/Tutorial/Figures/figure4.png -------------------------------------------------------------------------------- /Docs/Tutorial/Figures/postgreSQLdb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Docs/Tutorial/Figures/postgreSQLdb.png -------------------------------------------------------------------------------- /Docs/Tutorial/Figures/runningInstance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Docs/Tutorial/Figures/runningInstance.png -------------------------------------------------------------------------------- /Docs/Tutorial/Figures/versions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Docs/Tutorial/Figures/versions.png -------------------------------------------------------------------------------- /Docs/Tutorial/Figures/versions1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Docs/Tutorial/Figures/versions1.png -------------------------------------------------------------------------------- /Docs/Tutorial/Tables/table 1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Docs/Tutorial/Tables/table 1.png -------------------------------------------------------------------------------- /Docs/Tutorial/Tables/table1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Docs/Tutorial/Tables/table1.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0) 2 | 3 | # **FAVORannotator** 4 | FAVORannotator is an R program for performing functional annotation of any genetic study (e.g. Whole-Genome/Whole-Exome Sequencing/Genome-Wide Association Studies) using the [FAVOR backend database](https://favor.genohub.org) to create an annotated Genomic Data Structure (aGDS) file by storing the genotype data (in VCF or GDS format) and their functional annotation data in an all-in-one file. 5 | 6 | **For generating GDS/aGDS from raw VCF files, please refer to the detailed tutorial [here](https://github.com/zhouhufeng/FAVORannotator/blob/main/Docs/Tutorial/Demos/preprocessVCF.md).** 7 | 8 | ## 1.Introduction 9 | 10 | FAVORannotator is an open-source pipeline for functionally annotating and efficiently storing the genotype and variant functional annotation data of any genetic study (e.g. GWAS/WES/WGS). Functional annotation data is stored alongside with genotype data in an all-in-one aGDS file, through using the FAVORannotator. 
It then facilitates a wide range of functionally-informed downstream analyses (Figure 1). 11 | 12 | FAVORannotator first converts a genotype VCF input file to a GDS file, searches the variants in the GDS file against the FAVOR database for their functional annotations, and then integrates these annotations into the GDS file to create an aGDS file. This aGDS file allows both genotype and functional annotation data to be stored in a single unified file (Figure 1). Furthermore, FAVORannotator can be conveniently integrated into [STAARpipeline](https://github.com/xihaoli/STAARpipeline), a rare variant association analysis tool, to perform association analysis of large-scale WGS/WES studies. 13 | 14 | ![FAVORannotator workflow](https://github.com/zhouhufeng/FAVORannotator/blob/main/Docs/Tutorial/Figures/figure1.png) 15 | 16 | _Figure 1. FAVORannotator workflow._ 17 | 18 | ## 2. FAVORannotator different versions (SQL, CSV and Cloud Versions) 19 | 20 | There are three main versions of FAVORannotator: **SQL**, **CSV** and **Cloud**. 21 | 22 | All versions of FAVORannotator require the same set of R libraries. The SQL version additionally requires a PostgreSQL installation, while the CSV and Cloud versions require the xsv software dependency. 23 | 24 | All FAVORannotator versions produce identical results and have similar performance; they differ only in the computing environments where FAVORannotator is deployed. Users can choose the version of FAVORannotator that suits their computing platform and use case. 25 | 26 | FAVORannotator accomplishes both high query speed and storage efficiency due to its optimized configurations and indices. Its offline nature avoids the excessive waiting time and file size restrictions of the FAVOR online service. 27 | 28 | ### 2.1 FAVORannotator SQL version 29 | 30 | It is important to note that the FAVORannotator SQL version PostgreSQL database differs from other storage in that it needs to be running in order to be accessed. Thus, users must ensure the database is running before running annotations. 31 | 32 | Once the FAVORannotator database is booted up and running, the following connection information must be specified for the FAVORannotator R program to access the database: DBName, Host, Port, User, and Password. 33 | 34 | This specialized database setup ensures high query speed. The figure below shows the features described above. 35 | 36 | ![FAVORannotator SQL version Tech Features](https://github.com/zhouhufeng/FAVORannotator/blob/main/Docs/Tutorial/Figures/Figure2A.png) 37 | 38 | _Figure 2. FAVORannotator SQL version workflow and differences highlights._ 39 | 40 | ### 2.2 FAVORannotator CSV version 41 | 42 | The FAVORannotator CSV version database adopts a similar strategy of slicing both the database and the query inputs into smaller pieces and creating an index for each of the smaller database chunks, so as to achieve the same high performance and fast query speed as the SQL version. 43 | 44 | Unlike the SQL version, the CSV version database is static and queries rely on the xsv software, so there is no need to ensure a database server is running before running annotations. Because the static CSV database is accessed through xsv rather than by supplying the connection details of a running postgreSQL instance, it widens the applicability of FAVORannotator to computing platforms that do not support a postgreSQL installation.
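To make the chunked lookup concrete, the shell sketch below mirrors the kind of xsv calls the FAVORannotator CSV scripts issue internally (via `system()` in R): the query variants are written out as a one-column CSV of chr-pos-ref-alt identifiers, joined against each sliced database chunk on its `variant_vcf` column, and the per-chunk results are stacked back together. The chunk and output file names here are illustrative placeholders, not fixed file names required by FAVORannotator.

```bash
# Illustrative sketch of the chunked xsv lookup used by the CSV version (chr22 example).
# VarInfo_chr22_1.csv / VarInfo_chr22_2.csv hold query variants (header: VarInfo, one
# "chr-pos-ref-alt" ID per row); chr22_1.csv / chr22_2.csv are sliced FAVOR database chunks.
xsv join --left VarInfo VarInfo_chr22_1.csv variant_vcf chr22_1.csv > Anno_chr22_1.csv
xsv join --left VarInfo VarInfo_chr22_2.csv variant_vcf chr22_2.csv > Anno_chr22_2.csv

# Stack the per-chunk results into one annotation table for the chromosome
xsv cat rows Anno_chr22_1.csv Anno_chr22_2.csv > Anno_chr22.csv

# Keep only the annotation columns needed downstream (column indices are illustrative)
xsv select 1,8,9,10,11,12 Anno_chr22.csv > Anno_chr22_selected.csv
```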
45 | 46 | ![FAVORannotator CSV version Tech Features](https://github.com/zhouhufeng/FAVORannotator/blob/main/Docs/Tutorial/Figures/Figure2B.png) 47 | 48 | _Figure 3. FAVORannotator CSV version workflow and differences highlights._ 49 | 50 | ### 2.3 FAVORannotator Cloud version 51 | 52 | The FAVORannotator Cloud version is developed based on the CSV version (no pre-installed database) and adopts the same strategy of slicing both the database and the query inputs into smaller pieces and creating an index for each of the smaller database chunks, achieving the same high performance and fast query speed as the SQL/CSV versions. In addition, the Cloud version downloads the FAVOR databases (Full Database or Essential Database) on the fly, so no FAVOR database needs to be pre-installed on the computing platform. 53 | 54 | The Cloud version database is downloaded from [FAVOR on Harvard Dataverse](https://dataverse.harvard.edu/dataverse/favor) when FAVORannotator is executed, and after the download finishes the database is decompressed. The downloaded database is the CSV version, which is static; queries rely on the xsv software, so only minimal dependencies are needed and no database management system has to be kept running. 55 | 56 | ![FAVORannotator Cloud version Tech Features](https://github.com/zhouhufeng/FAVORannotator/blob/main/Docs/Tutorial/Figures/Figure2C.png) 57 | 58 | _Figure 4. FAVORannotator Cloud version workflow and differences highlights._ 59 | 60 | 61 | ## 3. Obtain the FAVOR Database 62 | ### 3.1 Obtain the database through direct downloading 63 | 1. Download the FAVORannotator data file from here ([download URL](http://favor.genohub.org), under the "FAVORannotator" tab). 64 | 2. Decompress the downloaded data. 65 | 3. Move the decompressed database to its location and update the location info in ```config.R```. 66 | 67 | ### 3.2 FAVOR databases hosted on Harvard Dataverse 68 | The FAVOR databases (Essential Database and Full Database) are hosted on the [Harvard Dataverse](https://dataverse.harvard.edu/dataverse/favor). 69 | 70 | 71 | ![FAVOR Databases on Harvard Dataverse](https://github.com/zhouhufeng/FAVORannotator/blob/main/Docs/Tutorial/Figures/HarvardDataVerse.png) 72 | 73 | _Figure 5. FAVOR Databases on Harvard Dataverse (both Essential Database and Full Database)._ 74 | 75 | 76 | ### 3.3 FAVOR Essential Database 77 | The [FAVOR Essential Database](https://doi.org/10.7910/DVN/1VGTJI) contains 20 essential annotation scores. It is comprised of a collection of essential annotation scores for all possible SNVs (8,812,917,339) and observed indels (79,997,898) in Build GRCh38/hg38. 78 | 79 | ### 3.4 FAVOR Full Database 80 | The [FAVOR Full Database](https://doi.org/10.7910/DVN/KFUBKG) contains 160 annotation scores. It is comprised of a collection of full annotation scores for all possible SNVs (8,812,917,339) and observed indels (79,997,898) in Build GRCh38/hg38. 81 | 82 | 83 | ## 4. Resource requirements 84 | 85 | The resources utilized by the FAVORannotator R program and PostgreSQL instance are largely dependent upon the size of the input variant set. 86 | 87 | For both the SQL and CSV versions of FAVORannotator, WGS variant sets from 60,000 samples were tested. The whole functional annotation finished in parallel in 1 hour using 24 computing cores (Intel Cascade Lake at 2.9 GHz). The memory consumed by each instance varies (usually within 18 GB), as different chromosomes carry different numbers of variants. 88 | 89 | ## 5.
Resource requirements 90 | 91 | The resources utilized by the FAVORannotator R program and PostgreSQL instance are largely dependent upon the size of the input variant set. 92 | 93 | For both the SQL and CSV versions of FAVORannotator, WGS variant sets from 60,000 samples were tested. The whole functional annotation finished in parallel in 1 hour using 24 computing cores (Intel Cascade Lake at 2.9 GHz). The memory consumed by each instance varies (usually within 18 GB), as different chromosomes carry different numbers of variants. 94 | 95 | 96 | 97 | ## 6. How to Use FAVORannotator 98 | 99 | ### 6.1 SQL/CSV versions 100 | 101 | Installing and running FAVORannotator to perform functional annotation requires only 2 major steps: 102 | 103 | **I. Install software dependencies and prepare the database (process varies between systems).** 104 | 105 | **II. Run FAVORannotator (CSV or SQL version).** 106 | 107 | The first step depends on whether the SQL or CSV version of FAVORannotator is used, and on the computing platform. The following sections detail the process for major platforms. The second step (running FAVORannotator) is described first, as it is consistent across platforms. 108 | 109 | 110 | ### 6.2 No pre-install databases version 111 | There are some use cases where downloading and configuring the database can be difficult. We therefore simplify FAVORannotator by folding the database download, decompression, and config.R updates (database location and output location) into FAVORannotator itself (the no pre-install database version); users only need to put the R scripts into a directory with enough storage and run the program. 112 | 113 | **I. Install software dependencies.** 114 | 115 | **II. Run FAVORannotator (no pre-install database version).** 116 | 117 | ### 6.3 Cloud version 118 | Based on the FAVORannotator no pre-install database version, we developed the FAVORannotator cloud-native app, which runs on cloud platforms like Terra and DNAnexus, or on virtual machines of Google Cloud Platform (GCP), Amazon Web Services (AWS), and Microsoft Azure. With dockerized images and workflow languages, FAVORannotator can be executed through a user-friendly, drag-and-drop graphical interface, with no scripting or programming skills required from the users. 119 | 120 | 121 | ![FAVORannotator Versions](https://github.com/zhouhufeng/FAVORannotator/blob/main/Docs/Tutorial/Figures/versions.png) 122 | 123 | _Figure 6. FAVORannotator Different Versions._ 124 | 125 | 126 | ## 7. SQL version 127 | ### 7.1 Run FAVORannotator SQL version 128 | 129 | Once PostgreSQL is running, the database can be imported and FAVORannotator can be executed as follows. Please find the R scripts in the ```Scripts/SQL/``` folder. 130 | 131 | **Important: Before running the FAVORannotator SQL version, please update the file locations and database info in the ```config.R``` file. FAVORannotator relies on these file locations and database info for the annotation.** 132 | 133 | 1. Create a GDS file from the input VCF file: 134 | 135 | - ``` $ Rscript convertVCFtoGDS.r chrnumber ``` 136 | 137 | 2. Run FAVORannotator: 138 | 139 | - ``` $ Rscript FAVORannotatorv2aGDS.r chrnumber ``` 140 | 141 | chrnumber is the number indicating which chromosome the database is read for; chrnumber can be 1, 2, ..., 22. 142 | 143 | Scripts for submitting jobs for all chromosomes simultaneously have been provided.
They use SLURM, which is supported by many high-performance clusters, and utilize parallel jobs to boost performance. 144 | 145 | A SLURM script to simplify the process can be found here: ([submission.sh](https://github.com/zhouhufeng/FAVORannotator/blob/main/Scripts/SQL/submitJobs.sh)). 146 | 147 | ### 7.2 Install and prepare the database for SQL version 148 | 149 | The FAVORannotator SQL version relies upon the PostgreSQL Database Management System (DBMS). PostgreSQL is a free and open-source application which emphasizes extensibility and SQL compliance. It is a highly stable DBMS, backed by more than 20 years of community development. PostgreSQL is used to manage data for many web, mobile, geospatial, and analytics applications. Its advanced features, including diverse index types and configuration options, have been carefully selected for FAVORannotator so that end users do not need to worry about the implementation. 150 | 151 | How to use FAVORannotator will be explained from the following steps. PostgreSQL is available in most platforms. Each of these platforms has a different process for installing software, which affects the first step of installing FAVORannotator. 152 | 153 | Once PostgreSQL is running, the database can be imported and FAVORannotator can be executed as follows: 154 | 155 | 1. Once the server is running, Load the database: ```$ psql -h hostname -p port_number -U username -f your_file.sql databasename ``` 156 | 157 | e.g. ```$ psql -h c02510 -p 582  -f /n/SQL/ByChr7FAVORDBxO.sql Chr7``` 158 | 159 | 2. Now the PostgreSQL hosting FAVORannotator backend database is up and running it is listening for the query from FAVORannotator R program. 160 | 161 | 3. Update the config.R file with the PostgreSQL instance information (database name, port, host, user, password): 162 | 163 | ### 7.3 Install PostgreSQL (FAVORannotator SQL version) 164 | 165 | The following steps have been written for major computing environments in order to best account for all possibilities. The following steps are for the widely used operating system (Ubuntu) on a virtual machine. 166 | 167 | 1. Install the required software: 168 | - ```$ sudo apt install postgresql postgresql-contrib``` 169 | 2. Start and run PostgreSQL: 170 | - ```$ sudo -i -u postgres``` 171 | - ```$ psql``` 172 | 173 | 3. [Optional] For installing the database on external storage (Edit the configuration file): 174 | - The file is located at ```/etc/postgresql/12/main/postgresql.conf``` 175 | - Change the line in file “postgresql.conf”, data_directory = 'new directory of external storage' 176 | - Reboot the data directory, ```$ sudo systemctl start postgresql``` 177 | 178 | 179 | **For more detailed instructions on how to use FAVORannotator (SQL version) on the Harvard FASRC Slurm Cluster, please refer to the detailed tutorial [here](https://github.com/zhouhufeng/FAVORannotator/blob/main/Docs/Tutorial/Demos/FASRC.md).** 180 | 181 | 182 | ## 8. CSV version 183 | 184 | ### 8.1 Run FAVORannotator CSV version 185 | 186 | Once CSV database is downloaded and decompressed, the database is readable by FAVORannotator can be executed as follows. Please find the R scripts in the ```Scripts/CSV/``` folder. 187 | 188 | **Important: Before run FAVORannotator CSV version, please update the file locations and database info on the ```config.R``` file. FAVORannotator relies on the file locations and database info for the annotation.** 189 | 190 | 1. 
Create GDS file from the input VCF file: 191 | 192 | - ``` $ Rscript convertVCFtoGDS.r chrnumber ``` 193 | 194 | 2. Run FAVORannotator: 195 | 196 | - ``` $ Rscript FAVORannotatorv2aGDS.r chrnumber ``` 197 | 198 | Scripts for submitting jobs for all chromosomes simultaneously have been provided. They use SLURM, which is supported by many high-performance clusters, and utilize parallel jobs to boost performance. 199 | 200 | A SLURM script to simplify the process can be found here: ([submission.sh](https://github.com/zhouhufeng/FAVORannotator/blob/main/Scripts/SQL/submitJobs.sh)). 201 | 202 | chrnumber are the numeric number indicating which chromosome this database is reading from, chrnumber can be 1, 2, ..., 22. 203 | 204 | ### 8.2 Install and prepare the database for CSV version 205 | 206 | **FAVORannotator** (CSV version) depends on the **xsv software** and the **FAVOR database** in CSV format. Please install the **xsv software** and 207 | download the **FAVOR database** CSV files (under the "FAVORannotator" tab) before using **FAVORannotator** (CSV version). 208 | 209 | ### 8.3 Install xsv (FAVORannotator CSV version) 210 | 211 | The following steps have been written for major computing environments in order to best account for all possibilities. The following steps are for the widely used operating system (Ubuntu) on a virtual machine. 212 | 213 | 1. Install Rust and Cargo: 214 | - ```$ curl https://sh.rustup.rs -sSf | sh``` 215 | 2. Source the environment: 216 | - ```$ source $HOME/.cargo/env``` 217 | 3. Install xsv using Cargo: 218 | - ```$ cargo install xsv``` 219 | 220 | 221 | 222 | ## 9 No pre-install databases version 223 | 224 | ### 9.1 Install xsv (No need to pre-install database but xsv need to be installed) 225 | 226 | The following steps have been written for major computing environments in order to best account for all possibilities. The following steps are for the widely used operating system (Ubuntu) on a virtual machine. 227 | 228 | 1. Install Rust and Cargo: 229 | - ```$ curl https://sh.rustup.rs -sSf | sh``` 230 | 2. Source the environment: 231 | - ```$ source $HOME/.cargo/env``` 232 | 3. Install xsv using Cargo: 233 | - ```$ cargo install xsv``` 234 | 235 | 236 | 237 | ### 9.2 Run FAVORannotator no pre-install databases version 238 | 239 | FAVOR database can be downloaded on the fly and decompressed automatically in the scripts, this version of FAVORannotator will remove the burden of download the backend database and update the ```config.R```. The database is downloaded and decompressed automatically and is readable by FAVORannotator can be executed as follows. 240 | 241 | Please find the R scripts in the ```Scripts/SQL/``` folder. 242 | 243 | **Important: This version of FAVORannotator no pre-install version does not need to update ```config.R``` file. This version of FAVORannotator directly download FAVORdatabase (Full or Essential versions) from the Harvard Dataverse to the default file locations and database info for the annotation. Just put the FAVORannotator script in the directory with ample storage all the database and index and intermediate files will be generated in the directory.** 244 | 245 | 1. Create GDS file from the input VCF file: 246 | 247 | - ``` $ Rscript convertVCFtoGDS.r input.vcf output.gds ``` 248 | 249 | 2. Run FAVORannotator for the FAVOR Essential Database: 250 | 251 | - ``` $ Rscript FAVORannotatorCSVEssentialDB.R output.gds chrnumber ``` 252 | 253 | 3. 
Run FAVORannotator for the FAVOR Full Database: 254 | 255 | - ``` $ Rscript FAVORannotatorCSVFullDB.R output.gds chrnumber ``` 256 | 257 | chrnumber are the numeric number indicating which chromosome this database is reading from, chrnumber can be 1, 2, ..., 22. 258 | 259 | Scripts for submitting jobs for all chromosomes simultaneously have been provided. They use SLURM, which is supported by many high-performance clusters, and utilize parallel jobs to boost performance. 260 | 261 | A SLURM script to simplify the process can be found here: ([submission.sh](https://github.com/zhouhufeng/FAVORannotator/blob/main/Scripts/SQL/submitJobs.sh)). 262 | 263 | 264 | ## 10. Cloud Version 265 | ### 10.1 Run FAVORannotator Cloud Version 266 | 267 | For Cloud environment, we simplified the process of database set up and remove the configration files. FAVOR database can be downloaded on the fly and decompressed automatically in the scripts, this version of FAVORannotator will remove the burden of download the backend database and update the ```config.R```. The database is downloaded and decompressed automatically and is capable of seamless integration to the workflow languages of the cloud platform. It currently works for cloud platforms like Terra, DNAnexus, etc. This tutorial uses Terra as an example to illustrate the functional annotation process. 268 | 269 | Please find the R scripts in the ```Scripts/Cloud/``` folder. 270 | 271 | **Important: This version of FAVORannotator based on the no pre-install version does not need ```config.R``` file. This version of FAVORannotator directly download FAVORdatabase (Full or Essential versions) from the Harvard Dataverse to the default file locations and database info for the annotation. Just put the FAVORannotator script in the directory with ample storage all the database and index and intermediate files will be generated in the directory. These database files and intermediate files in the working directories will be removed in most cloud platforms.** 272 | 273 | 1. Create GDS file from the input VCF file: 274 | 275 | - ``` $ Rscript convertVCFtoGDS.r input.vcf output.gds ``` 276 | 277 | 2.1 Run FAVORannotator for the FAVOR Essential Database: 278 | 279 | - ``` $ Rscript FAVORannotatorTerraEssentialDB.R output.gds chrnumber ``` 280 | 281 | 2.2. Run FAVORannotator for the FAVOR Essential Database workflow: 282 | 283 | - ``` $ java -jar cromwell-30.2.jar run FAVORannotatorEssentialDB.wdl --inputs file.json ``` 284 | 285 | 286 | 3.1 Run FAVORannotator for the FAVOR Full Database: 287 | 288 | - ``` $ Rscript FAVORannotatorTerraEssentialDB.R output.gds chrnumber ``` 289 | 290 | chrnumber are the numeric number indicating which chromosome this database is reading from, chrnumber can be 1, 2, ..., 22. 291 | 292 | 3.2. Run FAVORannotator for the FAVOR Full Database workflow: 293 | 294 | - ``` $ java -jar cromwell-30.2.jar run FAVORannotatorFullDB.wdl --inputs file.json ``` 295 | 296 | 297 | 298 | 299 | ![FAVORannotator Cloud Version](https://github.com/zhouhufeng/FAVORannotator/blob/main/Docs/Tutorial/Figures/FAVORannotatorOnTerra.png) 300 | 301 | _Figure 7. FAVORannotator Cloud Native Workflow on Terra._ 302 | 303 | ## 11. Other Functions and Utilities 304 | 305 | ### 11.1 Convert VCF to aGDS 306 | 307 | The following functions have been written for the purpose of converting VCF files to GDS/aGDS files. Please find the R scripts in the ```Scripts/UTL/``` folder. 308 | 309 | 1. 
If users wish to convert VCF files that only contain genotype data into GDS files for the following annoation process: 310 | - ```$ Rscript convertVCFtoGDS.r input.vcf output.agds``` 311 | 312 | 313 | 2. If users wish to convert Variant List that does not contain genotype data into GDS files for the following annoation process, after formatting the varaint list into the same VCF format, following R scripts can generate the empty GDS file that do not have genotype data just the varaint info: 314 | - ```$ Rscript convertVCFtoGDS.r inputVariantList.vcf output.agds``` 315 | 316 | 3. If users already annotated VCF files using SpnEff,BCFTools, VarNote, Vcfanno and just wish to use aGDS for the following analysis, running the followign R script to convert annotated VCF files into aGDS file 317 | - ```$ Rscript convertVCFtoGDS.r annotated.vcf output.agds``` 318 | 319 | 320 | 321 | ### 11.2 Add In Functional Annotations to aGDS 322 | 1. If users have external annotation sources or annotation in text tables that containing varaint sets, this function will be able to add in the new functional annotations into the new node of aGDS files: 323 | - ```$ Rscript FAVORannotatorAddIn.R input.agds AnnotationFile.tsv``` 324 | 325 | 326 | ### 11.3 Extract Variant Functional Annotation to Text Tables from aGDS 327 | 328 | 1. If users prefer to have the Variant Functional Annotation results write into Text Tables, this Rscripts will be able to extract the functional annotation from aGDS and write into the text tables: 329 | - ```$ Rscript FAVORaGDSToText.R annotated.agds AnnotationTextTable.tsv``` 330 | 331 | 332 | 333 | ## 12 Demo Using Real Example (1000 Genomes Project Data) 334 | 335 | The following steps are the demo of how to FAVORannotato through using real genotype data from 1000 Genomes Project. From the step of obtaining the genotype data to the end point of creating aGDS are illustrated here below in the step by step process. 336 | 337 | 338 | ### 12.1 Download the 1000G VCF 339 | 340 | If users can use command line below to obtain the ([1000G](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7059836/)) from the FTP ([1000 Genomes official website](http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/)), for the following process. 341 | 342 | Change the directory: 343 | - ```$ cd ../../Data/TestData/1000G/ ``` 344 | 345 | Download VCF to the directory (chr22): 346 | 347 | - ```$ wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chr22.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz ``` 348 | 349 | Additionally if download chr1: 350 | - ``` $ wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chr22.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz``` 351 | 352 | 353 | ### 12.2 Convert VCF to GDS (chr22) 354 | 355 | Users can use command line below to convert the VCF to GDS. 
356 | 357 | Change the directory: 358 | - ```$ cd ../../../Scripts/UTL ``` 359 | 360 | Run program to create GDS: 361 | - ```$ Rscript convertVCFtoGDS.r ../../Data/TestData/Input/ALL.chr22.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz ../../Data/1000G/All.chr22.27022019.GRCh38.phased.gds ``` 362 | 363 | And you will get the following output on terminal: 364 | 365 | ``` 366 | ALL.chr22.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz (176.9M) 367 | file format: VCFv4.3 368 | the number of sets of chromosomes (ploidy): 2 369 | the number of samples: 2,548 370 | genotype storage: bit2 371 | compression method: LZMA_RA 372 | of samples: 2548 373 | ... 374 | ``` 375 | 376 | 377 | ### 12.3 Annotate GDS using FAVORannotator to create aGDS (no pre-install version) 378 | 379 | Users can use following command to annotate GDS using FAVORannotator to create aGDS . 380 | 381 | Change the directory: 382 | - ```$ cd ../../Data/1000G/ ``` 383 | 384 | Copy FAVORannotator program to the current directory: 385 | - ```$ cp ../../../Scripts/CSV/FAVORannotatorCSVEssentialDB.R .``` 386 | - ```$ cp ../../../Scripts/CSV/FAVORannotatorCSVFullDB.R . ``` 387 | 388 | Run program to annotate GDS using FAVORannotator reading FAVOR Essential Database to create aGDS(chr22): 389 | - ```$ Rscript FAVORannotatorCSVEssentialDB.R All.chr22.27022019.GRCh38.phased.gds 22 ``` 390 | 391 | And you will get the following output on terminal: 392 | ``` 393 | [1] gds.file: All.chr22.27022019.GRCh38.phased.gds 394 | [1] chr: 22 395 | [1] use_compression Yes 396 | --2022-09-14 16:42:28-- https://dataverse.harvard.edu/api/access/datafile/6170504 397 | 398 | ``` 399 | 400 | 401 | Run program to annotate GDS using FAVORannotator reading FAVOR Full Database to create aGDS(chr22): 402 | - ```$ Rscript FAVORannotatorCSVFullDB.R All.chr22.27022019.GRCh38.phased.gds 22 ``` 403 | 404 | And you will get the following output on terminal: 405 | ``` 406 | [1] gds.file: All.chr22.27022019.GRCh38.phased.gds 407 | [1] chr: 22 408 | [1] use_compression: Yes 409 | --2022-09-14 16:39:31-- https://dataverse.harvard.edu/api/access/datafile/6358299 410 | 411 | 412 | ``` 413 | 414 | 415 | 416 | 417 | 418 | 419 | 420 | 421 | ## 13 Dependencies 422 | FAVORannotator imports R packages: dplyr, SeqArray, gdsfmt, RPostgreSQL, stringr, readr, stringi. These dependencies should be installed before running FAVORannotator. 423 | 424 | FAVORannotator (SQL version) depends upon PostgreSQL software. 425 | 426 | FAVORannotator (CSV version) depends upon xsv software. 427 | 428 | ## Data Availability 429 | The whole-genome individual functional annotation data assembled from a variety of sources and the computed annotation principal components are available at the [Functional Annotation of Variant - Online Resource (FAVOR)](https://favor.genohub.org) site. 430 | 431 | ## Version 432 | The current version is 1.1.1 (August 30th, 2022). 433 | ## License 434 | This software is licensed under GPLv3. 
435 | 436 | ![GPLv3](http://www.gnu.org/graphics/gplv3-127x51.png) 437 | [GNU General Public License, GPLv3](http://www.gnu.org/copyleft/gpl.html) 438 | -------------------------------------------------------------------------------- /Scripts/CSV/Dockerfile.txt: -------------------------------------------------------------------------------- 1 | # Base image https://hub.docker.com/u/rocker/ 2 | FROM rocker/r-base:latest 3 | 4 | ## create directories 5 | RUN mkdir -p /01_data 6 | RUN mkdir -p /02_code 7 | RUN mkdir -p /03_output 8 | 9 | ## copy files 10 | #COPY install_packages.R 11 | #COPY /02_code/myScript.R 12 | 13 | ## install R-packages 14 | RUN Rscript install_packages.R 15 | 16 | -------------------------------------------------------------------------------- /Scripts/CSV/FAVORannotatorCSVEssentialDB.R: -------------------------------------------------------------------------------- 1 | ############################################################################# 2 | #Title: FAVORannotatorCSVEssentialDB 3 | #Function: 4 | # * Build the aGDS file thorugh performing functional annotation, 5 | # * without pre-install the FAVOR Full Database, download database on the fly. 6 | #Author: Hufeng Zhou 7 | #Time: Sept 27th 2022 8 | ############################################################################# 9 | 10 | 11 | args <- commandArgs(TRUE) 12 | ### mandatory 13 | 14 | gds.file <- args[1] 15 | print(paste0("gds.file: ",gds.file)) 16 | 17 | chr <- as.numeric(args[2]) 18 | print(paste0("chr: ",chr)) 19 | #chr<-19 20 | 21 | use_compression <- "Yes" 22 | print(paste0("use_compression: ",use_compression)) 23 | 24 | ### R package 25 | library(gdsfmt) 26 | library(SeqArray) 27 | library(readr) 28 | 29 | ### xsv directory 30 | xsv <- "~/.cargo/bin/xsv" 31 | 32 | ## read info 33 | DB_info <- read.csv(url("https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/main/Scripts/SQL/FAVORdatabase_chrsplit.csv"),header=TRUE) 34 | DB_info_chr <- DB_info[DB_info$Chr==chr,] 35 | chr_splitnum <- sum(DB_info$Chr==chr) 36 | 37 | ### DB file 38 | DB_path <- "n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/" 39 | 40 | ### output 41 | output_path <- "./" 42 | 43 | ### annotation file 44 | anno_file_name_1 <- "Anno_chr" 45 | anno_file_name_2 <- "_STAARpipeline.csv" 46 | 47 | 48 | ########################################################################## 49 | ### Step 0 (Download FAVOR Database) 50 | ########################################################################## 51 | URLs <- data.frame(chr = c(1:22), 52 | URL = c("https://dataverse.harvard.edu/api/access/datafile/6170506", 53 | "https://dataverse.harvard.edu/api/access/datafile/6170501", 54 | "https://dataverse.harvard.edu/api/access/datafile/6170502", 55 | "https://dataverse.harvard.edu/api/access/datafile/6170521", 56 | "https://dataverse.harvard.edu/api/access/datafile/6170511", 57 | "https://dataverse.harvard.edu/api/access/datafile/6170516", 58 | "https://dataverse.harvard.edu/api/access/datafile/6170505", 59 | "https://dataverse.harvard.edu/api/access/datafile/6170513", 60 | "https://dataverse.harvard.edu/api/access/datafile/6165867", 61 | "https://dataverse.harvard.edu/api/access/datafile/6170507", 62 | "https://dataverse.harvard.edu/api/access/datafile/6170517", 63 | "https://dataverse.harvard.edu/api/access/datafile/6170520", 64 | "https://dataverse.harvard.edu/api/access/datafile/6170503", 65 | "https://dataverse.harvard.edu/api/access/datafile/6170509", 66 | "https://dataverse.harvard.edu/api/access/datafile/6170515", 67 | 
"https://dataverse.harvard.edu/api/access/datafile/6170518", 68 | "https://dataverse.harvard.edu/api/access/datafile/6170510", 69 | "https://dataverse.harvard.edu/api/access/datafile/6170508", 70 | "https://dataverse.harvard.edu/api/access/datafile/6170514", 71 | "https://dataverse.harvard.edu/api/access/datafile/6170512", 72 | "https://dataverse.harvard.edu/api/access/datafile/6170519", 73 | "https://dataverse.harvard.edu/api/access/datafile/6170504")) 74 | 75 | URL <- URLs[chr, "URL"] 76 | system(paste0("wget --progress=bar:force:noscroll ", URLs[chr, "URL"])) 77 | system(paste0("tar -xvf ", gsub(".*?([0-9]+).*", "\\1", URL))) 78 | 79 | ########################################################################## 80 | ### Step 1 (Varinfo_gds) 81 | ########################################################################## 82 | 83 | start_time <- Sys.time() 84 | ### make directory 85 | system(paste0("mkdir ",output_path,"chr",chr)) 86 | 87 | ### chromosome number 88 | 89 | ## open GDS 90 | genofile <- seqOpen(gds.file, readonly = FALSE) 91 | 92 | genofile 93 | 94 | CHR <- as.numeric(seqGetData(genofile, "chromosome")) 95 | position <- as.integer(seqGetData(genofile, "position")) 96 | REF <- as.character(seqGetData(genofile, "$ref")) 97 | ALT <- as.character(seqGetData(genofile, "$alt")) 98 | 99 | VarInfo_genome <- paste0(CHR,"-",position,"-",REF,"-",ALT) 100 | 101 | ## Generate VarInfo 102 | for(kk in 1:dim(DB_info_chr)[1]) 103 | { 104 | print(kk) 105 | 106 | VarInfo <- VarInfo_genome[(position>=DB_info_chr$Start_Pos[kk])&(position<=DB_info_chr$End_Pos[kk])] 107 | VarInfo <- data.frame(VarInfo) 108 | 109 | write.csv(VarInfo,paste0(output_path,"chr",chr,"/VarInfo_chr",chr,"_",kk,".csv"),quote=FALSE,row.names = FALSE) 110 | } 111 | 112 | ########################################################################## 113 | ### Step 2 (Annotate) 114 | ########################################################################## 115 | ### anno channel (subset) 116 | anno_colnum <- c(1,8:12,15,16,19,23,25:36) 117 | 118 | 119 | for(kk in 1:chr_splitnum) 120 | { 121 | print(kk) 122 | 123 | system(paste0(xsv," join --left VarInfo ",output_path,"chr",chr,"/VarInfo_chr",chr,"_",kk,".csv variant_vcf ",DB_path,"/chr",chr,"_",kk,".csv > ",output_path,"chr",chr,"/Anno_chr",chr,"_",kk,".csv")) 124 | } 125 | 126 | ## merge info 127 | Anno <- paste0(output_path,"chr",chr,"/Anno_chr",chr,"_",seq(1:chr_splitnum),".csv ") 128 | merge_command <- paste0(xsv," cat rows ",Anno[1]) 129 | 130 | for(kk in 2:chr_splitnum) 131 | { 132 | merge_command <- paste0(merge_command,Anno[kk]) 133 | } 134 | 135 | merge_command <- paste0(merge_command,"> ",output_path,"chr",chr,"/Anno_chr",chr,".csv") 136 | 137 | system(merge_command) 138 | 139 | ## subset 140 | anno_colnum_xsv <- c() 141 | for(kk in 1:(length(anno_colnum)-1)) 142 | { 143 | anno_colnum_xsv <- paste0(anno_colnum_xsv,anno_colnum[kk],",") 144 | } 145 | anno_colnum_xsv <- paste0(anno_colnum_xsv,anno_colnum[length(anno_colnum)]) 146 | 147 | system(paste0(xsv," select ",anno_colnum_xsv," ",output_path,"chr",chr,"/Anno_chr",chr,".csv > ",output_path,"chr",chr,"/Anno_chr",chr,"_STAARpipeline.csv")) 148 | 149 | ########################################################################## 150 | ### Step 3 (gds2agds) 151 | ########################################################################## 152 | 153 | ### read annotation data 154 | FunctionalAnnotation <- read_csv(paste0(output_path,"chr",chr,"/",anno_file_name_1,chr,anno_file_name_2), 155 | 
col_types=list(col_character(),col_double(),col_double(),col_double(),col_double(), 156 | col_double(),col_double(),col_double(),col_double(),col_double(), 157 | col_character(),col_character(),col_character(),col_double(),col_character(), 158 | col_character(),col_character(),col_character(),col_character(),col_double(), 159 | col_double(),col_character())) 160 | 161 | dim(FunctionalAnnotation) 162 | 163 | ## rename colnames 164 | colnames(FunctionalAnnotation)[2] <- "apc_conservation" 165 | colnames(FunctionalAnnotation)[7] <- "apc_local_nucleotide_diversity" 166 | colnames(FunctionalAnnotation)[9] <- "apc_protein_function" 167 | 168 | Anno.folder <- index.gdsn(genofile, "annotation/info") 169 | add.gdsn(Anno.folder, "FunctionalAnnotation", val=FunctionalAnnotation, compress="LZMA_ra", closezip=TRUE) 170 | 171 | genofile 172 | 173 | seqClose(genofile) 174 | 175 | end_time <- Sys.time() 176 | 177 | print("time") 178 | end_time - start_time 179 | -------------------------------------------------------------------------------- /Scripts/CSV/FAVORannotatorCSVFullDB.R: -------------------------------------------------------------------------------- 1 | ############################################################################# 2 | #Title: FAVORannotatorCSVFullDB 3 | #Function: 4 | # * Build the aGDS file thorugh performing functional annotation, 5 | # * without pre-install the FAVOR Full Database, download database on the fly. 6 | #Author: Hufeng Zhou 7 | #Time: Sept 27th 2022 8 | ############################################################################# 9 | 10 | 11 | args <- commandArgs(TRUE) 12 | ### mandatory 13 | 14 | gds.file <- args[1] 15 | print(paste0("gds.file: ",gds.file)) 16 | 17 | #outfile <- args[2] 18 | #print(paste0("outfile: ",outfile)) 19 | 20 | chr <- as.numeric(args[2]) 21 | print(paste0("chr: ",chr)) 22 | #chr<-19 23 | 24 | use_compression <- "Yes" 25 | print(paste0("use_compression: ",use_compression)) 26 | 27 | ### output 28 | output_path <- "./" 29 | 30 | ### make directory 31 | system(paste0("mkdir ",output_path,"chr",chr)) 32 | 33 | ### annotation file 34 | dir_anno <- "./" 35 | 36 | 37 | ### load required package 38 | library(gdsfmt) 39 | library(SeqArray) 40 | library(readr) 41 | 42 | 43 | ### chromosome number 44 | ## read info 45 | DB_info <- read.csv(url("https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/main/Scripts/SQL/FAVORdatabase_chrsplit.csv"),header=TRUE) 46 | DB_info <- DB_info[DB_info$Chr==chr,] 47 | 48 | ### DB file 49 | DB_path <- "./" 50 | 51 | ### xsv directory 52 | xsv <- "~/.cargo/bin/xsv" 53 | 54 | 55 | 56 | 57 | ########################################################################## 58 | ### Step 0 (Download FAVOR Database) 59 | ########################################################################## 60 | URLs <- data.frame(chr = c(1:22), 61 | URL = c("https://dataverse.harvard.edu/api/access/datafile/6380374", #1 62 | "https://dataverse.harvard.edu/api/access/datafile/6380471", #2 63 | "https://dataverse.harvard.edu/api/access/datafile/6380732", #3 64 | "https://dataverse.harvard.edu/api/access/datafile/6381512", #4 65 | "https://dataverse.harvard.edu/api/access/datafile/6381457", #5 66 | "https://dataverse.harvard.edu/api/access/datafile/6381327", #6 67 | "https://dataverse.harvard.edu/api/access/datafile/6384125", #7 68 | "https://dataverse.harvard.edu/api/access/datafile/6382573", #8 69 | "https://dataverse.harvard.edu/api/access/datafile/6384268", #9 70 | "https://dataverse.harvard.edu/api/access/datafile/6380273", #10 71 
| "https://dataverse.harvard.edu/api/access/datafile/6384154", #11 72 | "https://dataverse.harvard.edu/api/access/datafile/6384198", #12 73 | "https://dataverse.harvard.edu/api/access/datafile/6388366", #13 74 | "https://dataverse.harvard.edu/api/access/datafile/6388406", #14 75 | "https://dataverse.harvard.edu/api/access/datafile/6388427", #15 76 | "https://dataverse.harvard.edu/api/access/datafile/6388551", #16 77 | "https://dataverse.harvard.edu/api/access/datafile/6388894", #17 78 | "https://dataverse.harvard.edu/api/access/datafile/6376523", #18 79 | "https://dataverse.harvard.edu/api/access/datafile/6376522", #19 80 | "https://dataverse.harvard.edu/api/access/datafile/6376521", #20 81 | "https://dataverse.harvard.edu/api/access/datafile/6358305", #21 82 | "https://dataverse.harvard.edu/api/access/datafile/6358299")) #22 83 | 84 | URL <- URLs[chr, "URL"] 85 | system(paste0("wget --progress=bar:force:noscroll ", URLs[chr, "URL"])) 86 | system(paste0("tar -xvf ", gsub(".*?([0-9]+).*", "\\1", URL))) 87 | 88 | ########################################################################## 89 | ### Step 1 (Varinfo_gds) 90 | ########################################################################## 91 | 92 | ## open GDS 93 | genofile <- seqOpen(gds.file, readonly = FALSE) 94 | 95 | genofile 96 | 97 | CHR <- as.numeric(seqGetData(genofile, "chromosome")) 98 | position <- as.integer(seqGetData(genofile, "position")) 99 | REF <- as.character(seqGetData(genofile, "$ref")) 100 | ALT <- as.character(seqGetData(genofile, "$alt")) 101 | 102 | VarInfo_genome <- paste0(CHR,"-",position,"-",REF,"-",ALT) 103 | 104 | ## Generate VarInfo 105 | for(kk in 1:dim(DB_info)[1]) 106 | { 107 | print(kk) 108 | VarInfo <- VarInfo_genome[(position>=DB_info$Start_Pos[kk])&(position<=DB_info$End_Pos[kk])] 109 | VarInfo <- data.frame(VarInfo) 110 | write.csv(VarInfo,paste0(output_path,"chr",chr,"/VarInfo_chr",chr,"_",kk,".csv"),quote=FALSE,row.names = FALSE) 111 | } 112 | 113 | ########################################################################## 114 | ### Step 2 (Annotate) 115 | ########################################################################## 116 | start_time <- Sys.time() 117 | chr_splitnum <- sum(DB_info$Chr==chr) 118 | 119 | for(kk in 1:chr_splitnum) 120 | { 121 | print(kk) 122 | #system(paste0(xsv," index ",DB_path,"/chr",chr,"_",kk,".csv)) 123 | system(paste0(xsv," join --left VarInfo ",output_path,"chr",chr,"/VarInfo_chr",chr,"_",kk,".csv variant_vcf ",DB_path,"/chr",chr,"_",kk,".csv > ",output_path,"chr",chr,"/Anno_chr",chr,"_",kk,".csv")) 124 | } 125 | 126 | ## merge info 127 | Anno <- paste0(output_path,"chr",chr,"/Anno_chr",chr,"_",seq(1:chr_splitnum),".csv ") 128 | merge_command <- paste0(xsv," cat rows ",Anno[1]) 129 | for(kk in 2:chr_splitnum) 130 | { 131 | merge_command <- paste0(merge_command,Anno[kk]) 132 | } 133 | merge_command <- paste0(merge_command,"> ",output_path,"chr",chr,"/Anno_chr",chr,".csv") 134 | system(merge_command) 135 | 136 | ########################################################################## 137 | ### Step 3 (gds2agds) 138 | ########################################################################## 139 | ### read annotation data 140 | FunctionalAnnotation <- read_csv(paste0(dir_anno,"chr",chr,"/Anno_chr",chr,".csv")) 141 | dim(FunctionalAnnotation) 142 | 143 | Anno.folder <- index.gdsn(genofile, "annotation/info") 144 | add.gdsn(Anno.folder, "FAVORFullDB", val=FunctionalAnnotation, compress="LZMA_ra", closezip=TRUE) 145 | genofile 146 | 147 | seqClose(genofile) 148 | 
end_time <- Sys.time() 149 | 150 | print("time") 151 | end_time - start_time 152 | 153 | #system(paste0("mv ", gds.file, " ", outfile, ".gds")) 154 | 155 | -------------------------------------------------------------------------------- /Scripts/CSV/FAVORannotatorv2aGDS.r: -------------------------------------------------------------------------------- 1 | rm(list=ls()) 2 | gc() 3 | ### R package 4 | library(gdsfmt) 5 | library(SeqArray) 6 | library(readr) 7 | source('config.R') 8 | 9 | CHRN <- as.numeric(commandArgs(TRUE)[1]) 10 | 11 | ### make directory 12 | system(paste0("mkdir ",output_path,"/chr",CHRN)) 13 | start_time<-Sys.time() 14 | 15 | ### chromosome number 16 | ## read info 17 | DB_info <- read.csv(file_DBsplit,header=TRUE) 18 | chr_splitnum <- sum(DB_info$Chr==CHRN) 19 | DB_info_chr <- DB_info[DB_info$Chr==CHRN,] 20 | 21 | ## open GDS 22 | genofile<-seqOpen(eval(parse(text = paste0("gds.chr",CHRN,".fn"))), readonly = FALSE) 23 | 24 | CHR <- as.numeric(seqGetData(genofile, "chromosome")) 25 | position <- as.integer(seqGetData(genofile, "position")) 26 | REF <- as.character(seqGetData(genofile, "$ref")) 27 | ALT <- as.character(seqGetData(genofile, "$alt")) 28 | 29 | VarInfo_genome <- paste0(CHR,"-",position,"-",REF,"-",ALT) 30 | 31 | ## Generate VarInfo 32 | for(kk in 1:dim(DB_info_chr)[1]) 33 | { 34 | print(kk) 35 | VarInfo <- VarInfo_genome[(position>=DB_info_chr$Start_Pos[kk])&(position<=DB_info_chr$End_Pos[kk])] 36 | VarInfo <- data.frame(VarInfo) 37 | write.csv(VarInfo,paste0(output_path,"/chr",CHRN,"/VarInfo_chr",CHRN,"_",kk,".csv"),quote=FALSE,row.names = FALSE) 38 | } 39 | gc() 40 | 41 | for(kk in 1:chr_splitnum) 42 | { 43 | print(kk) 44 | system(paste0(xsv," join --left VarInfo ",output_path,"/chr",CHRN,"/VarInfo_chr",CHRN,"_",kk,".csv variant_vcf ",DB_path,"/chr",CHRN,"_",kk,".csv > ",output_path,"/chr",CHRN,"/Anno_chr",CHRN,"_",kk,".csv")) 45 | } 46 | 47 | ## merge info 48 | Anno <- paste0(output_path,"/chr",CHRN,"/Anno_chr",CHRN,"_",seq(1:chr_splitnum),".csv ") 49 | merge_command <- paste0(xsv," cat rows ",Anno[1]) 50 | 51 | for(kk in 2:chr_splitnum) 52 | { 53 | merge_command <- paste0(merge_command,Anno[kk]) 54 | } 55 | 56 | merge_command <- paste0(merge_command,"> ",output_path,"/chr",CHRN,"/Anno_chr",CHRN,".csv") 57 | system(merge_command) 58 | 59 | gc() 60 | ### read annotation data 61 | FunctionalAnnotation <- read_csv(paste0(output_path,"/chr",CHRN,"/Anno_chr",CHRN,".csv")) 62 | 63 | dim(FunctionalAnnotation) 64 | 65 | 66 | ## open GDS 67 | Anno.folder <- addfolder.gdsn(index.gdsn(genofile, "annotation/info"), "FunctionalAnnotation") 68 | add.gdsn(Anno.folder, "FunctionalAnnotation", val=FunctionalAnnotation, compress="LZMA_ra", closezip=TRUE) 69 | genofile 70 | 71 | seqClose(genofile) 72 | end_time<-Sys.time() 73 | 74 | print("time:") 75 | end_time - start_time 76 | -------------------------------------------------------------------------------- /Scripts/CSV/config.R: -------------------------------------------------------------------------------- 1 | ### DB split information 2 | file_DBsplit <- "../SQL/FAVORdatabase_chrsplit.csv" 3 | 4 | ### output 5 | output_path <- "../../../Output" 6 | 7 | ### xsv directory 8 | xsv <- "~/.cargo/bin/xsv" 9 | 10 | ### DB file 11 | DB_path <- "../../../FullDB/FAVORDB/" 12 | 13 | #---------chr1----------------------- 14 | vcf.chr1.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr1.vcf" 15 | gds.chr1.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr1.agds" 16 | #---------chr2----------------------- 17 | 
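## Note on how these entries are consumed (illustrative, commented out): the
## vcf.chrN.fn / gds.chrN.fn names defined in this file are looked up by name at
## run time, e.g. convertVCFtoGDS.r and FAVORannotatorv2aGDS.r call
## eval(parse(text = paste0("gds.chr", CHRN, ".fn"))). An equivalent lookup with
## get() is sketched below; CHRN is only an example value here, not something this
## config defines.
# CHRN <- 22
# vcf.fn <- get(paste0("vcf.chr", CHRN, ".fn"))   # same value eval(parse(...)) would return
# gds.fn <- get(paste0("gds.chr", CHRN, ".fn"))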
vcf.chr2.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr2.vcf" 18 | gds.chr2.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr2.agds" 19 | #---------chr3----------------------- 20 | vcf.chr3.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr3.vcf" 21 | gds.chr3.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr3.agds" 22 | #---------chr4----------------------- 23 | vcf.chr4.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr4.vcf" 24 | gds.chr4.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr4.agds" 25 | #---------chr5----------------------- 26 | vcf.chr5.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr5.vcf" 27 | gds.chr5.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr5.agds" 28 | #---------chr6----------------------- 29 | vcf.chr6.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr6.vcf" 30 | gds.chr6.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr6.agds" 31 | #---------chr7----------------------- 32 | vcf.chr7.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr7.vcf" 33 | gds.chr7.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr7.agds" 34 | #---------chr8----------------------- 35 | vcf.chr8.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr8.vcf" 36 | gds.chr8.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr8.agds" 37 | #---------chr9----------------------- 38 | vcf.chr9.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr9.vcf" 39 | gds.chr9.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr9.agds" 40 | #---------chr10----------------------- 41 | vcf.chr10.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr10.vcf" 42 | gds.chr10.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr10.agds" 43 | #---------chr10----------------------- 44 | vcf.chr10.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr10.vcf" 45 | gds.chr10.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr10.agds" 46 | #---------chr11----------------------- 47 | vcf.chr11.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr11.vcf" 48 | gds.chr11.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr11.agds" 49 | #---------chr11----------------------- 50 | vcf.chr11.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr11.vcf" 51 | gds.chr11.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr11.agds" 52 | #---------chr12----------------------- 53 | vcf.chr12.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr12.vcf" 54 | gds.chr12.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr12.agds" 55 | #---------chr13----------------------- 56 | vcf.chr13.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr13.vcf" 57 | gds.chr13.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr13.agds" 58 | #---------chr14----------------------- 59 | vcf.chr14.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr14.vcf" 60 | gds.chr14.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr14.agds" 61 | #---------chr15----------------------- 62 | vcf.chr15.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr15.vcf" 63 | gds.chr15.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr15.agds" 64 | #---------chr16----------------------- 65 | vcf.chr16.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr16.vcf" 66 | gds.chr16.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr16.agds" 67 | #---------chr17----------------------- 68 | vcf.chr17.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr17.vcf" 69 | gds.chr17.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr17.agds" 70 | #---------chr18----------------------- 71 | vcf.chr18.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr18.vcf" 72 | gds.chr18.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr18.agds" 73 | 
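## The chr1-chr22 entries in this file all follow the same naming pattern; a
## compact, optional sketch (commented out) of how the same variables could be
## generated in a loop, assuming the gnomAD v3.1.2 file naming used above holds
## for every chromosome:
# for (N in 1:22)
# {
#   assign(paste0("vcf.chr", N, ".fn"), paste0("../../../Data/gnomad.genomes.v3.1.2.sites.chr", N, ".vcf"))
#   assign(paste0("gds.chr", N, ".fn"), paste0("../../../Data/gnomad.genomes.v3.1.2.sites.chr", N, ".agds"))
# }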
#---------chr19----------------------- 74 | vcf.chr19.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr19.vcf" 75 | gds.chr19.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr19.agds" 76 | #---------chr20----------------------- 77 | vcf.chr20.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr20.vcf" 78 | gds.chr20.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr20.agds" 79 | #---------chr21----------------------- 80 | vcf.chr21.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr21.vcf" 81 | gds.chr21.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr21.agds" 82 | #---------chr22----------------------- 83 | vcf.chr22.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr22.vcf" 84 | gds.chr22.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr22.agds" 85 | 86 | -------------------------------------------------------------------------------- /Scripts/CSV/convertVCFtoGDS.r: -------------------------------------------------------------------------------- 1 | ############################################################################# 2 | #Title: convertVCFtoGDS 3 | #Function: 4 | # * Build the GDS file from VCF files 5 | #Author: Hufeng Zhou 6 | #Time: Nov 27th 2021 7 | ############################################################################# 8 | library(gdsfmt) 9 | library(SeqArray) 10 | 11 | #import configuration file 12 | source('config.R') 13 | 14 | #vcf.chr10.fn=as.character(commandArgs(TRUE)[1]) 15 | #gds.chr10.fn=as.character(commandArgs(TRUE)[2]) 16 | CHRN=as.character(commandArgs(TRUE)[1]) 17 | seqVCF2GDS(eval(parse(text = paste0("vcf.chr",CHRN,".fn"))), eval(parse(text = paste0("gds.chr",CHRN,".fn"))), header = NULL, genotype.var.name = "GT", info.import=NULL, fmt.import=NULL, ignore.chr.prefix="chr", raise.error=TRUE, verbose=TRUE) 18 | genofile<-seqOpen(eval(parse(text = paste0("gds.chr",CHRN,".fn"))), readonly = FALSE) 19 | print("GDS built") 20 | 21 | ###Closing Up### 22 | genofile 23 | seqClose(genofile) 24 | -------------------------------------------------------------------------------- /Scripts/CSV/subBatchJobs.sh: -------------------------------------------------------------------------------- 1 | ## define your array 2 | MYFILES=(1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22) 3 | echo ${MYFILES[*]} 4 | ## count how many elements I have 5 | NUM=${#MYFILES[@]} 6 | ZBNUM=$(($NUM -1 )) 7 | 8 | if [ $ZBNUM -ge 0 ]; then 9 | ## not very elegant workaround to the array export issue: 10 | ## package the array into a string with a specific FS 11 | STRINGFILES=$( IFS=$','; echo "${MYFILES[*]}" ) 12 | 13 | printf "STRINGFILES = " 14 | echo $STRINGFILES 15 | 16 | export STRINGFILES 17 | 18 | ## example of how to reconvert into an array inside the slurm file 19 | #IFS=',' read -r -a MYNEWFILES <<< "$STRINGFILES" 20 | #myiid=2 21 | #CURRENTFILE=${MYNEWFILES[$myiid]} 22 | #echo "currentfile is $CURRENTFILE " 23 | 24 | ## submit job 25 | sbatch --array=0-$ZBNUM subBatchJobs.txt 26 | fi 27 | 28 | -------------------------------------------------------------------------------- /Scripts/CSV/subBatchJobs.txt: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -c 1 3 | #SBATCH -N 1 4 | #SBATCH -t 10000 5 | #SBATCH -p shared,xlin,xlin-lab 6 | #SBATCH --mem=128000 7 | #SBATCH -o array_%A_%a.out 8 | #SBATCH -e array_%A_%a.err 9 | #SBATCH --mail-type=ALL 10 | 11 | ## expand back variable into array 12 | IFS=',' read -r -a MYFILES <<< "$STRINGFILES" 13 | cur=${MYFILES[$SLURM_ARRAY_TASK_ID]} 14 | 15 | . 
~/.bash_profile 16 | module load R/4.0.2-fasrc01 17 | 18 | cd /directory/aGDSFolder # put all the aGDS files to be annotated in the aGDSFolder 19 | 20 | Rscript FAVORannotatorCSVFullDB.R InputData.chr$cur.agds $cur 21 | 22 | Rscript FAVORannotatorCSVEssentialDB.R InputData.chr$cur.agds $cur 23 | 24 | -------------------------------------------------------------------------------- /Scripts/CSV/submitJobs.sh: -------------------------------------------------------------------------------- 1 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=65000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 1; Rscript ./FAVORannotatorv2aGDS.r 1' 2 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=65000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 2; Rscript ./FAVORannotatorv2aGDS.r 2' 3 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=60000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 3; Rscript ./FAVORannotatorv2aGDS.r 3' 4 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=60000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 4; Rscript ./FAVORannotatorv2aGDS.r 4' 5 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=55000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 5; Rscript ./FAVORannotatorv2aGDS.r 5' 6 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=50000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 6; Rscript ./FAVORannotatorv2aGDS.r 6' 7 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=50000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 7; Rscript ./FAVORannotatorv2aGDS.r 7' 8 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=50000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 8; Rscript ./FAVORannotatorv2aGDS.r 8' 9 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=40000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 9; Rscript ./FAVORannotatorv2aGDS.r 9' 10 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=40000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 10; Rscript ./FAVORannotatorv2aGDS.r 10' 11 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=40000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 11; Rscript ./FAVORannotatorv2aGDS.r 11' 12 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=40000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 12; Rscript ./FAVORannotatorv2aGDS.r 12' 13 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=40000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 13; Rscript ./FAVORannotatorv2aGDS.r 13' 14 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=35000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 14; Rscript ./FAVORannotatorv2aGDS.r 14' 15 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=35000 --wrap='module load 
postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 15; Rscript ./FAVORannotatorv2aGDS.r 15' 16 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=30000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 16; Rscript ./FAVORannotatorv2aGDS.r 16' 17 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=30000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 17; Rscript ./FAVORannotatorv2aGDS.r 17' 18 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=30000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 18; Rscript ./FAVORannotatorv2aGDS.r 18' 19 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=30000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 19; Rscript ./FAVORannotatorv2aGDS.r 19' 20 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=20000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 20; Rscript ./FAVORannotatorv2aGDS.r 20' 21 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=20000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 21; Rscript ./FAVORannotatorv2aGDS.r 21' 22 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=20000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 22; Rscript ./FAVORannotatorv2aGDS.r 22' 23 | -------------------------------------------------------------------------------- /Scripts/Cloud/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Scripts/Cloud/.DS_Store -------------------------------------------------------------------------------- /Scripts/Cloud/._.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Scripts/Cloud/._.DS_Store -------------------------------------------------------------------------------- /Scripts/Cloud/DNAnexus/._FAVORannotatorDev.R: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Scripts/Cloud/DNAnexus/._FAVORannotatorDev.R -------------------------------------------------------------------------------- /Scripts/Cloud/DNAnexus/._code.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Scripts/Cloud/DNAnexus/._code.sh -------------------------------------------------------------------------------- /Scripts/Cloud/DNAnexus/._favorannotator.R: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Scripts/Cloud/DNAnexus/._favorannotator.R -------------------------------------------------------------------------------- /Scripts/Cloud/DNAnexus/FAVORannotatorDev.R: -------------------------------------------------------------------------------- 1 | args <- commandArgs(TRUE) 2 | ### mandatory 3 | outfile <- 
args[1] 4 | gds.file <- args[2] 5 | chr <- as.numeric(args[3]) 6 | #chr<-19 7 | use_compression <- args[4] 8 | 9 | 10 | 11 | ########################################################################## 12 | ### Step 0 (Download FAVOR Database) 13 | ########################################################################## 14 | URLs <- data.frame(chr = c(1:22), 15 | URL = c("https://dataverse.harvard.edu/api/access/datafile/6170506", 16 | "https://dataverse.harvard.edu/api/access/datafile/6170501", 17 | "https://dataverse.harvard.edu/api/access/datafile/6170502", 18 | "https://dataverse.harvard.edu/api/access/datafile/6170521", 19 | "https://dataverse.harvard.edu/api/access/datafile/6170511", 20 | "https://dataverse.harvard.edu/api/access/datafile/6170516", 21 | "https://dataverse.harvard.edu/api/access/datafile/6170505", 22 | "https://dataverse.harvard.edu/api/access/datafile/6170513", 23 | "https://dataverse.harvard.edu/api/access/datafile/6165867", 24 | "https://dataverse.harvard.edu/api/access/datafile/6170507", 25 | "https://dataverse.harvard.edu/api/access/datafile/6170517", 26 | "https://dataverse.harvard.edu/api/access/datafile/6170520", 27 | "https://dataverse.harvard.edu/api/access/datafile/6170503", 28 | "https://dataverse.harvard.edu/api/access/datafile/6170509", 29 | "https://dataverse.harvard.edu/api/access/datafile/6170515", 30 | "https://dataverse.harvard.edu/api/access/datafile/6170518", 31 | "https://dataverse.harvard.edu/api/access/datafile/6170510", 32 | "https://dataverse.harvard.edu/api/access/datafile/6170508", 33 | "https://dataverse.harvard.edu/api/access/datafile/6170514", 34 | "https://dataverse.harvard.edu/api/access/datafile/6170512", 35 | "https://dataverse.harvard.edu/api/access/datafile/6170519", 36 | "https://dataverse.harvard.edu/api/access/datafile/6170504")) 37 | 38 | URL <- URLs[chr, "URL"] 39 | system(paste0("wget --progress=bar:force:noscroll ", URLs[chr, "URL"])) 40 | system(paste0("tar -xvf ", gsub(".*?([0-9]+).*", "\\1", URL))) 41 | 42 | ########################################################################## 43 | ### Step 1 (Varinfo_gds) 44 | ########################################################################## 45 | 46 | ### output 47 | output_path <- "/root/./" 48 | 49 | ### make directory 50 | system(paste0("mkdir ",output_path,"chr",chr)) 51 | 52 | ### R package 53 | library(gdsfmt) 54 | library(SeqArray) 55 | library(SeqVarTools) 56 | 57 | ### chromosome number 58 | ## read info 59 | DB_info <- read.csv(url("https://raw.githubusercontent.com/xihaoli/STAARpipeline-Tutorial/main/FAVORannotator_csv/FAVORdatabase_chrsplit.csv"),header=TRUE) 60 | DB_info <- DB_info[DB_info$Chr==chr,] 61 | 62 | ## open GDS 63 | genofile <- seqOpen(gds.file) 64 | 65 | genofile 66 | 67 | CHR <- as.numeric(seqGetData(genofile, "chromosome")) 68 | position <- as.integer(seqGetData(genofile, "position")) 69 | REF <- as.character(seqGetData(genofile, "$ref")) 70 | ALT <- as.character(seqGetData(genofile, "$alt")) 71 | 72 | VarInfo_genome <- paste0(CHR,"-",position,"-",REF,"-",ALT) 73 | 74 | seqClose(genofile) 75 | 76 | ## Generate VarInfo 77 | for(kk in 1:dim(DB_info)[1]) 78 | { 79 | print(kk) 80 | 81 | VarInfo <- VarInfo_genome[(position>=DB_info$Start_Pos[kk])&(position<=DB_info$End_Pos[kk])] 82 | VarInfo <- data.frame(VarInfo) 83 | 84 | write.csv(VarInfo,paste0(output_path,"chr",chr,"/VarInfo_chr",chr,"_",kk,".csv"),quote=FALSE,row.names = FALSE) 85 | } 86 | 87 | ########################################################################## 88 | ### Step 2 (Annotate) 
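## Step 2 shells out to xsv to left-join the per-chunk variant list (key column
## "VarInfo") against the matching FAVOR database chunk (key column "variant_vcf").
## For orientation, a sketch of the command string the loop below builds; chr = 19
## and kk = 1 are example values only, and xsv, output_path and DB_path are the
## variables defined elsewhere in this script:
# system(paste0(xsv, " join --left VarInfo ",
#               output_path, "chr19/VarInfo_chr19_1.csv variant_vcf ",
#               DB_path, "/chr19_1.csv > ",
#               output_path, "chr19/Anno_chr19_1.csv"))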
89 | ########################################################################## 90 | 91 | ### xsv directory 92 | xsv <- "/root/.cargo/bin/xsv" 93 | 94 | ### DB file 95 | DB_path <- "n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/" 96 | 97 | ### anno channel (subset) 98 | anno_colnum <- c(1,8:12,15,16,19,23,25:36) 99 | 100 | chr_splitnum <- sum(DB_info$Chr==chr) 101 | 102 | for(kk in 1:chr_splitnum) 103 | { 104 | print(kk) 105 | 106 | system(paste0(xsv," join --left VarInfo ",output_path,"chr",chr,"/VarInfo_chr",chr,"_",kk,".csv variant_vcf ",DB_path,"/chr",chr,"_",kk,".csv > ",output_path,"chr",chr,"/Anno_chr",chr,"_",kk,".csv")) 107 | } 108 | 109 | ## merge info 110 | Anno <- paste0(output_path,"chr",chr,"/Anno_chr",chr,"_",seq(1:chr_splitnum),".csv ") 111 | merge_command <- paste0(xsv," cat rows ",Anno[1]) 112 | 113 | for(kk in 2:chr_splitnum) 114 | { 115 | merge_command <- paste0(merge_command,Anno[kk]) 116 | } 117 | 118 | merge_command <- paste0(merge_command,"> ",output_path,"chr",chr,"/Anno_chr",chr,".csv") 119 | 120 | system(merge_command) 121 | 122 | ## subset 123 | anno_colnum_xsv <- c() 124 | for(kk in 1:(length(anno_colnum)-1)) 125 | { 126 | anno_colnum_xsv <- paste0(anno_colnum_xsv,anno_colnum[kk],",") 127 | } 128 | anno_colnum_xsv <- paste0(anno_colnum_xsv,anno_colnum[length(anno_colnum)]) 129 | 130 | system(paste0(xsv," select ",anno_colnum_xsv," ",output_path,"chr",chr,"/Anno_chr",chr,".csv > ",output_path,"chr",chr,"/Anno_chr",chr,"_STAARpipeline.csv")) 131 | 132 | ########################################################################## 133 | ### Step 3 (gds2agds) 134 | ########################################################################## 135 | 136 | ### annotation file 137 | dir_anno <- "/root/" 138 | anno_file_name_1 <- "Anno_chr" 139 | anno_file_name_2 <- "_STAARpipeline.csv" 140 | 141 | ### load required package 142 | library(gdsfmt) 143 | library(SeqArray) 144 | library(SeqVarTools) 145 | library(readr) 146 | 147 | ### read annotation data 148 | FunctionalAnnotation <- read_csv(paste0(dir_anno,"chr",chr,"/",anno_file_name_1,chr,anno_file_name_2), 149 | col_types=list(col_character(),col_double(),col_double(),col_double(),col_double(), 150 | col_double(),col_double(),col_double(),col_double(),col_double(), 151 | col_character(),col_character(),col_character(),col_double(),col_character(), 152 | col_character(),col_character(),col_character(),col_character(),col_double(), 153 | col_double(),col_character())) 154 | 155 | dim(FunctionalAnnotation) 156 | 157 | ## rename colnames 158 | colnames(FunctionalAnnotation)[2] <- "apc_conservation" 159 | colnames(FunctionalAnnotation)[7] <- "apc_local_nucleotide_diversity" 160 | colnames(FunctionalAnnotation)[9] <- "apc_protein_function" 161 | 162 | ## open GDS 163 | genofile <- seqOpen(gds.file, readonly = FALSE) 164 | 165 | #Anno.folder <- addfolder.gdsn(index.gdsn(genofile, "annotation/info"), "FunctionalAnnotation") 166 | Anno.folder <- index.gdsn(genofile, "annotation/info") 167 | if(use_compression == "YES") 168 | { 169 | add.gdsn(Anno.folder, "FunctionalAnnotation", val=FunctionalAnnotation, compress="LZMA_ra", closezip=TRUE) 170 | }else 171 | { 172 | add.gdsn(Anno.folder, "FunctionalAnnotation", val=FunctionalAnnotation) 173 | } 174 | 175 | seqClose(genofile) 176 | 177 | system(paste0("mv ", gds.file, " ", outfile, ".gds")) 178 | 179 | 180 | 181 | print(args) -------------------------------------------------------------------------------- /Scripts/Cloud/DNAnexus/code.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # favorannotator_v1.0.0 3 | # Generated by dx-app-wizard. 4 | # 5 | # Basic execution pattern: Your app will run on a single machine from 6 | # beginning to end. 7 | # 8 | # Your job's input variables (if any) will be loaded as environment 9 | # variables before this script runs. Any array inputs will be loaded 10 | # as bash arrays. 11 | # 12 | # Any code outside of main() (or any entry point you may add) is 13 | # ALWAYS executed, followed by running the entry point itself. 14 | # 15 | # See https://documentation.dnanexus.com/developer for tutorials on how 16 | # to modify this file. 17 | 18 | main() { 19 | 20 | echo "Value of outfile: '$outfile'" 21 | echo "Value of gds_file: '$gds_file'" 22 | echo "Value of chromosome: '$chromosome'" 23 | echo "Value of use_compression: '$use_compression'" 24 | 25 | # The following line(s) use the dx command-line tool to download your file 26 | # inputs to the local file system using variable names for the filenames. To 27 | # recover the original filenames, you can use the output of "dx describe 28 | # "$variable" --name". 29 | 30 | if [ -n "$gds_file" ] 31 | then 32 | dx download "$gds_file" -o gds_file.gds & 33 | gds_file2="gds_file.gds" 34 | else 35 | gds_file2="NO_GDS_FILE" 36 | fi 37 | 38 | echo "Installing xsv" 39 | curl https://sh.rustup.rs -sSf | sh -s -- -y 40 | source $HOME/.cargo/env 41 | cargo install xsv 42 | 43 | echo "Rscript --vanilla favorannotator.R $outfile $gds_file2 $chromosome $use_compression" 44 | dx-docker run -v /home/dnanexus/:/home/dnanexus/ -w /home/dnanexus/ zilinli/staarpipeline:0.9.6 Rscript --vanilla favorannotator.R $outfile $gds_file2 $chromosome $use_compression 45 | mkdir -p out/results 46 | mv ${outfile}.gds out/results 47 | dx-upload-all-outputs 48 | } 49 | 50 | -------------------------------------------------------------------------------- /Scripts/Cloud/DNAnexus/favorannotator.R: -------------------------------------------------------------------------------- 1 | args <- commandArgs(TRUE) 2 | ### mandatory 3 | outfile <- args[1] 4 | gds.file <- args[2] 5 | #chr <- as.numeric(args[3]) 6 | chr<-19 7 | use_compression <- args[4] 8 | 9 | ########################################################################## 10 | ### Step 0 (Download FAVOR Database) 11 | ########################################################################## 12 | URLs <- data.frame(chr = c(1:22), 13 | URL = c("https://dataverse.harvard.edu/api/access/datafile/6170506", 14 | "https://dataverse.harvard.edu/api/access/datafile/6170501", 15 | "https://dataverse.harvard.edu/api/access/datafile/6170502", 16 | "https://dataverse.harvard.edu/api/access/datafile/6170521", 17 | "https://dataverse.harvard.edu/api/access/datafile/6170511", 18 | "https://dataverse.harvard.edu/api/access/datafile/6170516", 19 | "https://dataverse.harvard.edu/api/access/datafile/6170505", 20 | "https://dataverse.harvard.edu/api/access/datafile/6170513", 21 | "https://dataverse.harvard.edu/api/access/datafile/6165867", 22 | "https://dataverse.harvard.edu/api/access/datafile/6170507", 23 | "https://dataverse.harvard.edu/api/access/datafile/6170517", 24 | "https://dataverse.harvard.edu/api/access/datafile/6170520", 25 | "https://dataverse.harvard.edu/api/access/datafile/6170503", 26 | "https://dataverse.harvard.edu/api/access/datafile/6170509", 27 | "https://dataverse.harvard.edu/api/access/datafile/6170515", 28 | "https://dataverse.harvard.edu/api/access/datafile/6170518", 29 
| "https://dataverse.harvard.edu/api/access/datafile/6170510", 30 | "https://dataverse.harvard.edu/api/access/datafile/6170508", 31 | "https://dataverse.harvard.edu/api/access/datafile/6170514", 32 | "https://dataverse.harvard.edu/api/access/datafile/6170512", 33 | "https://dataverse.harvard.edu/api/access/datafile/6170519", 34 | "https://dataverse.harvard.edu/api/access/datafile/6170504")) 35 | 36 | URL <- URLs[chr, "URL"] 37 | system(paste0("wget --progress=bar:force:noscroll ", URLs[chr, "URL"])) 38 | system(paste0("tar -xvf ", gsub(".*?([0-9]+).*", "\\1", URL))) 39 | 40 | ########################################################################## 41 | ### Step 1 (Varinfo_gds) 42 | ########################################################################## 43 | 44 | ### output 45 | output_path <- "./" 46 | 47 | ### make directory 48 | system(paste0("mkdir ",output_path,"chr",chr)) 49 | 50 | ### R package 51 | library(gdsfmt) 52 | library(SeqArray) 53 | library(SeqVarTools) 54 | 55 | ### chromosome number 56 | ## read info 57 | DB_info <- read.csv(url("https://raw.githubusercontent.com/xihaoli/STAARpipeline-Tutorial/main/FAVORannotator_csv/FAVORdatabase_chrsplit.csv"),header=TRUE) 58 | DB_info <- DB_info[DB_info$Chr==chr,] 59 | 60 | ## open GDS 61 | genofile <- seqOpen(gds.file) 62 | 63 | CHR <- as.numeric(seqGetData(genofile, "chromosome")) 64 | position <- as.integer(seqGetData(genofile, "position")) 65 | REF <- as.character(seqGetData(genofile, "$ref")) 66 | ALT <- as.character(seqGetData(genofile, "$alt")) 67 | 68 | VarInfo_genome <- paste0(CHR,"-",position,"-",REF,"-",ALT) 69 | 70 | seqClose(genofile) 71 | 72 | ## Generate VarInfo 73 | for(kk in 1:dim(DB_info)[1]) 74 | { 75 | print(kk) 76 | 77 | VarInfo <- VarInfo_genome[(position>=DB_info$Start_Pos[kk])&(position<=DB_info$End_Pos[kk])] 78 | VarInfo <- data.frame(VarInfo) 79 | 80 | write.csv(VarInfo,paste0(output_path,"chr",chr,"/VarInfo_chr",chr,"_",kk,".csv"),quote=FALSE,row.names = FALSE) 81 | } 82 | 83 | ########################################################################## 84 | ### Step 2 (Annotate) 85 | ########################################################################## 86 | 87 | ### xsv directory 88 | xsv <- ".cargo/bin/xsv" 89 | 90 | ### DB file 91 | DB_path <- "n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/" 92 | 93 | ### anno channel (subset) 94 | anno_colnum <- c(1,8:12,15,16,19,23,25:36) 95 | 96 | chr_splitnum <- sum(DB_info$Chr==chr) 97 | 98 | for(kk in 1:chr_splitnum) 99 | { 100 | print(kk) 101 | 102 | system(paste0(xsv," join --left VarInfo ",output_path,"chr",chr,"/VarInfo_chr",chr,"_",kk,".csv variant_vcf ",DB_path,"/chr",chr,"_",kk,".csv > ",output_path,"chr",chr,"/Anno_chr",chr,"_",kk,".csv")) 103 | } 104 | 105 | ## merge info 106 | Anno <- paste0(output_path,"chr",chr,"/Anno_chr",chr,"_",seq(1:chr_splitnum),".csv ") 107 | merge_command <- paste0(xsv," cat rows ",Anno[1]) 108 | 109 | for(kk in 2:chr_splitnum) 110 | { 111 | merge_command <- paste0(merge_command,Anno[kk]) 112 | } 113 | 114 | merge_command <- paste0(merge_command,"> ",output_path,"chr",chr,"/Anno_chr",chr,".csv") 115 | 116 | system(merge_command) 117 | 118 | ## subset 119 | anno_colnum_xsv <- c() 120 | for(kk in 1:(length(anno_colnum)-1)) 121 | { 122 | anno_colnum_xsv <- paste0(anno_colnum_xsv,anno_colnum[kk],",") 123 | } 124 | anno_colnum_xsv <- paste0(anno_colnum_xsv,anno_colnum[length(anno_colnum)]) 125 | 126 | system(paste0(xsv," select ",anno_colnum_xsv," ",output_path,"chr",chr,"/Anno_chr",chr,".csv > 
",output_path,"chr",chr,"/Anno_chr",chr,"_STAARpipeline.csv")) 127 | 128 | ########################################################################## 129 | ### Step 3 (gds2agds) 130 | ########################################################################## 131 | 132 | ### annotation file 133 | dir_anno <- "" 134 | anno_file_name_1 <- "Anno_chr" 135 | anno_file_name_2 <- "_STAARpipeline.csv" 136 | 137 | ### load required package 138 | library(gdsfmt) 139 | library(SeqArray) 140 | library(SeqVarTools) 141 | library(readr) 142 | 143 | ### read annotation data 144 | FunctionalAnnotation <- read_csv(paste0(dir_anno,"chr",chr,"/",anno_file_name_1,chr,anno_file_name_2), 145 | col_types=list(col_character(),col_double(),col_double(),col_double(),col_double(), 146 | col_double(),col_double(),col_double(),col_double(),col_double(), 147 | col_character(),col_character(),col_character(),col_double(),col_character(), 148 | col_character(),col_character(),col_character(),col_character(),col_double(), 149 | col_double(),col_character())) 150 | 151 | dim(FunctionalAnnotation) 152 | 153 | ## rename colnames 154 | colnames(FunctionalAnnotation)[2] <- "apc_conservation" 155 | colnames(FunctionalAnnotation)[7] <- "apc_local_nucleotide_diversity" 156 | colnames(FunctionalAnnotation)[9] <- "apc_protein_function" 157 | 158 | ## open GDS 159 | genofile <- seqOpen(gds.file, readonly = FALSE) 160 | 161 | #Anno.folder <- addfolder.gdsn(index.gdsn(genofile, "annotation/info"), "FunctionalAnnotation") 162 | Anno.folder <- index.gdsn(genofile, "annotation/info") 163 | if(use_compression == "YES") 164 | { 165 | add.gdsn(Anno.folder, "FunctionalAnnotation", val=FunctionalAnnotation, compress="LZMA_ra", closezip=TRUE) 166 | }else 167 | { 168 | add.gdsn(Anno.folder, "FunctionalAnnotation", val=FunctionalAnnotation) 169 | } 170 | 171 | seqClose(genofile) 172 | 173 | system(paste0("mv ", gds.file, " ", outfile, ".gds")) 174 | 175 | -------------------------------------------------------------------------------- /Scripts/Cloud/Terra/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Scripts/Cloud/Terra/.DS_Store -------------------------------------------------------------------------------- /Scripts/Cloud/Terra/.Rhistory: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Scripts/Cloud/Terra/.Rhistory -------------------------------------------------------------------------------- /Scripts/Cloud/Terra/FAVORannotatorEssentialDB.wdl: -------------------------------------------------------------------------------- 1 | workflow FAVORannotator{ 2 | 3 | File InputaGDS 4 | Int CHRN 5 | 6 | call FunctionalAnnotation { 7 | input: 8 | InputaGDS=InputaGDS, CHRN=CHRN 9 | 10 | } 11 | 12 | } 13 | 14 | task FunctionalAnnotation{ 15 | 16 | Int CHRN 17 | File InputaGDS 18 | File? 
FAVORannotator = "gs://fc-secure-38f900cb-e5ed-481d-b866-6c98b7e5e7ea/FAVORannotatorTerra.R" 19 | 20 | runtime{ 21 | docker: "zilinli/staarpipeline:0.9.6" 22 | memory: "36G" 23 | cpu: "1" 24 | zones: "us-central1-c us-central1-b" 25 | disks: "local-disk " + 500 + " HDD" 26 | preemptible: 1 27 | } 28 | 29 | command { 30 | curl https://sh.rustup.rs -sSf | sh -s -- -y 31 | source $HOME/.cargo/env 32 | cargo install xsv 33 | echo ${InputaGDS} 34 | echo ${CHRN} 35 | df -a -h 36 | Rscript ${FAVORannotator} ${InputaGDS} ${CHRN} 37 | echo "Finished: in wdl r scripts" 38 | df -a -h 39 | mv ${InputaGDS} AnnotatedOutput.${CHRN}.agds 40 | } 41 | 42 | output { 43 | File OutputResults = "AnnotatedOutput.${CHRN}.agds" 44 | } 45 | } 46 | 47 | -------------------------------------------------------------------------------- /Scripts/Cloud/Terra/FAVORannotatorFullDB.wdl: -------------------------------------------------------------------------------- 1 | workflow FAVORannotator{ 2 | 3 | File InputaGDS 4 | Int CHRN 5 | 6 | call FunctionalAnnotation { 7 | input: 8 | InputaGDS=InputaGDS, CHRN=CHRN 9 | } 10 | 11 | } 12 | 13 | task FunctionalAnnotation{ 14 | 15 | Int CHRN 16 | File InputaGDS 17 | File? FAVORannotator = "gs://fc-secure-38f900cb-e5ed-481d-b866-6c98b7e5e7ea/FAVORannotatorTerraFullDB.R" 18 | 19 | runtime{ 20 | docker: "zilinli/staarpipeline:0.9.6" 21 | memory: "56G" 22 | cpu: "1" 23 | zones: "us-central1-c us-central1-b" 24 | disks: "local-disk " + 500 + " HDD" 25 | preemptible: 1 26 | } 27 | 28 | command { 29 | curl https://sh.rustup.rs -sSf | sh -s -- -y 30 | source $HOME/.cargo/env 31 | cargo install xsv 32 | echo ${InputaGDS} 33 | echo ${CHRN} 34 | df -a -h 35 | Rscript ${FAVORannotator} ${InputaGDS} ${CHRN} 36 | echo "Finished: in wdl r scripts" 37 | df -a -h 38 | mv ${InputaGDS} AnnotatedOutput.${CHRN}.agds 39 | } 40 | 41 | output { 42 | File OutputResults = "AnnotatedOutput.${CHRN}.agds" 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /Scripts/Cloud/Terra/FAVORannotatorTerra.r: -------------------------------------------------------------------------------- 1 | args <- commandArgs(TRUE) 2 | ### mandatory 3 | 4 | gds.file <- args[1] 5 | print(paste0("gds.file: ",gds.file)) 6 | 7 | #outfile <- args[2] 8 | #print(paste0("outfile: ",outfile)) 9 | 10 | chr <- as.numeric(args[2]) 11 | print(paste0("chr: ",chr)) 12 | #chr<-19 13 | 14 | #use_compression <- args[4] 15 | use_compression <- "Yes" 16 | print(paste0("use_compression: ",use_compression)) 17 | 18 | ########################################################################## 19 | ### Step 0 (Download FAVOR Database) 20 | ########################################################################## 21 | URLs <- data.frame(chr = c(1:22), 22 | URL = c("https://dataverse.harvard.edu/api/access/datafile/6170506", 23 | "https://dataverse.harvard.edu/api/access/datafile/6170501", 24 | "https://dataverse.harvard.edu/api/access/datafile/6170502", 25 | "https://dataverse.harvard.edu/api/access/datafile/6170521", 26 | "https://dataverse.harvard.edu/api/access/datafile/6170511", 27 | "https://dataverse.harvard.edu/api/access/datafile/6170516", 28 | "https://dataverse.harvard.edu/api/access/datafile/6170505", 29 | "https://dataverse.harvard.edu/api/access/datafile/6170513", 30 | "https://dataverse.harvard.edu/api/access/datafile/6165867", 31 | "https://dataverse.harvard.edu/api/access/datafile/6170507", 32 | "https://dataverse.harvard.edu/api/access/datafile/6170517", 33 | 
"https://dataverse.harvard.edu/api/access/datafile/6170520", 34 | "https://dataverse.harvard.edu/api/access/datafile/6170503", 35 | "https://dataverse.harvard.edu/api/access/datafile/6170509", 36 | "https://dataverse.harvard.edu/api/access/datafile/6170515", 37 | "https://dataverse.harvard.edu/api/access/datafile/6170518", 38 | "https://dataverse.harvard.edu/api/access/datafile/6170510", 39 | "https://dataverse.harvard.edu/api/access/datafile/6170508", 40 | "https://dataverse.harvard.edu/api/access/datafile/6170514", 41 | "https://dataverse.harvard.edu/api/access/datafile/6170512", 42 | "https://dataverse.harvard.edu/api/access/datafile/6170519", 43 | "https://dataverse.harvard.edu/api/access/datafile/6170504")) 44 | 45 | URL <- URLs[chr, "URL"] 46 | system(paste0("wget --progress=bar:force:noscroll ", URLs[chr, "URL"])) 47 | system(paste0("tar -xvf ", gsub(".*?([0-9]+).*", "\\1", URL))) 48 | 49 | ########################################################################## 50 | ### Step 1 (Varinfo_gds) 51 | ########################################################################## 52 | 53 | ### output 54 | output_path <- "/cromwell_root/./" 55 | #output_path <- "/root/./" 56 | 57 | ### make directory 58 | system(paste0("mkdir ",output_path,"chr",chr)) 59 | 60 | ### R package 61 | library(gdsfmt) 62 | library(SeqArray) 63 | library(SeqVarTools) 64 | 65 | ### chromosome number 66 | ## read info 67 | DB_info <- read.csv(url("https://raw.githubusercontent.com/xihaoli/STAARpipeline-Tutorial/main/FAVORannotator_csv/FAVORdatabase_chrsplit.csv"),header=TRUE) 68 | DB_info <- DB_info[DB_info$Chr==chr,] 69 | 70 | ## open GDS 71 | genofile <- seqOpen(gds.file) 72 | 73 | genofile 74 | 75 | CHR <- as.numeric(seqGetData(genofile, "chromosome")) 76 | position <- as.integer(seqGetData(genofile, "position")) 77 | REF <- as.character(seqGetData(genofile, "$ref")) 78 | ALT <- as.character(seqGetData(genofile, "$alt")) 79 | 80 | VarInfo_genome <- paste0(CHR,"-",position,"-",REF,"-",ALT) 81 | 82 | seqClose(genofile) 83 | 84 | ## Generate VarInfo 85 | for(kk in 1:dim(DB_info)[1]) 86 | { 87 | print(kk) 88 | 89 | VarInfo <- VarInfo_genome[(position>=DB_info$Start_Pos[kk])&(position<=DB_info$End_Pos[kk])] 90 | VarInfo <- data.frame(VarInfo) 91 | 92 | write.csv(VarInfo,paste0(output_path,"chr",chr,"/VarInfo_chr",chr,"_",kk,".csv"),quote=FALSE,row.names = FALSE) 93 | } 94 | 95 | ########################################################################## 96 | ### Step 2 (Annotate) 97 | ########################################################################## 98 | 99 | ### xsv directory 100 | #xsv <- "/cromwell_root/.cargo/bin/xsv" 101 | xsv <- "/root/.cargo/bin/xsv" 102 | 103 | ### DB file 104 | DB_path <- "n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/" 105 | 106 | ### anno channel (subset) 107 | anno_colnum <- c(1,8:12,15,16,19,23,25:36) 108 | 109 | chr_splitnum <- sum(DB_info$Chr==chr) 110 | 111 | for(kk in 1:chr_splitnum) 112 | { 113 | print(kk) 114 | 115 | system(paste0(xsv," join --left VarInfo ",output_path,"chr",chr,"/VarInfo_chr",chr,"_",kk,".csv variant_vcf ",DB_path,"/chr",chr,"_",kk,".csv > ",output_path,"chr",chr,"/Anno_chr",chr,"_",kk,".csv")) 116 | } 117 | 118 | ## merge info 119 | Anno <- paste0(output_path,"chr",chr,"/Anno_chr",chr,"_",seq(1:chr_splitnum),".csv ") 120 | merge_command <- paste0(xsv," cat rows ",Anno[1]) 121 | 122 | for(kk in 2:chr_splitnum) 123 | { 124 | merge_command <- paste0(merge_command,Anno[kk]) 125 | } 126 | 127 | merge_command <- paste0(merge_command,"> 
",output_path,"chr",chr,"/Anno_chr",chr,".csv") 128 | 129 | system(merge_command) 130 | 131 | ## subset 132 | anno_colnum_xsv <- c() 133 | for(kk in 1:(length(anno_colnum)-1)) 134 | { 135 | anno_colnum_xsv <- paste0(anno_colnum_xsv,anno_colnum[kk],",") 136 | } 137 | anno_colnum_xsv <- paste0(anno_colnum_xsv,anno_colnum[length(anno_colnum)]) 138 | 139 | system(paste0(xsv," select ",anno_colnum_xsv," ",output_path,"chr",chr,"/Anno_chr",chr,".csv > ",output_path,"chr",chr,"/Anno_chr",chr,"_STAARpipeline.csv")) 140 | 141 | ########################################################################## 142 | ### Step 3 (gds2agds) 143 | ########################################################################## 144 | 145 | ### annotation file 146 | dir_anno <- "/cromwell_root/" 147 | #dir_anno <- "/root/" 148 | anno_file_name_1 <- "Anno_chr" 149 | anno_file_name_2 <- "_STAARpipeline.csv" 150 | 151 | ### load required package 152 | library(gdsfmt) 153 | library(SeqArray) 154 | library(SeqVarTools) 155 | library(readr) 156 | 157 | ### read annotation data 158 | FunctionalAnnotation <- read_csv(paste0(dir_anno,"chr",chr,"/",anno_file_name_1,chr,anno_file_name_2), 159 | col_types=list(col_character(),col_double(),col_double(),col_double(),col_double(), 160 | col_double(),col_double(),col_double(),col_double(),col_double(), 161 | col_character(),col_character(),col_character(),col_double(),col_character(), 162 | col_character(),col_character(),col_character(),col_character(),col_double(), 163 | col_double(),col_character())) 164 | 165 | dim(FunctionalAnnotation) 166 | 167 | ## rename colnames 168 | colnames(FunctionalAnnotation)[2] <- "apc_conservation" 169 | colnames(FunctionalAnnotation)[7] <- "apc_local_nucleotide_diversity" 170 | colnames(FunctionalAnnotation)[9] <- "apc_protein_function" 171 | 172 | ## open GDS 173 | genofile <- seqOpen(gds.file, readonly = FALSE) 174 | 175 | #Anno.folder <- addfolder.gdsn(index.gdsn(genofile, "annotation/info"), "FunctionalAnnotationTest1") 176 | Anno.folder <- index.gdsn(genofile, "annotation/info") 177 | if(use_compression == "YES") 178 | { 179 | add.gdsn(Anno.folder, "FunctionalAnnotationJun1st2022", val=FunctionalAnnotation, compress="LZMA_ra", closezip=TRUE) 180 | }else 181 | { 182 | add.gdsn(Anno.folder, "FunctionalAnnotationJun1st2022", val=FunctionalAnnotation) 183 | } 184 | genofile 185 | 186 | seqClose(genofile) 187 | 188 | #system(paste0("mv ", gds.file, " ", outfile, ".gds")) 189 | 190 | -------------------------------------------------------------------------------- /Scripts/Cloud/Terra/FAVORannotatorTerraEssentialDB.R: -------------------------------------------------------------------------------- 1 | args <- commandArgs(TRUE) 2 | ### mandatory 3 | 4 | gds.file <- args[1] 5 | print(paste0("gds.file: ",gds.file)) 6 | 7 | #outfile <- args[2] 8 | #print(paste0("outfile: ",outfile)) 9 | 10 | chr <- as.numeric(args[2]) 11 | print(paste0("chr: ",chr)) 12 | #chr<-19 13 | 14 | #use_compression <- args[4] 15 | use_compression <- "Yes" 16 | print(paste0("use_compression: ",use_compression)) 17 | 18 | ########################################################################## 19 | ### Step 0 (Download FAVOR Database) 20 | ########################################################################## 21 | URLs <- data.frame(chr = c(1:22), 22 | URL = c("https://dataverse.harvard.edu/api/access/datafile/6170506", 23 | "https://dataverse.harvard.edu/api/access/datafile/6170501", 24 | "https://dataverse.harvard.edu/api/access/datafile/6170502", 25 | 
"https://dataverse.harvard.edu/api/access/datafile/6170521", 26 | "https://dataverse.harvard.edu/api/access/datafile/6170511", 27 | "https://dataverse.harvard.edu/api/access/datafile/6170516", 28 | "https://dataverse.harvard.edu/api/access/datafile/6170505", 29 | "https://dataverse.harvard.edu/api/access/datafile/6170513", 30 | "https://dataverse.harvard.edu/api/access/datafile/6165867", 31 | "https://dataverse.harvard.edu/api/access/datafile/6170507", 32 | "https://dataverse.harvard.edu/api/access/datafile/6170517", 33 | "https://dataverse.harvard.edu/api/access/datafile/6170520", 34 | "https://dataverse.harvard.edu/api/access/datafile/6170503", 35 | "https://dataverse.harvard.edu/api/access/datafile/6170509", 36 | "https://dataverse.harvard.edu/api/access/datafile/6170515", 37 | "https://dataverse.harvard.edu/api/access/datafile/6170518", 38 | "https://dataverse.harvard.edu/api/access/datafile/6170510", 39 | "https://dataverse.harvard.edu/api/access/datafile/6170508", 40 | "https://dataverse.harvard.edu/api/access/datafile/6170514", 41 | "https://dataverse.harvard.edu/api/access/datafile/6170512", 42 | "https://dataverse.harvard.edu/api/access/datafile/6170519", 43 | "https://dataverse.harvard.edu/api/access/datafile/6170504")) 44 | 45 | URL <- URLs[chr, "URL"] 46 | system(paste0("wget --progress=bar:force:noscroll ", URLs[chr, "URL"])) 47 | system(paste0("tar -xvf ", gsub(".*?([0-9]+).*", "\\1", URL))) 48 | 49 | ########################################################################## 50 | ### Step 1 (Varinfo_gds) 51 | ########################################################################## 52 | 53 | ### output 54 | output_path <- "/root/./" 55 | 56 | ### make directory 57 | system(paste0("mkdir ",output_path,"chr",chr)) 58 | 59 | ### R package 60 | library(gdsfmt) 61 | library(SeqArray) 62 | library(SeqVarTools) 63 | 64 | ### chromosome number 65 | ## read info 66 | DB_info <- read.csv(url("https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/main/Scripts/SQL/FAVORdatabase_chrsplit.csv"),header=TRUE) 67 | DB_info <- DB_info[DB_info$Chr==chr,] 68 | 69 | ## open GDS 70 | genofile <- seqOpen(gds.file) 71 | 72 | genofile 73 | 74 | CHR <- as.numeric(seqGetData(genofile, "chromosome")) 75 | position <- as.integer(seqGetData(genofile, "position")) 76 | REF <- as.character(seqGetData(genofile, "$ref")) 77 | ALT <- as.character(seqGetData(genofile, "$alt")) 78 | 79 | VarInfo_genome <- paste0(CHR,"-",position,"-",REF,"-",ALT) 80 | 81 | seqClose(genofile) 82 | 83 | ## Generate VarInfo 84 | for(kk in 1:dim(DB_info)[1]) 85 | { 86 | print(kk) 87 | 88 | VarInfo <- VarInfo_genome[(position>=DB_info$Start_Pos[kk])&(position<=DB_info$End_Pos[kk])] 89 | VarInfo <- data.frame(VarInfo) 90 | 91 | write.csv(VarInfo,paste0(output_path,"chr",chr,"/VarInfo_chr",chr,"_",kk,".csv"),quote=FALSE,row.names = FALSE) 92 | } 93 | 94 | ########################################################################## 95 | ### Step 2 (Annotate) 96 | ########################################################################## 97 | 98 | ### xsv directory 99 | xsv <- "/root/.cargo/bin/xsv" 100 | 101 | ### DB file 102 | DB_path <- "n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/" 103 | 104 | ### anno channel (subset) 105 | anno_colnum <- c(1,8:12,15,16,19,23,25:36) 106 | 107 | chr_splitnum <- sum(DB_info$Chr==chr) 108 | 109 | for(kk in 1:chr_splitnum) 110 | { 111 | print(kk) 112 | 113 | system(paste0(xsv," join --left VarInfo ",output_path,"chr",chr,"/VarInfo_chr",chr,"_",kk,".csv variant_vcf 
",DB_path,"/chr",chr,"_",kk,".csv > ",output_path,"chr",chr,"/Anno_chr",chr,"_",kk,".csv")) 114 | } 115 | 116 | ## merge info 117 | Anno <- paste0(output_path,"chr",chr,"/Anno_chr",chr,"_",seq(1:chr_splitnum),".csv ") 118 | merge_command <- paste0(xsv," cat rows ",Anno[1]) 119 | 120 | for(kk in 2:chr_splitnum) 121 | { 122 | merge_command <- paste0(merge_command,Anno[kk]) 123 | } 124 | 125 | merge_command <- paste0(merge_command,"> ",output_path,"chr",chr,"/Anno_chr",chr,".csv") 126 | 127 | system(merge_command) 128 | 129 | ## subset 130 | anno_colnum_xsv <- c() 131 | for(kk in 1:(length(anno_colnum)-1)) 132 | { 133 | anno_colnum_xsv <- paste0(anno_colnum_xsv,anno_colnum[kk],",") 134 | } 135 | anno_colnum_xsv <- paste0(anno_colnum_xsv,anno_colnum[length(anno_colnum)]) 136 | 137 | system(paste0(xsv," select ",anno_colnum_xsv," ",output_path,"chr",chr,"/Anno_chr",chr,".csv > ",output_path,"chr",chr,"/Anno_chr",chr,"_STAARpipeline.csv")) 138 | 139 | ########################################################################## 140 | ### Step 3 (gds2agds) 141 | ########################################################################## 142 | 143 | ### annotation file 144 | dir_anno <- "/root/" 145 | anno_file_name_1 <- "Anno_chr" 146 | anno_file_name_2 <- "_STAARpipeline.csv" 147 | 148 | ### load required package 149 | library(gdsfmt) 150 | library(SeqArray) 151 | library(SeqVarTools) 152 | library(readr) 153 | 154 | ### read annotation data 155 | FunctionalAnnotation <- read_csv(paste0(dir_anno,"chr",chr,"/",anno_file_name_1,chr,anno_file_name_2), 156 | col_types=list(col_character(),col_double(),col_double(),col_double(),col_double(), 157 | col_double(),col_double(),col_double(),col_double(),col_double(), 158 | col_character(),col_character(),col_character(),col_double(),col_character(), 159 | col_character(),col_character(),col_character(),col_character(),col_double(), 160 | col_double(),col_character())) 161 | 162 | dim(FunctionalAnnotation) 163 | 164 | ## rename colnames 165 | colnames(FunctionalAnnotation)[2] <- "apc_conservation" 166 | colnames(FunctionalAnnotation)[7] <- "apc_local_nucleotide_diversity" 167 | colnames(FunctionalAnnotation)[9] <- "apc_protein_function" 168 | 169 | ## open GDS 170 | genofile <- seqOpen(gds.file, readonly = FALSE) 171 | 172 | #Anno.folder <- addfolder.gdsn(index.gdsn(genofile, "annotation/info"), "FunctionalAnnotationTest1") 173 | Anno.folder <- index.gdsn(genofile, "annotation/info") 174 | if(use_compression == "YES") 175 | { 176 | add.gdsn(Anno.folder, "FunctionalAnnotationJun1st2022", val=FunctionalAnnotation, compress="LZMA_ra", closezip=TRUE) 177 | }else 178 | { 179 | add.gdsn(Anno.folder, "FunctionalAnnotationJun1st2022", val=FunctionalAnnotation) 180 | } 181 | genofile 182 | 183 | seqClose(genofile) 184 | 185 | #system(paste0("mv ", gds.file, " ", outfile, ".gds")) 186 | 187 | -------------------------------------------------------------------------------- /Scripts/Cloud/Terra/FAVORannotatorTerraFullDB.R: -------------------------------------------------------------------------------- 1 | args <- commandArgs(TRUE) 2 | ### mandatory 3 | 4 | gds.file <- args[1] 5 | print(paste0("gds.file: ",gds.file)) 6 | 7 | #outfile <- args[2] 8 | #print(paste0("outfile: ",outfile)) 9 | 10 | chr <- as.numeric(args[2]) 11 | print(paste0("chr: ",chr)) 12 | #chr<-19 13 | 14 | #use_compression <- args[4] 15 | use_compression <- "Yes" 16 | print(paste0("use_compression: ",use_compression)) 17 | 18 | ########################################################################## 19 
| ### Step 0 (Download FAVOR Database) 20 | ########################################################################## 21 | URLs <- data.frame(chr = c(1:22), 22 | URL = c("https://dataverse.harvard.edu/api/access/datafile/6380374", #1 23 | "https://dataverse.harvard.edu/api/access/datafile/6380471", #2 24 | "https://dataverse.harvard.edu/api/access/datafile/6380732", #3 25 | "https://dataverse.harvard.edu/api/access/datafile/6381512", #4 26 | "https://dataverse.harvard.edu/api/access/datafile/6381457", #5 27 | "https://dataverse.harvard.edu/api/access/datafile/6381327", #6 28 | "https://dataverse.harvard.edu/api/access/datafile/6384125", #7 29 | "https://dataverse.harvard.edu/api/access/datafile/6382573", #8 30 | "https://dataverse.harvard.edu/api/access/datafile/6384268", #9 31 | "https://dataverse.harvard.edu/api/access/datafile/6380273", #10 32 | "https://dataverse.harvard.edu/api/access/datafile/6384154", #11 33 | "https://dataverse.harvard.edu/api/access/datafile/6384198", #12 34 | "https://dataverse.harvard.edu/api/access/datafile/6388366", #13 35 | "https://dataverse.harvard.edu/api/access/datafile/6388406", #14 36 | "https://dataverse.harvard.edu/api/access/datafile/6388427", #15 37 | "https://dataverse.harvard.edu/api/access/datafile/6388551", #16 38 | "https://dataverse.harvard.edu/api/access/datafile/6388894", #17 39 | "https://dataverse.harvard.edu/api/access/datafile/6376523", #18 40 | "https://dataverse.harvard.edu/api/access/datafile/6376522", #19 41 | "https://dataverse.harvard.edu/api/access/datafile/6376521", #20 42 | "https://dataverse.harvard.edu/api/access/datafile/6358305", #21 43 | "https://dataverse.harvard.edu/api/access/datafile/6358299")) #22 44 | 45 | URL <- URLs[chr, "URL"] 46 | system(paste0("wget --progress=bar:force:noscroll ", URLs[chr, "URL"])) 47 | system(paste0("tar -xvf ", gsub(".*?([0-9]+).*", "\\1", URL))) 48 | 49 | ########################################################################## 50 | ### Step 1 (Varinfo_gds) 51 | ########################################################################## 52 | 53 | ### output 54 | output_path <- "/cromwell_root/./" 55 | #output_path <- "/root/./" 56 | 57 | ### make directory 58 | system(paste0("mkdir ",output_path,"chr",chr)) 59 | 60 | ### R package 61 | library(gdsfmt) 62 | library(SeqArray) 63 | library(SeqVarTools) 64 | 65 | ### chromosome number 66 | ## read info 67 | DB_info <- read.csv(url("https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/main/Scripts/SQL/FAVORdatabase_chrsplit.csv"),header=TRUE) 68 | DB_info <- DB_info[DB_info$Chr==chr,] 69 | 70 | ## open GDS 71 | genofile <- seqOpen(gds.file) 72 | 73 | genofile 74 | 75 | CHR <- as.numeric(seqGetData(genofile, "chromosome")) 76 | position <- as.integer(seqGetData(genofile, "position")) 77 | REF <- as.character(seqGetData(genofile, "$ref")) 78 | ALT <- as.character(seqGetData(genofile, "$alt")) 79 | 80 | VarInfo_genome <- paste0(CHR,"-",position,"-",REF,"-",ALT) 81 | 82 | seqClose(genofile) 83 | 84 | ## Generate VarInfo 85 | for(kk in 1:dim(DB_info)[1]) 86 | { 87 | print(kk) 88 | 89 | VarInfo <- VarInfo_genome[(position>=DB_info$Start_Pos[kk])&(position<=DB_info$End_Pos[kk])] 90 | VarInfo <- data.frame(VarInfo) 91 | 92 | write.csv(VarInfo,paste0(output_path,"chr",chr,"/VarInfo_chr",chr,"_",kk,".csv"),quote=FALSE,row.names = FALSE) 93 | } 94 | 95 | ########################################################################## 96 | ### Step 2 (Annotate) 97 | ########################################################################## 98 | 99 | ### 
xsv directory 100 | #xsv <- "/cromwell_root/.cargo/bin/xsv" 101 | xsv <- "/root/.cargo/bin/xsv" 102 | 103 | ### DB file 104 | DB_path <- "/cromwell_root/./" 105 | 106 | ### anno channel (subset) 107 | #anno_colnum <- c(1,8:12,15,16,19,23,25:36) 108 | anno_colnum <- c(2:160) 109 | 110 | chr_splitnum <- sum(DB_info$Chr==chr) 111 | 112 | for(kk in 1:chr_splitnum) 113 | { 114 | print(kk) 115 | 116 | #system(paste0(xsv," index ",DB_path,"/chr",chr,"_",kk,".csv)) 117 | system(paste0(xsv," join --left VarInfo ",output_path,"chr",chr,"/VarInfo_chr",chr,"_",kk,".csv variant_vcf ",DB_path,"/chr",chr,"_",kk,".csv > ",output_path,"chr",chr,"/Anno_chr",chr,"_",kk,".csv")) 118 | } 119 | 120 | ## merge info 121 | Anno <- paste0(output_path,"chr",chr,"/Anno_chr",chr,"_",seq(1:chr_splitnum),".csv ") 122 | merge_command <- paste0(xsv," cat rows ",Anno[1]) 123 | 124 | for(kk in 2:chr_splitnum) 125 | { 126 | merge_command <- paste0(merge_command,Anno[kk]) 127 | } 128 | 129 | merge_command <- paste0(merge_command,"> ",output_path,"chr",chr,"/Anno_chr",chr,".csv") 130 | 131 | system(merge_command) 132 | 133 | ## subset 134 | anno_colnum_xsv <- c() 135 | for(kk in 1:(length(anno_colnum)-1)) 136 | { 137 | anno_colnum_xsv <- paste0(anno_colnum_xsv,anno_colnum[kk],",") 138 | } 139 | anno_colnum_xsv <- paste0(anno_colnum_xsv,anno_colnum[length(anno_colnum)]) 140 | 141 | system(paste0(xsv," select ",anno_colnum_xsv," ",output_path,"chr",chr,"/Anno_chr",chr,".csv > ",output_path,"chr",chr,"/Anno_chr",chr,"_STAARpipeline.csv")) 142 | 143 | ########################################################################## 144 | ### Step 3 (gds2agds) 145 | ########################################################################## 146 | 147 | ### annotation file 148 | dir_anno <- "/cromwell_root/" 149 | #dir_anno <- "/root/" 150 | anno_file_name_1 <- "Anno_chr" 151 | anno_file_name_2 <- "_STAARpipeline.csv" 152 | 153 | ### load required package 154 | library(gdsfmt) 155 | library(SeqArray) 156 | library(SeqVarTools) 157 | library(readr) 158 | 159 | ### read annotation data 160 | FunctionalAnnotation <- read_csv(paste0(dir_anno,"chr",chr,"/",anno_file_name_1,chr,anno_file_name_2)) 161 | 162 | #FunctionalAnnotation <- read_csv(paste0(dir_anno,"chr",chr,"/",anno_file_name_1,chr,anno_file_name_2), 163 | # col_types=list(col_character(),col_double(),col_double(),col_double(),col_double(), 164 | # col_double(),col_double(),col_double(),col_double(),col_double(), 165 | # col_character(),col_character(),col_character(),col_double(),col_character(), 166 | # col_character(),col_character(),col_character(),col_character(),col_double(), 167 | # col_double(),col_character())) 168 | 169 | dim(FunctionalAnnotation) 170 | 171 | ## rename colnames 172 | #colnames(FunctionalAnnotation)[2] <- "apc_conservation" 173 | #colnames(FunctionalAnnotation)[7] <- "apc_local_nucleotide_diversity" 174 | #colnames(FunctionalAnnotation)[9] <- "apc_protein_function" 175 | 176 | ## open GDS 177 | genofile <- seqOpen(gds.file, readonly = FALSE) 178 | 179 | #Anno.folder <- addfolder.gdsn(index.gdsn(genofile, "annotation/info"), "FunctionalAnnotationTest1") 180 | Anno.folder <- index.gdsn(genofile, "annotation/info") 181 | if(use_compression == "YES") 182 | { 183 | add.gdsn(Anno.folder, "FAVORFullDBAug1st2022", val=FunctionalAnnotation, compress="LZMA_ra", closezip=TRUE) 184 | }else 185 | { 186 | add.gdsn(Anno.folder, "FAVORFullDBAug1st2022", val=FunctionalAnnotation) 187 | } 188 | genofile 189 | 190 | seqClose(genofile) 191 | 192 | #system(paste0("mv ", gds.file, " 
", outfile, ".gds")) 193 | 194 | -------------------------------------------------------------------------------- /Scripts/Cloud/Terra/convertVCFtoGDS.R: -------------------------------------------------------------------------------- 1 | ############################################################################# 2 | #Title: convertVCFtoGDS 3 | #Function: 4 | # * Build the GDS file from VCF files 5 | #Author: Hufeng Zhou 6 | #Time: Nov 27th 2021 7 | ############################################################################# 8 | library(gdsfmt) 9 | library(SeqArray) 10 | 11 | args <- commandArgs(TRUE) 12 | ### mandatory 13 | 14 | vcf.file <- args[1] 15 | print(paste0("gds.file: ",gds.file)) 16 | 17 | gds.file <- args[2] 18 | print(paste0("gds.file: ",gds.file)) 19 | 20 | seqVCF2GDS(vcf.file, gds.file, header = NULL, genotype.var.name = "GT", info.import=NULL, fmt.import=NULL, ignore.chr.prefix="chr", raise.error=TRUE, verbose=TRUE) 21 | genofile<-seqOpen(gds.file, readonly = FALSE) 22 | print("GDS built") 23 | 24 | ###Closing Up### 25 | genofile 26 | seqClose(genofile) 27 | -------------------------------------------------------------------------------- /Scripts/Cloud/Terra/headercolumn.txt: -------------------------------------------------------------------------------- 1 | col_double(),variant_vcf,variant_annovar,col_double(),col_double(),col_double(),ref_annovar,alt_annovar,col_double(),ref_vcf,alt_vcf,aloft_value,aloft_description,apc_conservation,apc_conservation_v2,apc_epigenetics_active,apc_epigenetics,apc_epigenetics_repressed,apc_epigenetics_transcription,apc_local_nucleotide_diversity,apc_local_nucleotide_diversity_v2,apc_local_nucleotide_diversity_v3,apc_mappability,apc_micro_rna,apc_mutation_density,apc_protein_function,apc_protein_function_v2,apc_protein_function_v3,apc_proximity_to_coding,apc_proximity_to_coding_v2,apc_proximity_to_tsstes,apc_transcription_factor,bravo_an,bravo_af,filter_status,cage_enhancer,cage_promoter,cage_tc,clnsig,clnsigincl,clndn,clndnincl,clnrevstat,origin,clndisdb,clndisdbincl,geneinfo,polyphen2_hdiv_score,polyphen2_hvar_score,mutation_taster_score,mutation_assessor_score,metasvm_pred,fathmm_xf,funseq_value,funseq_description,genecode_comprehensive_category,genecode_comprehensive_info,genecode_comprehensive_exonic_category,genecode_comprehensive_exonic_info,genehancer,af_total,af_asj_female,af_eas_female,af_afr_male,af_female,af_fin_male,af_oth_female,af_ami,af_oth,af_male,af_ami_female,af_afr,af_eas_male,af_sas,af_nfe_female,af_asj_male,af_raw,af_oth_male,af_nfe_male,af_asj,af_amr_male,af_amr_female,af_sas_female,af_fin,af_afr_female,af_sas_male,af_amr,af_nfe,af_eas,af_ami_male,af_fin_female,linsight,gc,cpg,min_dist_tss,min_dist_tse,sift_cat,sift_val,polyphen_cat,polyphen_val,priphcons,mamphcons,verphcons,priphylop,mamphylop,verphylop,bstatistic,chmm_e1,chmm_e2,chmm_e3,chmm_e4,chmm_e5,chmm_e6,chmm_e7,chmm_e8,chmm_e9,chmm_e10,chmm_e11,chmm_e12,chmm_e13,chmm_e14,chmm_e15,chmm_e16,chmm_e17,chmm_e18,chmm_e19,chmm_e20,chmm_e21,chmm_e22,chmm_e23,chmm_e24,chmm_e25,gerp_rs,gerp_rs_pval,gerp_n,gerp_s,encodeh3k4me1_sum,encodeh3k4me2_sum,encodeh3k4me3_sum,encodeh3k9ac_sum,encodeh3k9me3_sum,encodeh3k27ac_sum,encodeh3k27me3_sum,encodeh3k36me3_sum,encodeh3k79me2_sum,encodeh4k20me1_sum,encodeh2afz_sum,encode_dnase_sum,encodetotal_rna_sum,grantham,freq100bp,rare100bp,sngl100bp,freq1000bp,rare1000bp,sngl1000bp,freq10000bp,rare10000bp,sngl10000bp,remap_overlap_tf,remap_overlap_cl,cadd_rawscore,cadd_phred,k24_bismap,k24_umap,k36_bismap,k36_umap,k50_bismap,k50_umap,k
100_bismap,k100_umap,nucdiv,rdhs,recombination_rate,refseq_category,refseq_info,refseq_exonic_category,refseq_exonic_info,super_enhancer,tg_afr,tg_all,tg_amr,tg_eas,tg_eur,tg_sas,ucsc_category,ucsc_info,ucsc_exonic_category,ucsc_exonic_info 2 | 3 | 4 | col_types=list(col_character(),col_double(),col_double(),col_double(),col_double(),col_double(),col_double(),col_double(),col_double(),col_double(),col_character(),col_character(),col_character(),col_double(),col_character(),col_character(),col_character(),col_character(),col_character(),col_double(),col_double(),col_character())) 5 | 6 | -------------------------------------------------------------------------------- /Scripts/Cloud/Terra/test.R: -------------------------------------------------------------------------------- 1 | args <- commandArgs(TRUE) 2 | ### mandatory 3 | outfile <- args[1] 4 | gds.file <- args[2] 5 | chr <- as.numeric(args[3]) 6 | use_compression <- args[4] 7 | 8 | ########################################################################## 9 | ### Step 0 (Download FAVOR Database) 10 | ########################################################################## 11 | URLs <- data.frame(chr = c(1:22), 12 | URL = c("https://dataverse.harvard.edu/api/access/datafile/6170506", 13 | "https://dataverse.harvard.edu/api/access/datafile/6170501", 14 | "https://dataverse.harvard.edu/api/access/datafile/6170502", 15 | "https://dataverse.harvard.edu/api/access/datafile/6170521", 16 | "https://dataverse.harvard.edu/api/access/datafile/6170511", 17 | "https://dataverse.harvard.edu/api/access/datafile/6170516", 18 | "https://dataverse.harvard.edu/api/access/datafile/6170505", 19 | "https://dataverse.harvard.edu/api/access/datafile/6170513", 20 | "https://dataverse.harvard.edu/api/access/datafile/6165867", 21 | "https://dataverse.harvard.edu/api/access/datafile/6170507", 22 | "https://dataverse.harvard.edu/api/access/datafile/6170517", 23 | "https://dataverse.harvard.edu/api/access/datafile/6170520", 24 | "https://dataverse.harvard.edu/api/access/datafile/6170503", 25 | "https://dataverse.harvard.edu/api/access/datafile/6170509", 26 | "https://dataverse.harvard.edu/api/access/datafile/6170515", 27 | "https://dataverse.harvard.edu/api/access/datafile/6170518", 28 | "https://dataverse.harvard.edu/api/access/datafile/6170510", 29 | "https://dataverse.harvard.edu/api/access/datafile/6170508", 30 | "https://dataverse.harvard.edu/api/access/datafile/6170514", 31 | "https://dataverse.harvard.edu/api/access/datafile/6170512", 32 | "https://dataverse.harvard.edu/api/access/datafile/6170519", 33 | "https://dataverse.harvard.edu/api/access/datafile/6170504")) 34 | 35 | URL <- URLs[chr, "URL"] 36 | system(paste0("wget --progress=bar:force:noscroll ", URLs[chr, "URL"])) 37 | system(paste0("tar -xvf ", gsub(".*?([0-9]+).*", "\\1", URL))) 38 | 39 | ########################################################################## 40 | ### Step 1 (Varinfo_gds) 41 | ########################################################################## 42 | 43 | ### output 44 | output_path <- "./" 45 | 46 | ### make directory 47 | system(paste0("mkdir ",output_path,"chr",chr)) 48 | 49 | ### R package 50 | library(gdsfmt) 51 | library(SeqArray) 52 | library(SeqVarTools) 53 | 54 | ### chromosome number 55 | ## read info 56 | DB_info <- read.csv(url("https://raw.githubusercontent.com/xihaoli/STAARpipeline-Tutorial/main/FAVORannotator_csv/FAVORdatabase_chrsplit.csv"),header=TRUE) 57 | DB_info <- DB_info[DB_info$Chr==chr,] 58 | 59 | ## open GDS 60 | genofile <- seqOpen(gds.file) 61 
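# The variant keys assembled below follow the CHR-POS-REF-ALT ("variant_vcf") format that the
# FAVOR database CSV blocks are joined on in Step 2.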
| 62 | CHR <- as.numeric(seqGetData(genofile, "chromosome")) 63 | position <- as.integer(seqGetData(genofile, "position")) 64 | REF <- as.character(seqGetData(genofile, "$ref")) 65 | ALT <- as.character(seqGetData(genofile, "$alt")) 66 | 67 | VarInfo_genome <- paste0(CHR,"-",position,"-",REF,"-",ALT) 68 | 69 | seqClose(genofile) 70 | 71 | ## Generate VarInfo 72 | for(kk in 1:dim(DB_info)[1]) 73 | { 74 | print(kk) 75 | 76 | VarInfo <- VarInfo_genome[(position>=DB_info$Start_Pos[kk])&(position<=DB_info$End_Pos[kk])] 77 | VarInfo <- data.frame(VarInfo) 78 | 79 | write.csv(VarInfo,paste0(output_path,"chr",chr,"/VarInfo_chr",chr,"_",kk,".csv"),quote=FALSE,row.names = FALSE) 80 | } 81 | 82 | ########################################################################## 83 | ### Step 2 (Annotate) 84 | ########################################################################## 85 | 86 | ### xsv directory 87 | xsv <- ".cargo/bin/xsv" 88 | 89 | ### DB file 90 | DB_path <- "n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/" 91 | 92 | ### anno channel (subset) 93 | anno_colnum <- c(1,8:12,15,16,19,23,25:36) 94 | 95 | chr_splitnum <- sum(DB_info$Chr==chr) 96 | 97 | for(kk in 1:chr_splitnum) 98 | { 99 | print(kk) 100 | 101 | system(paste0(xsv," join --left VarInfo ",output_path,"chr",chr,"/VarInfo_chr",chr,"_",kk,".csv variant_vcf ",DB_path,"/chr",chr,"_",kk,".csv > ",output_path,"chr",chr,"/Anno_chr",chr,"_",kk,".csv")) 102 | } 103 | 104 | ## merge info 105 | Anno <- paste0(output_path,"chr",chr,"/Anno_chr",chr,"_",seq(1:chr_splitnum),".csv ") 106 | merge_command <- paste0(xsv," cat rows ",Anno[1]) 107 | 108 | for(kk in 2:chr_splitnum) 109 | { 110 | merge_command <- paste0(merge_command,Anno[kk]) 111 | } 112 | 113 | merge_command <- paste0(merge_command,"> ",output_path,"chr",chr,"/Anno_chr",chr,".csv") 114 | 115 | system(merge_command) 116 | 117 | ## subset 118 | anno_colnum_xsv <- c() 119 | for(kk in 1:(length(anno_colnum)-1)) 120 | { 121 | anno_colnum_xsv <- paste0(anno_colnum_xsv,anno_colnum[kk],",") 122 | } 123 | anno_colnum_xsv <- paste0(anno_colnum_xsv,anno_colnum[length(anno_colnum)]) 124 | 125 | system(paste0(xsv," select ",anno_colnum_xsv," ",output_path,"chr",chr,"/Anno_chr",chr,".csv > ",output_path,"chr",chr,"/Anno_chr",chr,"_STAARpipeline.csv")) 126 | 127 | ########################################################################## 128 | ### Step 3 (gds2agds) 129 | ########################################################################## 130 | 131 | ### annotation file 132 | dir_anno <- "" 133 | anno_file_name_1 <- "Anno_chr" 134 | anno_file_name_2 <- "_STAARpipeline.csv" 135 | 136 | ### load required package 137 | library(gdsfmt) 138 | library(SeqArray) 139 | library(SeqVarTools) 140 | library(readr) 141 | 142 | ### read annotation data 143 | FunctionalAnnotation <- read_csv(paste0(dir_anno,"chr",chr,"/",anno_file_name_1,chr,anno_file_name_2), 144 | col_types=list(col_character(),col_double(),col_double(),col_double(),col_double(), 145 | col_double(),col_double(),col_double(),col_double(),col_double(), 146 | col_character(),col_character(),col_character(),col_double(),col_character(), 147 | col_character(),col_character(),col_character(),col_character(),col_double(), 148 | col_double(),col_character())) 149 | 150 | dim(FunctionalAnnotation) 151 | 152 | ## rename colnames 153 | colnames(FunctionalAnnotation)[2] <- "apc_conservation" 154 | colnames(FunctionalAnnotation)[7] <- "apc_local_nucleotide_diversity" 155 | colnames(FunctionalAnnotation)[9] <- "apc_protein_function" 156 | 157 | ## open 
GDS 158 | genofile <- seqOpen(gds.file, readonly = FALSE) 159 | 160 | #Anno.folder <- addfolder.gdsn(index.gdsn(genofile, "annotation/info"), "FunctionalAnnotation") 161 | Anno.folder <- index.gdsn(genofile, "annotation/info") 162 | if(use_compression == "YES") 163 | { 164 | add.gdsn(Anno.folder, "FunctionalAnnotation", val=FunctionalAnnotation, compress="LZMA_ra", closezip=TRUE) 165 | }else 166 | { 167 | add.gdsn(Anno.folder, "FunctionalAnnotation", val=FunctionalAnnotation) 168 | } 169 | 170 | seqClose(genofile) 171 | 172 | system(paste0("mv ", gds.file, " ", outfile, ".gds")) 173 | 174 | -------------------------------------------------------------------------------- /Scripts/Dockerize/Dockerfile.txt: -------------------------------------------------------------------------------- 1 | # Base image https://hub.docker.com/u/rocker/ 2 | FROM rocker/r-base:latest 3 | 4 | ## create directories 5 | RUN mkdir -p /FAVORannotatorDocker 6 | 7 | ## copy files 8 | COPY ../CSV/FAVORannotatorv2aGDS.r . 9 | COPY ../CSV/convertVCFtoGDS.r 10 | COPY ../CSV/config.R 11 | 12 | ## Install R-packages 13 | RUN Rscript install_packages.R 14 | 15 | ## Run R-scripts 16 | RUN Rscript FAVORannotatorv2aGDS.r 17 | -------------------------------------------------------------------------------- /Scripts/Dockerize/ExampleDockerFiles.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Base image https://hub.docker.com/u/rocker/ 4 | FROM rocker/shiny:latest 5 | 6 | # system libraries of general use 7 | ## install debian packages 8 | RUN apt-get update -qq && apt-get -y --no-install-recommends install \ 9 | libxml2-dev \ 10 | libcairo2-dev \ 11 | libsqlite3-dev \ 12 | libmariadbd-dev \ 13 | libpq-dev \ 14 | libssh2-1-dev \ 15 | unixodbc-dev \ 16 | libcurl4-openssl-dev \ 17 | libssl-dev \ 18 | coinor-libcbc-dev coinor-libclp-dev libglpk-dev 19 | 20 | 21 | ## update system libraries 22 | RUN apt-get update && \ 23 | apt-get upgrade -y && \ 24 | apt-get clean 25 | 26 | # copy necessary files 27 | ## app folder 28 | COPY ./bms_msisuite ./app 29 | 30 | # Docker inheritance 31 | FROM bioconductor/bioconductor_docker:devel 32 | 33 | RUN apt-get update 34 | RUN R -e 'BiocManager::install(ask = F)' && R -e 'BiocManager::install(c("rtracklayer", \ 35 | "GenomicAlignments", "Biostrings", "SummarizedExperiment", "Rsamtools", ask = F))' 36 | # install renv & restore packages 37 | RUN Rscript -e 'install.packages("renv")' 38 | RUN Rscript -e 'install.packages("devtools")' 39 | RUN Rscript -e 'install.packages("shiny")' 40 | RUN Rscript -e 'install.packages("shinyBS")' 41 | RUN Rscript -e 'install.packages("ggvis")' 42 | RUN Rscript -e 'install.packages("shinydashboardPlus")' 43 | RUN Rscript -e 'install.packages("shinycssloaders")' 44 | RUN Rscript -e 'install.packages("shinyWidgets")' 45 | RUN Rscript -e 'install.packages("plotly")' 46 | RUN Rscript -e 'install.packages("RSQLite")' 47 | RUN Rscript -e 'install.packages("forecast", dependencies = TRUE)' 48 | RUN Rscript -e 'install.packages("tsutils")' 49 | RUN Rscript -e 'install.packages("readxl")' 50 | RUN Rscript -e 'install.packages("tidyverse")' 51 | RUN Rscript -e 'install.packages("knitr")' 52 | RUN Rscript -e 'install.packages("knitcitations")' 53 | RUN Rscript -e 'install.packages("nycflights13")' 54 | RUN Rscript -e 'install.packages("Matrix")' 55 | RUN Rscript -e 'install.packages("plotly")' 56 | RUN Rscript -e 'install.packages("igraph")' 57 | RUN Rscript -e 'install.packages("ggthemes")' 58 | RUN Rscript -e 
'install.packages("evaluate")' 59 | RUN Rscript -e 'install.packages("psych")' 60 | RUN Rscript -e 'install.packages("kableExtra")' 61 | RUN Rscript -e 'install.packages("ggjoy")' 62 | RUN Rscript -e 'install.packages("gtools")' 63 | RUN Rscript -e 'install.packages("gridExtra")' 64 | RUN Rscript -e 'install.packages("cowplot")' 65 | RUN Rscript -e 'install.packages("ggrepel")' 66 | RUN Rscript -e 'install.packages("data.table")' 67 | RUN Rscript -e 'install.packages("stringr")' 68 | RUN Rscript -e 'install.packages("rmarkdown")' 69 | RUN Rscript -e 'install.packages("shinyjqui")' 70 | RUN Rscript -e 'install.packages("V8")' 71 | RUN Rscript -e 'devtools::install_github("ThomasSiegmund/D3TableFilter")' 72 | RUN Rscript -e 'devtools::install_github("leonawicz/apputils")' 73 | RUN Rscript -e 'devtools::install_github("Marlin-Na/trewjb")' 74 | 75 | RUN Rscript -e 'devtools::install_github("dirkschumacher/ompr")' 76 | RUN Rscript -e 'devtools::install_github("dirkschumacher/ompr.roi")' 77 | 78 | RUN Rscript -e 'install.packages("ROI.plugin.glpk")' 79 | 80 | RUN Rscript -e 'install.packages("shinydashboard")' 81 | RUN Rscript -e 'install.packages("dplyr")' 82 | RUN Rscript -e 'install.packages("dashboardthemes")' 83 | RUN Rscript -e 'install.packages("shinyjs")' 84 | RUN Rscript -e 'install.packages("magrittr")' 85 | RUN Rscript -e 'install.packages("DT")' 86 | RUN Rscript -e 'install.packages("rhandsontable")' 87 | RUN Rscript -e 'renv::consent(provided = TRUE)' 88 | RUN Rscript -e 'renv::restore()' 89 | 90 | 91 | 92 | # expose port 93 | EXPOSE 3838 94 | 95 | # run app on container start 96 | CMD ["R", "-e", "shiny::runApp('/app', host = '0.0.0.0', port = 3838)"] 97 | -------------------------------------------------------------------------------- /Scripts/Dockerize/install_packages.R: -------------------------------------------------------------------------------- 1 | FROM bioconductor/bioconductor_docker:devel 2 | 3 | RUN apt-get update 4 | RUN R -e 'BiocManager::install(ask = F)' && R -e 'BiocManager::install(c("gdsfmt", "SeqArray", "SeqVarTools", ask = F))' 5 | 6 | RUN Rscript -e 'install.packages("readr")' 7 | RUN Rscript -e 'install.packages("devtools")' 8 | 9 | RUN Rscript -e 'devtools::install_github("zhengxwen/gdsfmt")' 10 | 11 | -------------------------------------------------------------------------------- /Scripts/SQL/FAVORannotatorv2aGDS.r: -------------------------------------------------------------------------------- 1 | ############################################################################# 2 | #Title: FAVORannotator 3 | #Function: 4 | # * Build the GDS file from VCF files 5 | # * Extract the variant sites from the GDS to obtain functional annotation. 6 | # * Read the offline FAVOR V2 sql database and provide functional annotation. 7 | # * Built in the functional annotation into GDS to build aGDS. 
8 | #Author: Hufeng Zhou 9 | #Time: Dec 16th 2021 10 | ############################################################################# 11 | library(gdsfmt) 12 | library(SeqArray) 13 | library(dplyr) 14 | library(readr) 15 | library(stringi) 16 | library(stringr) 17 | library(RPostgreSQL) 18 | library(pryr) 19 | source('config.R') 20 | mem_used() 21 | #vcf.fn=as.character(commandArgs(TRUE)[1]) 22 | #out.fn=as.character(commandArgs(TRUE)[2]) 23 | 24 | #N=as.character(commandArgs(TRUE)[1]) 25 | #seqVCF2GDS(vcf.fn, out.fn, header = NULL, genotype.var.name = "GT", info.import=NULL, fmt.import=NULL, ignore.chr.prefix="chr", raise.error=TRUE, verbose=TRUE) 26 | start.time <- Sys.time() 27 | CHRN=as.character(commandArgs(TRUE)[1]) 28 | genofile<-seqOpen(eval(parse(text = paste0("gds.chr",CHRN,".fn"))), readonly = FALSE) 29 | print("GDS built") 30 | genofile 31 | CHR<-seqGetData(genofile,"chromosome") 32 | POS<-seqGetData(genofile,"position") 33 | REF<-seqGetData(genofile,"$ref") 34 | ALT<-seqGetData(genofile,"$alt") 35 | ############################################################################# 36 | #Here VariantsAnno is the data frame needs to retrieve functional annotation 37 | #It needs to be feed into the SQL 38 | ############################################################################# 39 | VariantsAnno <- data.frame(CHR, POS, REF, ALT) 40 | VariantsAnno$CHR <- as.character(VariantsAnno$CHR) 41 | VariantsAnno$POS <- as.integer(VariantsAnno$POS) 42 | VariantsAnno$REF <- as.character(VariantsAnno$REF) 43 | VariantsAnno$ALT <- as.character(VariantsAnno$ALT) 44 | 45 | 46 | genDelimitedVariantString <- function(inputs) { 47 | quotedVariants <- dbQuoteString(ANSI(), inputs) 48 | collapsedVariants <- paste(quotedVariants, collapse = "),(") 49 | collapsedVariants <- paste0("(", collapsedVariants, ")") 50 | return(collapsedVariants) 51 | } 52 | 53 | #performs batch annotation using the offline database for the 54 | #specified variants 55 | batchAnnotate <- function(inputData,blknum) { 56 | 57 | #parse input, silently ignoring variants which do not follow format 58 | variants <- paste(paste0(inputData[, 1]), inputData[, 2], inputData[, 3], inputData[, 4], sep='-') 59 | variants <- str_subset(variants, "^[:alnum:]+-\\d+-[:upper:]+-[:upper:]+$") 60 | 61 | #connect to database 62 | driver <- dbDriver("PostgreSQL") 63 | connection <- dbConnect(driver, dbname= eval(parse(text = paste0("DBNAME_chr",CHRN))), host=eval(parse(text = paste0("HOST_chr",CHRN))), port=eval(parse(text = paste0("PORT_chr",CHRN))), user=USER_G, password=PASSWORD_G) 64 | 65 | #drop the variant table if it already exists 66 | variantTable <- "batch_variants" 67 | if(dbExistsTable(connection, variantTable)) { 68 | dbRemoveTable(connection, variantTable) 69 | } 70 | 71 | #store variants in temporary table 72 | collapsedVariants <- genDelimitedVariantString(variants) 73 | query <- paste0("CREATE TEMP TABLE ", variantTable, " AS (VALUES ", collapsedVariants, ")") 74 | results <- data.frame() 75 | tryCatch({ 76 | results <- dbGetQuery(connection, query) 77 | }, 78 | error = function(e) { 79 | stop("Error sending variants to database") 80 | }, 81 | warning = function(w) { 82 | stop("Error sending variants to database") 83 | }) 84 | 85 | #retrieve data 86 | results <- data.frame() 87 | query <- paste0("SELECT offline_view",blknum,".* FROM ", variantTable, " LEFT JOIN offline_view",blknum," ON ", variantTable, ".column1=offline_view",blknum,".variant_vcf") 88 | tryCatch({ 89 | results <- dbGetQuery(connection, query) 90 | }, 91 | error = 
function(e) { 92 | stop("Error retrieving results from database") 93 | }, 94 | warning = function(w) { 95 | stop("Error retrieving results from database") 96 | }) 97 | 98 | #clean up 99 | dbDisconnect(connection) 100 | 101 | return(results) 102 | } 103 | 104 | DB_info <- read.csv("FAVORdatabase_chrsplit.csv",header=TRUE) 105 | DB_info <- DB_info[DB_info$Chr==CHRN,] 106 | VariantsAnnoTMP<-VariantsAnno[!duplicated(VariantsAnno),]; 107 | VariantsBatchAnno <- data.frame(); 108 | outlist<- list(); 109 | for(kk in 1:dim(DB_info)[1]){ 110 | print(kk) 111 | dx<-VariantsAnnoTMP[(POS>=DB_info$Start_Pos[kk])&(POS<=DB_info$End_Pos[kk]),] 112 | outdx<-batchAnnotate(dx,kk) 113 | #VariantsBatchAnno<-bind_rows(VariantsBatchAnno,outdx) 114 | print(paste0(("finish annotate rounds/blocks: "),kk)) 115 | outlist[[kk]]<-outdx 116 | } 117 | VariantsBatchAnno<-bind_rows(outlist); 118 | rm(dx,outdx) 119 | rm(VariantsAnnoTMP) 120 | rm(CHR, POS, REF, ALT) 121 | head(VariantsBatchAnno) 122 | mem_used() 123 | gc() 124 | ############################################ 125 | ####This Variant is a searching key######### 126 | ############################################ 127 | Anno.folder <- addfolder.gdsn(index.gdsn(genofile, "annotation/info"), "FunctionalAnnotation") 128 | #Anno.folder <- index.gdsn(genofile, "annotation/info/FunctionalAnnotation") 129 | #VariantsBatchAnno<-VariantsBatchAnno[!duplicated(VariantsBatchAnno),] 130 | VariantsAnno <- dplyr::left_join(VariantsAnno,VariantsBatchAnno, by = c("CHR" = "chromosome","POS" = "position","REF" = "ref_vcf","ALT" = "alt_vcf")) 131 | add.gdsn(Anno.folder, "FAVORannotator", val=VariantsAnno, compress="LZMA_ra", closezip=TRUE) 132 | ###Closing Up### 133 | genofile 134 | seqClose(genofile) 135 | 136 | ###Time Count### 137 | end.time <- Sys.time() 138 | time.taken <- end.time - start.time 139 | time.taken 140 | -------------------------------------------------------------------------------- /Scripts/SQL/FAVORdatabase_chrsplit.csv: -------------------------------------------------------------------------------- 1 | Chr,File_No,Start_Pos,End_Pos,Site_start,Site_end 2 | 1,1,10001,16747958,0,50000002 3 | 1,2,16747959,33299099,50000002,100000003 4 | 1,3,33299100,49816132,100000003,150000003 5 | 1,4,49816133,66336187,150000003,200000001 6 | 1,5,66336188,82858506,200000001,250000004 7 | 1,6,82858507,99383549,250000004,300000004 8 | 1,7,99383550,115901640,300000004,350000004 9 | 1,8,115901641,150511302,350000004,400000003 10 | 1,9,150511303,167027454,400000003,450000002 11 | 1,10,167027455,183550376,450000002,500000004 12 | 1,11,183550377,200062349,500000004,550000005 13 | 1,12,200062350,216579517,550000005,600000005 14 | 1,13,216579518,233199672,600000005,650000004 15 | 1,14,233199673,248946422,650000004,697723257 16 | 2,1,10001,16521124,0,50000003 17 | 2,2,16521125,33039678,50000003,100000003 18 | 2,3,33039679,49545408,100000003,150000003 19 | 2,4,49545409,66059434,150000003,200000004 20 | 2,5,66059435,82573646,200000004,250000005 21 | 2,6,82573647,100719302,250000005,300000003 22 | 2,7,100719303,117241395,300000003,350000005 23 | 2,8,117241396,133761855,350000005,400000003 24 | 2,9,133761856,150281681,400000003,450000004 25 | 2,10,150281682,166803862,450000004,500000004 26 | 2,11,166803863,183319344,500000004,550000005 27 | 2,12,183319345,199836331,550000005,600000004 28 | 2,13,199836332,216347313,600000004,650000003 29 | 2,14,216347314,232858536,650000003,700000001 30 | 2,15,232858537,242183529,700000001,728249325 31 | 3,1,10001,16524420,0,50000003 32 | 
3,2,16524421,33040541,50000003,100000005 33 | 3,3,33040542,49557337,100000005,150000005 34 | 3,4,49557338,66074840,150000005,200000005 35 | 3,5,66074841,82594743,200000005,250000003 36 | 3,6,82594744,99251340,250000003,300000005 37 | 3,7,99251341,115776819,300000005,350000005 38 | 3,8,115776820,132298678,350000005,400000003 39 | 3,9,132298679,148820162,400000003,450000003 40 | 3,10,148820163,165336299,450000003,500000003 41 | 3,11,165336300,181849212,500000003,550000004 42 | 3,12,181849213,198235559,550000004,599657406 43 | 4,1,10001,16590803,0,50000004 44 | 4,2,16590804,33122697,50000004,100000003 45 | 4,3,33122698,49829213,100000003,150000003 46 | 4,4,49829214,66446786,150000003,200000004 47 | 4,5,66446787,82967242,200000004,250000003 48 | 4,6,82967243,99485400,250000003,300000003 49 | 4,7,99485401,116006538,300000003,350000005 50 | 4,8,116006539,132525686,350000005,400000004 51 | 4,9,132525687,149046341,400000004,450000005 52 | 4,10,149046342,165559516,450000005,500000003 53 | 4,11,165559517,182072370,500000003,550000005 54 | 4,12,182072371,190204555,550000005,574493843 55 | 5,1,10001,16522729,0,50000003 56 | 5,2,16522730,33074583,50000003,100000003 57 | 5,3,33074584,49720402,100000003,150000005 58 | 5,4,49720403,66290955,150000005,200000004 59 | 5,5,66290956,82815341,200000004,250000003 60 | 5,6,82815342,99338210,250000003,300000004 61 | 5,7,99338211,115854447,300000004,350000004 62 | 5,8,115854448,132370902,350000004,400000005 63 | 5,9,132370903,148894412,400000005,450000004 64 | 5,10,148894413,165417298,450000004,500000005 65 | 5,11,165417299,181478259,500000005,548653509 66 | 6,1,60001,16563465,0,50000003 67 | 6,2,16563466,33061324,50000003,100000003 68 | 6,3,33061325,49572052,100000003,150000005 69 | 6,4,49572053,66610016,150000005,200000003 70 | 6,5,66610017,83124127,200000003,250000004 71 | 6,6,83124128,99696645,250000004,300000005 72 | 6,7,99696646,116211798,300000005,350000005 73 | 6,8,116211799,132735327,350000005,400000003 74 | 6,9,132735328,149256752,400000003,450000003 75 | 6,10,149256753,165761505,450000003,500000005 76 | 6,11,165761506,170745979,500000005,514950784 77 | 7,1,10001,16497672,0,50000005 78 | 7,2,16497673,33011778,50000005,100000005 79 | 7,3,33011779,49535306,100000005,150000004 80 | 7,4,49535307,66368293,150000004,200000003 81 | 7,5,66368294,82861258,200000003,250000004 82 | 7,6,82861259,99375599,250000004,300000004 83 | 7,7,99375600,115883173,300000004,350000005 84 | 7,8,115883174,132397651,350000005,400000005 85 | 7,9,132397652,148959856,400000005,450000005 86 | 7,10,148959857,159335973,450000005,481465078 87 | 8,1,60001,16657938,0,50000004 88 | 8,2,16657939,33168508,50000004,100000005 89 | 8,3,33168509,49798213,100000005,150000004 90 | 8,4,49798214,66324168,150000004,200000005 91 | 8,5,66324169,82841802,200000005,250000005 92 | 8,6,82841803,99411327,250000005,300000003 93 | 8,7,99411328,115927799,300000003,350000005 94 | 8,8,115927800,132449771,350000005,400000004 95 | 8,9,132449772,145078636,400000004,438236670 96 | 9,1,10001,16519111,0,50000004 97 | 9,2,16519112,33034522,50000004,100000003 98 | 9,3,33034523,65561731,100000003,150000005 99 | 9,4,65561732,82636109,150000005,200000004 100 | 9,5,82636110,99151058,200000004,250000005 101 | 9,6,99151059,115668007,250000005,300000005 102 | 9,7,115668008,132176643,300000005,350000005 103 | 9,8,132176644,138334717,350000005,368668774 104 | 10,1,10001,16497977,0,50000005 105 | 10,2,16497978,32992389,50000005,100000003 106 | 10,3,32992390,49979963,100000003,150000005 107 | 10,4,49979964,66490426,150000005,200000003 
108 | 10,5,66490427,82999924,200000003,250000003 109 | 10,6,82999925,99518158,250000003,300000003 110 | 10,7,99518159,116025435,300000003,350000005 111 | 10,8,116025436,133787422,350000005,403649527 112 | 11,1,60001,16573403,0,50000005 113 | 11,2,16573404,33089158,50000005,100000004 114 | 11,3,33089159,49604102,100000004,150000003 115 | 11,4,49604103,66492354,150000003,200000005 116 | 11,5,66492355,83106677,200000005,250000004 117 | 11,6,83106678,99650866,250000004,300000005 118 | 11,7,99650867,116169860,300000005,350000004 119 | 11,8,116169861,135076622,350000004,407232600 120 | 12,1,10001,16512884,0,50000005 121 | 12,2,16512885,33020486,50000005,100000003 122 | 12,3,33020487,49663834,100000003,150000003 123 | 12,4,49663835,66172393,150000003,200000004 124 | 12,5,66172394,82688873,200000004,250000005 125 | 12,6,82688874,99202936,250000005,300000003 126 | 12,7,99202937,115715845,300000003,350000004 127 | 12,8,115715846,133265309,350000004,403205116 128 | 13,1,16000001,32685483,0,50000005 129 | 13,2,32685484,49205873,50000005,100000003 130 | 13,3,49205874,65719740,100000003,150000003 131 | 13,4,65719741,82234653,150000003,200000005 132 | 13,5,82234654,98792965,200000005,250000005 133 | 13,6,98792966,114354328,250000005,296684670 134 | 14,1,16000001,32844491,0,50000003 135 | 14,2,32844492,49355389,50000003,100000003 136 | 14,3,49355390,65865851,100000003,150000002 137 | 14,4,65865852,82379468,150000002,200000005 138 | 14,5,82379469,98894359,200000005,250000004 139 | 14,6,98894360,106883718,250000004,274216494 140 | 15,1,17000001,33820808,0,50000004 141 | 15,2,33820809,50332483,50000004,100000003 142 | 15,3,50332484,66837081,100000003,150000005 143 | 15,4,66837082,83350271,150000005,200000004 144 | 15,5,83350272,99908885,200000004,250000003 145 | 15,6,99908886,101981189,250000003,256275016 146 | 16,1,10001,16489529,0,50000004 147 | 16,2,16489530,33041390,50000004,100000005 148 | 16,3,33041391,57939584,100000005,150000004 149 | 16,4,57939585,74442138,150000004,200000005 150 | 16,5,74442139,90228345,200000005,247862947 151 | 17,1,60001,16583426,0,50000004 152 | 17,2,16583427,33296314,50000004,100000005 153 | 17,3,33296315,49805199,100000005,150000004 154 | 17,4,49805200,66311745,150000004,200000003 155 | 17,5,66311746,83247441,200000003,251209815 156 | 18,1,10001,16579729,0,50000005 157 | 18,2,16579730,33187036,50000005,100000003 158 | 18,3,33187037,49761154,100000003,150000004 159 | 18,4,49761155,66276260,150000004,200000003 160 | 18,5,66276261,80263285,200000003,242369265 161 | 19,1,60001,16502931,0,50000003 162 | 19,2,16502932,33121104,50000003,100000005 163 | 19,3,33121105,49598738,100000005,150000004 164 | 19,4,49598739,58607616,150000004,177368363 165 | 20,1,60001,16578325,0,50000004 166 | 20,2,16578326,33448578,50000004,100000003 167 | 20,3,33448579,49959412,100000003,150000003 168 | 20,4,49959413,64334167,150000003,193559674 169 | 21,1,5010001,23102900,0,50000004 170 | 21,2,23102901,39607529,50000004,100000004 171 | 21,3,39607530,46699983,100000004,121352389 172 | 22,1,10510001,28173294,0,50000003 173 | 22,2,28173295,44666670,50000003,100000005 174 | 22,3,44666671,50808468,100000005,118627486 175 | -------------------------------------------------------------------------------- /Scripts/SQL/config.R: -------------------------------------------------------------------------------- 1 | USER_G <- 'user name' 2 | PASSWORD_G <- 'password' 3 | 4 | #---------chr1----------------------- 5 | vcf.chr1.fn<-"/n/location/input.vcf" 6 | gds.chr1.fn<-"/n/location/output.gds" 7 | 8 | DBNAME_chr1 <- 
'postgres' 9 | HOST_chr1 <- 'localhost' 10 | PORT_chr1 <- 5432 11 | 12 | #---------chr2----------------------- 13 | vcf.chr2.fn<-"/n/location/input.vcf" 14 | gds.chr2.fn<-"/n/location/output.gds" 15 | 16 | DBNAME_chr2 <- 'postgres' 17 | HOST_chr2 <- 'localhost' 18 | PORT_chr2 <- 5432 19 | 20 | #---------chr3----------------------- 21 | vcf.chr3.fn<-"/n/location/input.vcf" 22 | gds.chr3.fn<-"/n/location/output.gds" 23 | 24 | DBNAME_chr3 <- 'postgres' 25 | HOST_chr3 <- 'localhost' 26 | PORT_chr3 <- 5432 27 | 28 | #---------chr4----------------------- 29 | vcf.chr4.fn<-"/n/location/input.vcf" 30 | gds.chr4.fn<-"/n/location/output.gds" 31 | 32 | DBNAME_chr4 <- 'postgres' 33 | HOST_chr4 <- 'localhost' 34 | PORT_chr4 <- 5432 35 | 36 | #---------chr5----------------------- 37 | vcf.chr5.fn<-"/n/location/input.vcf" 38 | gds.chr5.fn<-"/n/location/output.gds" 39 | 40 | DBNAME_chr5 <- 'postgres' 41 | HOST_chr5 <- 'localhost' 42 | PORT_chr5 <- 5432 43 | 44 | #---------chr6----------------------- 45 | vcf.chr6.fn<-"/n/location/input.vcf" 46 | gds.chr6.fn<-"/n/location/output.gds" 47 | 48 | DBNAME_chr6 <- 'postgres' 49 | HOST_chr6 <- 'localhost' 50 | PORT_chr6 <- 5432 51 | 52 | #---------chr7----------------------- 53 | vcf.chr7.fn<-"/n/location/input.vcf" 54 | gds.chr7.fn<-"/n/location/output.gds" 55 | 56 | DBNAME_chr7 <- 'postgres' 57 | HOST_chr7 <- 'localhost' 58 | PORT_chr7 <- 5432 59 | 60 | #---------chr8----------------------- 61 | vcf.chr8.fn<-"/n/location/input.vcf" 62 | gds.chr8.fn<-"/n/location/output.gds" 63 | 64 | DBNAME_chr8 <- 'postgres' 65 | HOST_chr8 <- 'localhost' 66 | PORT_chr8 <- 5432 67 | 68 | #---------chr9----------------------- 69 | vcf.chr9.fn<-"/n/location/input.vcf" 70 | gds.chr9.fn<-"/n/location/output.gds" 71 | 72 | DBNAME_chr9 <- 'postgres' 73 | HOST_chr9 <- 'localhost' 74 | PORT_chr9 <- 5432 75 | 76 | #---------chr10----------------------- 77 | vcf.chr10.fn<-"/n/location/input.vcf" 78 | gds.chr10.fn<-"/n/location/output.gds" 79 | 80 | DBNAME_chr10 <- 'postgres' 81 | HOST_chr10 <- 'localhost' 82 | PORT_chr10 <- 5432 83 | 84 | #---------chr11----------------------- 85 | vcf.chr11.fn<-"/n/location/input.vcf" 86 | gds.chr11.fn<-"/n/location/output.gds" 87 | 88 | DBNAME_chr11 <- 'postgres' 89 | HOST_chr11 <- 'localhost' 90 | PORT_chr11 <- 5432 91 | 92 | #---------chr12----------------------- 93 | vcf.chr12.fn<-"/n/location/input.vcf" 94 | gds.chr12.fn<-"/n/location/output.gds" 95 | 96 | DBNAME_chr12 <- 'postgres' 97 | HOST_chr12 <- 'localhost' 98 | PORT_chr12 <- 5432 99 | 100 | #---------chr13----------------------- 101 | vcf.chr13.fn<-"/n/location/input.vcf" 102 | gds.chr13.fn<-"/n/location/output.gds" 103 | 104 | DBNAME_chr13 <- 'postgres' 105 | HOST_chr13 <- 'localhost' 106 | PORT_chr13 <- 5432 107 | 108 | #---------chr14----------------------- 109 | vcf.chr14.fn<-"/n/location/input.vcf" 110 | gds.chr14.fn<-"/n/location/output.gds" 111 | 112 | DBNAME_chr14 <- 'postgres' 113 | HOST_chr14 <- 'localhost' 114 | PORT_chr14 <- 5432 115 | 116 | #---------chr15----------------------- 117 | vcf.chr15.fn<-"/n/location/input.vcf" 118 | gds.chr15.fn<-"/n/location/output.gds" 119 | 120 | DBNAME_chr15 <- 'postgres' 121 | HOST_chr15 <- 'localhost' 122 | PORT_chr15 <- 5432 123 | 124 | #---------chr16----------------------- 125 | vcf.chr16.fn<-"/n/location/input.vcf" 126 | gds.chr16.fn<-"/n/location/output.gds" 127 | 128 | DBNAME_chr16 <- 'postgres' 129 | HOST_chr16 <- 'localhost' 130 | PORT_chr16 <- 5432 131 | 132 | #---------chr17----------------------- 133 | 
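# (Each per-chromosome block defines vcf.chrN.fn, gds.chrN.fn, DBNAME_chrN, HOST_chrN and PORT_chrN;
#  convertVCFtoGDS.r and FAVORannotatorv2aGDS.r look these variables up by the chromosome number
#  supplied on the command line.)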
vcf.chr17.fn<-"/n/location/input.vcf" 134 | gds.chr17.fn<-"/n/location/output.gds" 135 | 136 | DBNAME_chr17 <- 'postgres' 137 | HOST_chr17 <- 'localhost' 138 | PORT_chr17 <- 5432 139 | 140 | #---------chr18----------------------- 141 | vcf.chr18.fn<-"/n/location/input.vcf" 142 | gds.chr18.fn<-"/n/location/output.gds" 143 | 144 | DBNAME_chr18 <- 'postgres' 145 | HOST_chr18 <- 'localhost' 146 | PORT_chr18 <- 5432 147 | 148 | #---------chr19----------------------- 149 | vcf.chr19.fn<-"/n/location/input.vcf" 150 | gds.chr19.fn<-"/n/location/output.gds" 151 | 152 | DBNAME_chr19 <- 'postgres' 153 | HOST_chr19 <- 'localhost' 154 | PORT_chr19 <- 5432 155 | 156 | #---------chr20----------------------- 157 | vcf.chr20.fn<-"/n/location/input.vcf" 158 | gds.chr20.fn<-"/n/location/output.gds" 159 | 160 | DBNAME_chr20 <- 'postgres' 161 | HOST_chr20 <- 'localhost' 162 | PORT_chr20 <- 5432 163 | 164 | #---------chr21----------------------- 165 | vcf.chr21.fn<-"/n/location/input.vcf" 166 | gds.chr21.fn<-"/n/location/output.gds" 167 | 168 | DBNAME_chr21 <- 'postgres' 169 | HOST_chr21 <- 'localhost' 170 | PORT_chr21 <- 5432 171 | 172 | #---------chr22----------------------- 173 | vcf.chr22.fn<-"/n/location/input.vcf" 174 | gds.chr22.fn<-"/n/location/output.gds" 175 | 176 | DBNAME_chr22 <- 'postgres' 177 | HOST_chr22 <- 'localhost' 178 | PORT_chr22 <- 5432 179 | 180 | #---------chrX----------------------- 181 | vcf.chrX.fn<-"/n/location/input.vcf" 182 | gds.chrX.fn<-"/n/location/output.gds" 183 | 184 | DBNAME_chrX <- 'postgres' 185 | HOST_chrX <- 'localhost' 186 | PORT_chrX <- 5432 187 | 188 | #---------chrY----------------------- 189 | vcf.chrY.fn<-"/n/location/input.vcf" 190 | gds.chrY.fn<-"/n/location/output.gds" 191 | 192 | DBNAME_chrY <- 'postgres' 193 | HOST_chrY <- 'localhost' 194 | PORT_chrY <- 5432 195 | -------------------------------------------------------------------------------- /Scripts/SQL/convertVCFtoGDS.r: -------------------------------------------------------------------------------- 1 | ############################################################################# 2 | #Title: convertVCFtoGDS 3 | #Function: 4 | # * Build the GDS file from VCF files 5 | #Author: Hufeng Zhou 6 | #Time: Nov 27th 2021 7 | ############################################################################# 8 | library(gdsfmt) 9 | library(SeqArray) 10 | 11 | #import configuration file 12 | source('config.R') 13 | 14 | #vcf.chr10.fn=as.character(commandArgs(TRUE)[1]) 15 | #gds.chr10.fn=as.character(commandArgs(TRUE)[2]) 16 | CHRN=as.character(commandArgs(TRUE)[1]) 17 | seqVCF2GDS(eval(parse(text = paste0("vcf.chr",CHRN,".fn"))), eval(parse(text = paste0("gds.chr",CHRN,".fn"))), header = NULL, genotype.var.name = "GT", info.import=NULL, fmt.import=NULL, ignore.chr.prefix="chr", raise.error=TRUE, verbose=TRUE) 18 | genofile<-seqOpen(eval(parse(text = paste0("gds.chr",CHRN,".fn"))), readonly = FALSE) 19 | print("GDS built") 20 | 21 | ###Closing Up### 22 | genofile 23 | seqClose(genofile) 24 | -------------------------------------------------------------------------------- /Scripts/SQL/convertVCFtoNullGenotypeGDS.r: -------------------------------------------------------------------------------- 1 | ############################################################################# 2 | #Title: convertVCFtoGDS 3 | #Function: 4 | # * Build the GDS file from VCF files 5 | #Author: Hufeng Zhou 6 | #Time: Nov 27th 2021 7 | ############################################################################# 8 | library(gdsfmt) 9 | 
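# NOTE: the vcf.fn/gds.fn assignments below are commented out and config.R only defines the
# per-chromosome vcf.chrN.fn/gds.chrN.fn names, so vcf.fn and gds.fn must be defined (or read from
# the command line) before the seqVCF2GDS() call below will run.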
library(SeqArray) 10 | 11 | #import configuration file 12 | source('config.R') 13 | 14 | #vcf.fn=as.character(commandArgs(TRUE)[1]) 15 | #gds.fn=as.character(commandArgs(TRUE)[2]) 16 | nogenotype.fn=as.character(commandArgs(TRUE)[2]) 17 | 18 | seqVCF2GDS(vcf.fn, gds.fn, header = NULL, genotype.var.name = "GT", info.import=NULL, fmt.import=NULL, ignore.chr.prefix="chr", raise.error=TRUE, verbose=TRUE) 19 | genofile<-seqOpen(gds.fn, readonly = FALSE) 20 | print("GDS built") 21 | 22 | ############################################################################# 23 | # Remove samples/genotype data from the full GDS file 24 | ############################################################################# 25 | seqSetFilter(genofile,sample.id=character(0)) 26 | seqExport(genofile,nogenotype.fn,fmt.var=character(),samp.var=character(0),optimize=TRUE,digest=TRUE,verbose=TRUE) 27 | seqClose(genofile) 28 | 29 | genofile<-seqOpen(nogenotype.fn, readonly = FALSE) 30 | ###Closing Up### 31 | genofile 32 | seqClose(genofile) 33 | -------------------------------------------------------------------------------- /Scripts/SQL/importCommands.sql: -------------------------------------------------------------------------------- 1 | /* Title: Import the database into postgreSQL 2 | * Time: April 29th 2021 3 | * Author: Ted and Hufeng 4 | */ 5 | 6 | psql -h localhost -p portnumber -d FAVORV2 7 | 8 | CREATE TABLE MAIN( 9 | variant_vcf text, 10 | chromosome text, 11 | position integer, 12 | ref_vcf text, 13 | alt_vcf text, 14 | apc_conservation numeric, 15 | apc_conservation_v2 numeric, 16 | apc_epigenetics numeric, 17 | apc_epigenetics_active numeric, 18 | apc_epigenetics_repressed numeric, 19 | apc_epigenetics_transcription numeric, 20 | apc_local_nucleotide_diversity numeric, 21 | apc_local_nucleotide_diversity_v2 numeric, 22 | apc_local_nucleotide_diversity_v3 numeric, 23 | apc_mappability numeric, 24 | apc_micro_rna numeric, 25 | apc_mutation_density numeric, 26 | apc_protein_function numeric, 27 | apc_proximity_to_coding numeric, 28 | apc_proximity_to_coding_v2 numeric, 29 | apc_proximity_to_tsstes numeric, 30 | apc_transcription_factor numeric, 31 | cage_promoter text, 32 | cage_tc text, 33 | metasvm_pred text, 34 | rsid text, 35 | fathmm_xf numeric, 36 | genecode_comprehensive_category text, 37 | genecode_comprehensive_info text, 38 | genecode_comprehensive_exonic_info text, 39 | genecode_comprehensive_exonic_category text, 40 | genehancer text, 41 | linsight numeric, 42 | cadd_phred numeric, 43 | rdhs text); 44 | 45 | COPY main FROM '/n/holystore01/LABS/xlin/Lab/zhouhufeng/DB/FAVORannotator/NewDB/FAVORAnnotatorDB.22.txt' CSV HEADER; 46 | 47 | CREATE VIEW offline_view AS SELECT * FROM main; 48 | 49 | CREATE INDEX ON main USING HASH(variant_vcf); 50 | 51 | CREATE USER annotator WITH SUPERUSER PASSWORD 'DoMeAFAVOR'; 52 | -------------------------------------------------------------------------------- /Scripts/SQL/submitJobs.sh: -------------------------------------------------------------------------------- 1 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=65000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 1; Rscript ./FAVORannotatorv2aGDS.r 1' 2 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=65000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 2; Rscript ./FAVORannotatorv2aGDS.r 2' 3 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=60000 --wrap='module load 
postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 3; Rscript ./FAVORannotatorv2aGDS.r 3' 4 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=60000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 4; Rscript ./FAVORannotatorv2aGDS.r 4' 5 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=55000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 5; Rscript ./FAVORannotatorv2aGDS.r 6' 6 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=50000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 6; Rscript ./FAVORannotatorv2aGDS.r 5' 7 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=50000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 7; Rscript ./FAVORannotatorv2aGDS.r 7' 8 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=50000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 8; Rscript ./FAVORannotatorv2aGDS.r 8' 9 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=40000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 9; Rscript ./FAVORannotatorv2aGDS.r 9' 10 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=40000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 10; Rscript ./FAVORannotatorv2aGDS.r 10' 11 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=40000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 11; Rscript ./FAVORannotatorv2aGDS.r 11' 12 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=40000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 12; Rscript ./FAVORannotatorv2aGDS.r 12' 13 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=40000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 13; Rscript ./FAVORannotatorv2aGDS.r 13' 14 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=35000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 14; Rscript ./FAVORannotatorv2aGDS.r 14' 15 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=35000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 15; Rscript ./FAVORannotatorv2aGDS.r 15' 16 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=30000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 16; Rscript ./FAVORannotatorv2aGDS.r 16' 17 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=30000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 17; Rscript ./FAVORannotatorv2aGDS.r 17' 18 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=30000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 18; Rscript ./FAVORannotatorv2aGDS.r 18' 19 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=30000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 19; Rscript ./FAVORannotatorv2aGDS.r 19' 20 | sbatch -n 1 -N 1 -t 10000 -p sharedn 
--mail-type=ALL --mem=20000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 20; Rscript ./FAVORannotatorv2aGDS.r 20' 21 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=20000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 21; Rscript ./FAVORannotatorv2aGDS.r 21' 22 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=20000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 22; Rscript ./FAVORannotatorv2aGDS.r 22' 23 | -------------------------------------------------------------------------------- /Scripts/UTL/FAVORannotatorAddIn.R: -------------------------------------------------------------------------------- 1 | ############################################################################# 2 | #Title: FAVORannotatorAddIn 3 | #Function: 4 | # * Add in additional functional annotation into the aGDS file 5 | #Author: Hufeng Zhou 6 | #Time: Aug 27th 2022 7 | ############################################################################# 8 | 9 | args <- commandArgs(TRUE) 10 | ### mandatory 11 | 12 | gds.file <- args[1] 13 | print(paste0("gds.file: ",gds.file)) 14 | 15 | anno.file <- args[2] 16 | print(paste0("anno.file: ",anno.file)) 17 | 18 | 19 | start_time <- Sys.time() 20 | use_compression <- "Yes" 21 | print(paste0("use_compression: ",use_compression)) 22 | 23 | ### annotation file 24 | dir_anno <- "./" 25 | 26 | ### load required package 27 | library(gdsfmt) 28 | library(SeqArray) 29 | library(readr) 30 | 31 | ### read annotation data 32 | #FunctionalAnnotation <- read_csv(paste0(dir_anno,"chr",chr,"/Anno_chr",chr,".csv")) 33 | FunctionalAnnotation <- read_delim(anno.file,delim = NULL) 34 | 35 | dim(FunctionalAnnotation) 36 | 37 | ## open GDS 38 | print("Before Adding Functional Annotation") 39 | genofile <- seqOpen(gds.file, readonly = FALSE) 40 | print("Working on Adding") 41 | genofile 42 | 43 | Anno.folder <- index.gdsn(genofile, "annotation/info") 44 | add.gdsn(Anno.folder, "NewAnnotation", val=FunctionalAnnotation, compress="LZMA_ra", closezip=TRUE) 45 | 46 | genofile 47 | 48 | print("Add in Functional Annotation") 49 | 50 | seqClose(genofile) 51 | end_time <- Sys.time() 52 | 53 | print("time") 54 | end_time - start_time 55 | 56 | -------------------------------------------------------------------------------- /Scripts/UTL/convBCF2GDS.r: -------------------------------------------------------------------------------- 1 | ############################################################################# 2 | #Title: convertVCFtoGDS 3 | #Function: 4 | # * Build the GDS file from BCF files 5 | #Author: Hufeng Zhou 6 | #Time: Aug 27th 2022 7 | # This only runs on single core, therefore very slow. 
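# Example invocation (hypothetical paths; bcftools must be on PATH for seqBCF2GDS()):
#   Rscript convBCF2GDS.r input.bcf output.gds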
8 | ############################################################################# 9 | library(gdsfmt) 10 | library(SeqArray) 11 | 12 | vcf.fn=as.character(commandArgs(TRUE)[1]) 13 | gds.fn=as.character(commandArgs(TRUE)[2]) 14 | seqBCF2GDS(vcf.fn, gds.fn, storage.option="LZMA_RA", bcftools="bcftools") 15 | genofile<-seqOpen(gds.fn, readonly = FALSE) 16 | print("GDS built") 17 | 18 | ###Closing Up### 19 | genofile 20 | seqClose(genofile) 21 | -------------------------------------------------------------------------------- /Scripts/UTL/convertVCFtoGDS.r: -------------------------------------------------------------------------------- 1 | ############################################################################# 2 | #Title: convertVCFtoGDS 3 | #Function: 4 | # * Build the GDS file from VCF files 5 | #Author: Hufeng Zhou 6 | #Time: Aug 27th 2022 7 | ############################################################################# 8 | library(gdsfmt) 9 | library(SeqArray) 10 | 11 | vcf.fn=as.character(commandArgs(TRUE)[1]) 12 | gds.fn=as.character(commandArgs(TRUE)[2]) 13 | seqVCF2GDS(vcf.fn, gds.fn, parallel=10) 14 | genofile<-seqOpen(gds.fn, readonly = FALSE) 15 | print("GDS built") 16 | 17 | ###Closing Up### 18 | genofile 19 | seqClose(genofile) 20 | -------------------------------------------------------------------------------- /Scripts/UTL/convertVCFtoNullGenotypeGDS.r: -------------------------------------------------------------------------------- 1 | ############################################################################# 2 | #Title: convertVCFtoGDS 3 | #Function: 4 | # * Build the GDS file from VCF files 5 | #Author: Hufeng Zhou 6 | #Time: Aug 27th 2021 7 | ############################################################################# 8 | library(gdsfmt) 9 | library(SeqArray) 10 | 11 | 12 | vcf.fn=as.character(commandArgs(TRUE)[1]) 13 | gds.fn=as.character(commandArgs(TRUE)[2]) 14 | nogenotype.fn=as.character(commandArgs(TRUE)[2]) 15 | 16 | seqVCF2GDS(vcf.fn, gds.fn, header = NULL, genotype.var.name = "GT", info.import=NULL, fmt.import=NULL, ignore.chr.prefix="chr", raise.error=TRUE, verbose=TRUE) 17 | genofile<-seqOpen(gds.fn, readonly = FALSE) 18 | print("GDS built") 19 | 20 | ############################################################################# 21 | # Remove samples/genotype data from the full GDS file 22 | ############################################################################# 23 | seqSetFilter(genofile,sample.id=character(0)) 24 | seqExport(genofile,nogenotype.fn,fmt.var=character(),samp.var=character(0),optimize=TRUE,digest=TRUE,verbose=TRUE) 25 | seqClose(genofile) 26 | 27 | genofile<-seqOpen(nogenotype.fn, readonly = FALSE) 28 | ###Closing Up### 29 | genofile 30 | seqClose(genofile) 31 | -------------------------------------------------------------------------------- /Scripts/UTL/convertaGDStoVCF.r: -------------------------------------------------------------------------------- 1 | ############################################################################# 2 | #Title: convertVCFtoGDS 3 | #Function: 4 | # * Build the GDS file from VCF files 5 | #Author: Hufeng Zhou 6 | #Time: Aug 27th 2022 7 | ############################################################################# 8 | library(gdsfmt) 9 | library(SeqArray) 10 | 11 | gds.fn=as.character(commandArgs(TRUE)[1]) 12 | vcf.fn=as.character(commandArgs(TRUE)[2]) 13 | #seqVCF2GDS(vcf.fn, gds.fn, parallel=10) 14 | genofile<-seqOpen(gds.fn, readonly = FALSE) 15 | 16 | ###Closing Up### 17 | genofile 18 | 
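# (the GDS file is opened above only to print its summary; the conversion itself is done by seqGDS2VCF() after it is closed)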
seqClose(genofile) 19 | 20 | ###Write Out### 21 | seqGDS2VCF(gds.fn,vcf.fn) 22 | print("GDS built") 23 | 24 | -------------------------------------------------------------------------------- /Scripts/UTL/preProcessingVCF.sh: -------------------------------------------------------------------------------- 1 | #Fixed Headers [make sure all fields are defined in header]. 2 | #Remove Duplicated VCFs [Make sure there is no duplicated VCF files]. 3 | 4 | #Remove FORMAT variables but only keep GT [multi-core] 5 | for fl in ukb23156_c19_b*_v1.vcf.gz; do bcftools annotate -x ^FORMAT/GT $fl --threads 12 -Oz -o ./CVCF/$fl &; done 6 | 7 | #Concat the smaller VCFs (sliced by variants) within each study into one VCF file [24 mins] 8 | bcftools concat --threads 12 ./CVCF/ukb23156_c19_b*_v1.vcf.gz -Oz -o ./ConcatVCF/ukb23156_c19_c12.vcf.gz 9 | 10 | #Break the multi-allelic sites into multiple rows of all the VCFs of each study [Indexed VCFs]. 11 | bcftools norm -m -any --threads 12 ./ConcatVCF/ukb23156_c19_c12.vcf.gz -Oz -o ./ConcatVCF/ukb23156_c19_c12.bk.vcf.gz 12 | 13 | #Normalize (left) the broken multi-allelic VCFs [Indexed VCFs]. 14 | bcftools norm -f --threads 12 hg38.p13.fa ./ConcatVCF/ukb23156_c19_c12.bk.vcf.gz -Oz -o ./ConcatVCF/ukb23156_c19_c12.bk.nm.vcf.gz 15 | 16 | #Indexed cleaned VCFs 17 | bcftools index ./ConcatVCF/ukb23156_c19_c12.bk.nm.vcf.gz 18 | 19 | #Sliced the Normalized VCFs into each chromosome. [Indexed VCFs] 20 | bcftools view -r chr19 ./ConcatVCF/ukb23156_c12.bk.nm.vcf.gz -Oz -o ./ConcatVCF/ukb23156_c19_c12.bk.nm.vcf.gz 21 | 22 | #Merge the Normalized VCFs (sliced by different samples) of each study into one VCF (per chromosome). 23 | bcftools merge -m all --threads 6 ./DifferentStudies/ukbb*.bk.nm.vcf.gz -Oz -o ./MergedVCF/ukbb.merged.bk.nm.vcf.gz 24 | 25 | #Convert the merged VCFs per chromosomes into GDSs (per chromosome) [72 mins]. 26 | Rscripts ./convertVCFtoGDS.r ./MergedVCF/ukbb.merged.bk.nm.vcf.gz ./MergedGDS/ukbb.merged.bk.nm.gds 27 | --------------------------------------------------------------------------------