├── Data └── TestData │ ├── 1000G │ ├── All.chr22.27022019.GRCh38.phased.gds │ └── LOG.txt │ └── Input │ ├── ._FAVOR.T2210k.gds │ ├── FAVOR.T2210k.gds │ └── FAVOR.T2210k.vcf ├── Docs ├── .DS_Store └── Tutorial │ ├── .DS_Store │ ├── Demos │ ├── FASRC.md │ ├── UKBB200KWESpreprocessVCF.md │ └── preprocessVCF.md │ ├── Detailed-Explanation │ └── FAVORFullDB.xlsx │ ├── Figures │ ├── FASRC1.jpg │ ├── FAVORannotatorOnTerra.png │ ├── Figure2A.png │ ├── Figure2B.png │ ├── Figure2C.png │ ├── HarvardDataVerse.png │ ├── LiveDemo.png │ ├── createDBinstance.png │ ├── figure1.png │ ├── figure4.png │ ├── postgreSQLdb.png │ ├── runningInstance.png │ ├── versions.png │ └── versions1.png │ └── Tables │ ├── table 1.png │ └── table1.png ├── README.md └── Scripts ├── CSV ├── Dockerfile.txt ├── FAVORannotatorCSVEssentialDB.R ├── FAVORannotatorCSVFullDB.R ├── FAVORannotatorv2aGDS.r ├── config.R ├── convertVCFtoGDS.r ├── subBatchJobs.sh ├── subBatchJobs.txt └── submitJobs.sh ├── Cloud ├── .DS_Store ├── ._.DS_Store ├── DNAnexus │ ├── ._FAVORannotatorDev.R │ ├── ._code.sh │ ├── ._favorannotator.R │ ├── FAVORannotatorDev.R │ ├── code.sh │ └── favorannotator.R └── Terra │ ├── .DS_Store │ ├── .Rhistory │ ├── FAVORannotatorEssentialDB.wdl │ ├── FAVORannotatorFullDB.wdl │ ├── FAVORannotatorTerra.r │ ├── FAVORannotatorTerraEssentialDB.R │ ├── FAVORannotatorTerraFullDB.R │ ├── convertVCFtoGDS.R │ ├── headercolumn.txt │ └── test.R ├── Dockerize ├── Dockerfile.txt ├── ExampleDockerFiles.txt └── install_packages.R ├── SQL ├── FAVORannotatorv2aGDS.r ├── FAVORdatabase_chrsplit.csv ├── config.R ├── convertVCFtoGDS.r ├── convertVCFtoNullGenotypeGDS.r ├── importCommands.sql └── submitJobs.sh └── UTL ├── FAVORannotatorAddIn.R ├── convBCF2GDS.r ├── convertVCFtoGDS.r ├── convertVCFtoNullGenotypeGDS.r ├── convertaGDStoVCF.r └── preProcessingVCF.sh /Data/TestData/1000G/All.chr22.27022019.GRCh38.phased.gds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Data/TestData/1000G/All.chr22.27022019.GRCh38.phased.gds -------------------------------------------------------------------------------- /Data/TestData/1000G/LOG.txt: -------------------------------------------------------------------------------- 1 | wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chr22.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz 2 | 3 | cd ../../../Scripts/UTL 4 | 5 | Rscript convertVCFtoGDS.r ../../Data/TestData/1000G/ALL.chr22.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz ../../Data/TestData/1000G/All.chr22.27022019.GRCh38.phased.gds 6 | 7 | 8 | Rscript convertVCFtoGDS.r ../../Data/TestData/Input/ALL.chr22.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz ../../Data/1000G/All.chr22.27022019.GRCh38.phased.gds 9 | Tue Sep 13 09:41:36 2022 10 | Variant Call Format (VCF) Import: 11 | file(s): 12 | ALL.chr22.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz (176.9M) 13 | file format: VCFv4.3 14 | the number of sets of chromosomes (ploidy): 2 15 | the number of samples: 2,548 16 | genotype storage: bit2 17 | compression method: LZMA_RA 18 | # of samples: 2548 19 | Output: 20 | ../../Data/1000G/All.chr22.27022019.GRCh38.phased.gds 21 | Parsing 'ALL.chr22.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz': 22 | + genotype/data { Bit2 2x2548x1059079 LZMA_ra(1.98%), 25.5M } 23 | 
Digests: 24 | sample.id [md5: cc8afb576aed4d02012126932df7cad6] 25 | variant.id [md5: 7c017d53094de68d314b6ad6d5731cee] 26 | position [md5: 661ae4bc37d222bc242b379ac5b4103c] 27 | chromosome [md5: 0f71906ff5f7af239ab447459e0fd340] 28 | allele [md5: e03733491972cf350905736dc3ba7897] 29 | genotype [md5: 310a491df81e5e5d015cfd8b0534c343] 30 | phase [md5: feef32f42a2bebbf7e8aca22a385acef] 31 | annotation/id [md5: af0e6be931baefc61425e7d80e8a7d6c] 32 | annotation/qual [md5: de3d57a832d4552c0b92a592f0c30ab3] 33 | annotation/filter [md5: 12aa343d303c14e0e724b2c3ac634d59] 34 | annotation/info/AF [md5: 08ba51bd9a4fe4c8d65124d906d651be] 35 | annotation/info/AC [md5: f50cf8580f617f21755b775c998a79a7] 36 | annotation/info/NS [md5: 3f8d2c2fe9b610e0407b63069cdcca19] 37 | annotation/info/AN [md5: 66dc16416504683004b60bf1259370d3] 38 | annotation/info/EAS_AF [md5: 6268475df4da4ecfe85ff45a31985bf2] 39 | annotation/info/EUR_AF [md5: 11f69a8880a343f916f428d428ee0e3e] 40 | annotation/info/AFR_AF [md5: cde11169e2c527e079563326ec5eb603] 41 | annotation/info/AMR_AF [md5: d85787dac3642db9f70cb05a9f22248a] 42 | annotation/info/SAS_AF [md5: 70bb72b5bf8b850a68da314769c6b09d] 43 | annotation/info/VT [md5: f7172d73a09bf45b641029eb2bde879e] 44 | annotation/info/EX_TARGET [md5: 401261c4071060a74aa7994bdce29065] 45 | annotation/info/DP [md5: 47cd81d4a60b61552a300cb09fa0a2cf] 46 | Done. 47 | Tue Sep 13 09:44:39 2022 48 | Optimize the access efficiency ... 49 | Clean up the fragments of GDS file: 50 | open the file '../../Data/1000G/All.chr22.27022019.GRCh38.phased.gds' (31.9M) 51 | # of fragments: 795 52 | save to '../../Data/1000G/All.chr22.27022019.GRCh38.phased.gds.tmp' 53 | rename '../../Data/1000G/All.chr22.27022019.GRCh38.phased.gds.tmp' (31.9M, reduced: 8.2K) 54 | # of fragments: 92 55 | Object of class "SeqVarGDSClass" 56 | File: ./Data/1000G/All.chr22.27022019.GRCh38.phased.gds (31.9M) 57 | + [ ] * 58 | |--+ description [ ] * 59 | |--+ sample.id { Str8 2548 LZMA_ra(7.84%), 1.6K } * 60 | |--+ variant.id { Int32 1059079 LZMA_ra(6.20%), 256.6K } * 61 | |--+ position { Int32 1059079 LZMA_ra(27.0%), 1.1M } * 62 | |--+ chromosome { Str8 1059079 LZMA_ra(0.02%), 617B } * 63 | |--+ allele { Str8 1059079 LZMA_ra(15.4%), 665.6K } * 64 | |--+ genotype [ ] * 65 | | |--+ data { Bit2 2x2548x1059079 LZMA_ra(1.98%), 25.5M } * 66 | | |--+ extra.index { Int32 3x0 LZMA_ra, 18B } * 67 | | \--+ extra { Int16 0 LZMA_ra, 18B } 68 | |--+ phase [ ] 69 | | |--+ data { Bit1 2548x1059079 LZMA_ra(0.01%), 48.1K } * 70 | | |--+ extra.index { Int32 3x0 LZMA_ra, 18B } * 71 | | \--+ extra { Bit1 0 LZMA_ra, 18B } 72 | |--+ annotation [ ] 73 | | |--+ id { Str8 1059079 LZMA_ra(0.03%), 305B } * 74 | | |--+ qual { Float32 1059079 LZMA_ra(0.02%), 777B } * 75 | | |--+ filter { Int32,factor 1059079 LZMA_ra(0.02%), 777B } * 76 | | |--+ info [ ] 77 | | | |--+ AF { Float32 1059079 LZMA_ra(7.72%), 319.6K } * 78 | | | |--+ AC { Int32 1059079 LZMA_ra(19.0%), 788.0K } * 79 | | | |--+ NS { Int32 1059079 LZMA_ra(0.02%), 777B } * 80 | | | |--+ AN { Int32 1059079 LZMA_ra(0.02%), 777B } * 81 | | | |--+ EAS_AF { Float32 1059079 LZMA_ra(5.73%), 237.2K } * 82 | | | |--+ EUR_AF { Float32 1059079 LZMA_ra(6.18%), 255.7K } * 83 | | | |--+ AFR_AF { Float32 1059079 LZMA_ra(8.56%), 354.1K } * 84 | | | |--+ AMR_AF { Float32 1059079 LZMA_ra(6.70%), 277.2K } * 85 | | | |--+ SAS_AF { Float32 1059079 LZMA_ra(6.45%), 266.8K } * 86 | | | |--+ VT { Str8 1059079 LZMA_ra(2.06%), 88.0K } * 87 | | | |--+ EX_TARGET { Bit1 1059079 LZMA_ra(6.62%), 8.6K } * 88 | | | \--+ DP { Int32 1059079 
LZMA_ra(45.0%), 1.8M } * 89 | | \--+ format [ ] 90 | \--+ sample.annotation [ ] 91 | 92 | 93 | 94 | 95 | 96 | 97 | Rscript convertVCFtoGDS.r ../../Data/TestData/1000G/ALL.chr1.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz ../../Data/TestData/1000G/All.chr1.27022019.GRCh38.phased.gds 98 | Wed Sep 14 15:34:06 2022 99 | Variant Call Format (VCF) Import: 100 | file(s): 101 | ALL.chr1.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz (992.0M) 102 | file format: VCFv4.3 103 | the number of sets of chromosomes (ploidy): 2 104 | the number of samples: 2,548 105 | genotype storage: bit2 106 | compression method: LZMA_RA 107 | # of samples: 2548 108 | Output: 109 | ../../Data/TestData/1000G/All.chr1.27022019.GRCh38.phased.gds 110 | Parsing 'ALL.chr1.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz': 111 | + genotype/data { Bit2 2x2548x6191833 LZMA_ra(1.66%), 124.8M } 112 | Digests: 113 | sample.id [md5: cc8afb576aed4d02012126932df7cad6] 114 | variant.id [md5: 279f387b3778a8c9d445cfb30d10a171] 115 | position [md5: a6c28615b3f1ee1c947a1b05edee371c] 116 | chromosome [md5: 54f1983c4511a2c4390a5d5df7caa405] 117 | allele [md5: 05ba696424c744191d4fa03bbbb513da] 118 | genotype [md5: 1cc8fb9f7c258ad52077d9fba0cd0b28] 119 | phase [md5: 65e7447bf92f4d3f01ed41bef75a7909] 120 | annotation/id [md5: dbfffce3ef30f3f18399e274c1310a1b] 121 | annotation/qual [md5: fce0966ea8b5452c728dedad5bc274e9] 122 | annotation/filter [md5: 2816dfe618d22aecb05a7c7ee4dd0b15] 123 | annotation/info/AF [md5: 11a4bf622d63d108bb353ea9fc1401df] 124 | annotation/info/AC [md5: dee598f41dd55a2e53df96d7ac639e23] 125 | annotation/info/NS [md5: 03af0bc2c44c6ca6bef0e93a68260dce] 126 | annotation/info/AN [md5: 5cf4736661784e0f51f33d5c7514a963] 127 | annotation/info/EAS_AF [md5: 297f3fa3a884a7f4e8cf1a977fce980c] 128 | annotation/info/EUR_AF [md5: 7ad89550530d1118871c8a4013b6a04f] 129 | annotation/info/AFR_AF [md5: 24f627bbc19e5e300ed57ee265e67904] 130 | annotation/info/AMR_AF [md5: 5a84744b92f10161982d1143896a19ec] 131 | annotation/info/SAS_AF [md5: dfaa87f4f0e77c5375c657652f9c0fd8] 132 | annotation/info/VT [md5: afbc4e4d62b4497a42d627dd938ae5ac] 133 | annotation/info/EX_TARGET [md5: dbf69680949973e984135aac3ab2d290] 134 | annotation/info/DP [md5: a64abd5bc289ef2b94d8807371ef601c] 135 | Done. 136 | Wed Sep 14 15:54:28 2022 137 | Optimize the access efficiency ... 
138 | Clean up the fragments of GDS file: 139 | open the file '../../Data/TestData/1000G/All.chr1.27022019.GRCh38.phased.gds' (161.1M) 140 | # of fragments: 3534 141 | save to '../../Data/TestData/1000G/All.chr1.27022019.GRCh38.phased.gds.tmp' 142 | rename '../../Data/TestData/1000G/All.chr1.27022019.GRCh38.phased.gds.tmp' (161.0M, reduced: 40.3K) 143 | # of fragments: 92 144 | Wed Sep 14 15:54:29 2022 145 | [1] "GDS built" 146 | Object of class "SeqVarGDSClass" 147 | File: ./Data/TestData/1000G/All.chr1.27022019.GRCh38.phased.gds (161.0M) 148 | + [ ] * 149 | |--+ description [ ] * 150 | |--+ sample.id { Str8 2548 LZMA_ra(7.84%), 1.6K } * 151 | |--+ variant.id { Int32 6191833 LZMA_ra(2.92%), 706.4K } * 152 | |--+ position { Int32 6191833 LZMA_ra(27.9%), 6.6M } * 153 | |--+ chromosome { Str8 6191833 LZMA_ra(0.02%), 1.9K } * 154 | |--+ allele { Str8 6191833 LZMA_ra(15.7%), 3.9M } * 155 | |--+ genotype [ ] * 156 | | |--+ data { Bit2 2x2548x6191833 LZMA_ra(1.66%), 124.8M } * 157 | | |--+ extra.index { Int32 3x0 LZMA_ra, 18B } * 158 | | \--+ extra { Int16 0 LZMA_ra, 18B } 159 | |--+ phase [ ] 160 | | |--+ data { Bit1 2548x6191833 LZMA_ra(0.01%), 280.4K } * 161 | | |--+ extra.index { Int32 3x0 LZMA_ra, 18B } * 162 | | \--+ extra { Bit1 0 LZMA_ra, 18B } 163 | |--+ annotation [ ] 164 | | |--+ id { Str8 6191833 LZMA_ra(0.02%), 1.0K } * 165 | | |--+ qual { Float32 6191833 LZMA_ra(0.02%), 3.7K } * 166 | | |--+ filter { Int32,factor 6191833 LZMA_ra(0.02%), 3.7K } * 167 | | |--+ info [ ] 168 | | | |--+ AF { Float32 6191833 LZMA_ra(7.39%), 1.7M } * 169 | | | |--+ AC { Int32 6191833 LZMA_ra(18.5%), 4.4M } * 170 | | | |--+ NS { Int32 6191833 LZMA_ra(0.02%), 3.7K } * 171 | | | |--+ AN { Int32 6191833 LZMA_ra(0.02%), 3.7K } * 172 | | | |--+ EAS_AF { Float32 6191833 LZMA_ra(5.54%), 1.3M } * 173 | | | |--+ EUR_AF { Float32 6191833 LZMA_ra(5.90%), 1.4M } * 174 | | | |--+ AFR_AF { Float32 6191833 LZMA_ra(8.20%), 1.9M } * 175 | | | |--+ AMR_AF { Float32 6191833 LZMA_ra(6.52%), 1.5M } * 176 | | | |--+ SAS_AF { Float32 6191833 LZMA_ra(6.17%), 1.5M } * 177 | | | |--+ VT { Str8 6191833 LZMA_ra(2.09%), 522.7K } * 178 | | | |--+ EX_TARGET { Bit1 6191833 LZMA_ra(4.82%), 36.4K } * 179 | | | \--+ DP { Int32 6191833 LZMA_ra(44.0%), 10.4M } * 180 | | \--+ format [ ] 181 | \--+ sample.annotation [ ] 182 | 183 | 184 | zhou@M1 Full % cp ../../FAVORannotator/Scripts/CSV/FAVORannotatorCSVFullDB.R . 185 | zhou@M1 Full % Rscript FAVORannotatorCSVFullDB.R All.chr22.27022019.GRCh38.phased.gds 22 186 | [1] "gds.file: All.chr22.27022019.GRCh38.phased.gds" 187 | [1] "chr: 22" 188 | [1] "use_compression: Yes" 189 | --2022-09-14 16:39:31-- https://dataverse.harvard.edu/api/access/datafile/6358299 190 | Resolving dataverse.harvard.edu (dataverse.harvard.edu)... 3.219.100.164, 54.211.138.37, 3.226.192.24 191 | Connecting to dataverse.harvard.edu (dataverse.harvard.edu)|3.219.100.164|:443... connected. 192 | HTTP request sent, awaiting response... 
303 See Other 193 | Location: https://dvn-cloud.s3.amazonaws.com/10.7910/DVN/KFUBKG/181abe84e49-d907a5916c0e?response-content-disposition=attachment%3B%20filename%2A%3DUTF-8%27%27FAVOR.FullDB.Chr22.tar.gz&response-content-type=application%2Fgzip&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20220914T203932Z&X-Amz-SignedHeaders=host&X-Amz-Expires=3600&X-Amz-Credential=AKIAIEJ3NV7UYCSRJC7A%2F20220914%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Signature=705ebe6c586f25f965028b6008bcafc810a4afeec25a6aaca5eca23c11ff87a2 [following] 194 | --2022-09-14 16:39:32-- https://dvn-cloud.s3.amazonaws.com/10.7910/DVN/KFUBKG/181abe84e49-d907a5916c0e?response-content-disposition=attachment%3B%20filename%2A%3DUTF-8%27%27FAVOR.FullDB.Chr22.tar.gz&response-content-type=application%2Fgzip&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20220914T203932Z&X-Amz-SignedHeaders=host&X-Amz-Expires=3600&X-Amz-Credential=AKIAIEJ3NV7UYCSRJC7A%2F20220914%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Signature=705ebe6c586f25f965028b6008bcafc810a4afeec25a6aaca5eca23c11ff87a2 195 | Resolving dvn-cloud.s3.amazonaws.com (dvn-cloud.s3.amazonaws.com)... 3.5.8.193 196 | Connecting to dvn-cloud.s3.amazonaws.com (dvn-cloud.s3.amazonaws.com)|3.5.8.193|:443... connected. 197 | HTTP request sent, awaiting response... 200 OK 198 | Length: 9565974525 (8.9G) [application/gzip] 199 | Saving to: ‘6358299’ 200 | 201 | 6358299 88%[===================================================================================================> 6358299 6358299 100%[================================================================================================================>] 8.91G 3.71MB/s in 44m 54s 202 | 203 | 2022-09-14 17:24:26 (3.39 MB/s) - ‘6358299’ saved [9565974525/9565974525] 204 | 205 | x chr22_1.csv 206 | x chr22_2.csv 207 | x chr22_3.csv 208 | Object of class "SeqVarGDSClass" 209 | File: /Users/zhou/Storage/Research/Projects/Test/Full/All.chr22.27022019.GRCh38.phased.gds (31.9M) 210 | + [ ] * 211 | |--+ description [ ] * 212 | |--+ sample.id { Str8 2548 LZMA_ra(7.84%), 1.6K } * 213 | |--+ variant.id { Int32 1059079 LZMA_ra(6.20%), 256.6K } * 214 | |--+ position { Int32 1059079 LZMA_ra(27.0%), 1.1M } * 215 | |--+ chromosome { Str8 1059079 LZMA_ra(0.02%), 617B } * 216 | |--+ allele { Str8 1059079 LZMA_ra(15.4%), 665.6K } * 217 | |--+ genotype [ ] * 218 | | |--+ data { Bit2 2x2548x1059079 LZMA_ra(1.98%), 25.5M } * 219 | | |--+ extra.index { Int32 3x0 LZMA_ra, 18B } * 220 | | \--+ extra { Int16 0 LZMA_ra, 18B } 221 | |--+ phase [ ] 222 | | |--+ data { Bit1 2548x1059079 LZMA_ra(0.01%), 48.1K } * 223 | | |--+ extra.index { Int32 3x0 LZMA_ra, 18B } * 224 | | \--+ extra { Bit1 0 LZMA_ra, 18B } 225 | |--+ annotation [ ] 226 | | |--+ id { Str8 1059079 LZMA_ra(0.03%), 305B } * 227 | | |--+ qual { Float32 1059079 LZMA_ra(0.02%), 777B } * 228 | | |--+ filter { Int32,factor 1059079 LZMA_ra(0.02%), 777B } * 229 | | |--+ info [ ] 230 | | | |--+ AF { Float32 1059079 LZMA_ra(7.72%), 319.6K } * 231 | | | |--+ AC { Int32 1059079 LZMA_ra(19.0%), 788.0K } * 232 | | | |--+ NS { Int32 1059079 LZMA_ra(0.02%), 777B } * 233 | | | |--+ AN { Int32 1059079 LZMA_ra(0.02%), 777B } * 234 | | | |--+ EAS_AF { Float32 1059079 LZMA_ra(5.73%), 237.2K } * 235 | | | |--+ EUR_AF { Float32 1059079 LZMA_ra(6.18%), 255.7K } * 236 | | | |--+ AFR_AF { Float32 1059079 LZMA_ra(8.56%), 354.1K } * 237 | | | |--+ AMR_AF { Float32 1059079 LZMA_ra(6.70%), 277.2K } * 238 | | | |--+ SAS_AF { Float32 1059079 LZMA_ra(6.45%), 266.8K } * 239 | | | |--+ VT { Str8 1059079 LZMA_ra(2.06%), 88.0K } * 240 | | | |--+ 
EX_TARGET { Bit1 1059079 LZMA_ra(6.62%), 8.6K } * 241 | | | \--+ DP { Int32 1059079 LZMA_ra(45.0%), 1.8M } * 242 | | \--+ format [ ] 243 | \--+ sample.annotation [ ] 244 | [1] 1 245 | [1] 2 246 | [1] 3 247 | [1] 1 248 | [1] 2 249 | [1] 3 250 | ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── 251 | cols( 252 | .default = col_double(), 253 | VarInfo = col_character(), 254 | variant_vcf = col_character(), 255 | variant_annovar = col_character(), 256 | ref_annovar = col_character(), 257 | alt_annovar = col_character(), 258 | ref_vcf = col_character(), 259 | alt_vcf = col_character(), 260 | aloft_value = col_logical(), 261 | aloft_description = col_logical(), 262 | filter_status = col_character(), 263 | cage_enhancer = col_logical(), 264 | cage_promoter = col_logical(), 265 | cage_tc = col_character(), 266 | clnsig = col_logical(), 267 | clnsigincl = col_logical(), 268 | clndn = col_logical(), 269 | clndnincl = col_logical(), 270 | clnrevstat = col_logical(), 271 | origin = col_logical(), 272 | clndisdb = col_logical() 273 | # ... with 38 more columns 274 | ) 275 | ℹ Use `spec()` for the full column specifications. 276 | 277 | Warning: 7150897 parsing failures. 278 | row col expected actual file 279 | 1421 cage_promoter 1/0/T/F/TRUE/FALSE chr22:10729410..10729413,- './chr22/Anno_chr22.csv' 280 | 1466 cage_promoter 1/0/T/F/TRUE/FALSE chr22:10730403..10730417,- './chr22/Anno_chr22.csv' 281 | 1467 cage_promoter 1/0/T/F/TRUE/FALSE chr22:10730403..10730417,- './chr22/Anno_chr22.csv' 282 | 1468 cage_promoter 1/0/T/F/TRUE/FALSE chr22:10730403..10730417,- './chr22/Anno_chr22.csv' 283 | 1469 cage_promoter 1/0/T/F/TRUE/FALSE chr22:10730403..10730417,- './chr22/Anno_chr22.csv' 284 | .... ............. .................. .......................... ........................ 285 | See problems(...) for more details. 
286 | 287 | [1] 1059079 190 288 | There were 14 warnings (use warnings() to see them) 289 | Object of class "SeqVarGDSClass" 290 | File: /Users/zhou/Storage/Research/Projects/Test/Full/All.chr22.27022019.GRCh38.phased.gds (226.1M) 291 | + [ ] * 292 | |--+ description [ ] * 293 | |--+ sample.id { Str8 2548 LZMA_ra(7.84%), 1.6K } * 294 | |--+ variant.id { Int32 1059079 LZMA_ra(6.20%), 256.6K } * 295 | |--+ position { Int32 1059079 LZMA_ra(27.0%), 1.1M } * 296 | |--+ chromosome { Str8 1059079 LZMA_ra(0.02%), 617B } * 297 | |--+ allele { Str8 1059079 LZMA_ra(15.4%), 665.6K } * 298 | |--+ genotype [ ] * 299 | | |--+ data { Bit2 2x2548x1059079 LZMA_ra(1.98%), 25.5M } * 300 | | |--+ extra.index { Int32 3x0 LZMA_ra, 18B } * 301 | | \--+ extra { Int16 0 LZMA_ra, 18B } 302 | |--+ phase [ ] 303 | | |--+ data { Bit1 2548x1059079 LZMA_ra(0.01%), 48.1K } * 304 | | |--+ extra.index { Int32 3x0 LZMA_ra, 18B } * 305 | | \--+ extra { Bit1 0 LZMA_ra, 18B } 306 | |--+ annotation [ ] 307 | | |--+ id { Str8 1059079 LZMA_ra(0.03%), 305B } * 308 | | |--+ qual { Float32 1059079 LZMA_ra(0.02%), 777B } * 309 | | |--+ filter { Int32,factor 1059079 LZMA_ra(0.02%), 777B } * 310 | | |--+ info [ ] 311 | | | |--+ AF { Float32 1059079 LZMA_ra(7.72%), 319.6K } * 312 | | | |--+ AC { Int32 1059079 LZMA_ra(19.0%), 788.0K } * 313 | | | |--+ NS { Int32 1059079 LZMA_ra(0.02%), 777B } * 314 | | | |--+ AN { Int32 1059079 LZMA_ra(0.02%), 777B } * 315 | | | |--+ EAS_AF { Float32 1059079 LZMA_ra(5.73%), 237.2K } * 316 | | | |--+ EUR_AF { Float32 1059079 LZMA_ra(6.18%), 255.7K } * 317 | | | |--+ AFR_AF { Float32 1059079 LZMA_ra(8.56%), 354.1K } * 318 | | | |--+ AMR_AF { Float32 1059079 LZMA_ra(6.70%), 277.2K } * 319 | | | |--+ SAS_AF { Float32 1059079 LZMA_ra(6.45%), 266.8K } * 320 | | | |--+ VT { Str8 1059079 LZMA_ra(2.06%), 88.0K } * 321 | | | |--+ EX_TARGET { Bit1 1059079 LZMA_ra(6.62%), 8.6K } * 322 | | | |--+ DP { Int32 1059079 LZMA_ra(45.0%), 1.8M } * 323 | | | \--+ FAVORFullDBAug1st2022 [ spec_tbl_df,tbl_df,tbl,data.frame,list ] * 324 | | | |--+ VarInfo { Str8 1059079 LZMA_ra(15.6%), 2.5M } 325 | | | |--+ vid { Float64 1059079 LZMA_ra(21.8%), 1.8M } 326 | | | |--+ variant_vcf { Str8 1059079 LZMA_ra(15.5%), 2.5M } 327 | | | |--+ variant_annovar { Str8 1059079 LZMA_ra(11.7%), 2.9M } 328 | | | |--+ chromosome { Float64 1059079 LZMA_ra(0.35%), 29.4K } 329 | | | |--+ start_position { Float64 1059079 LZMA_ra(18.4%), 1.5M } 330 | | | |--+ end_position { Float64 1059079 LZMA_ra(18.4%), 1.5M } 331 | | | |--+ ref_annovar { Str8 1059079 LZMA_ra(17.9%), 383.0K } 332 | | | |--+ alt_annovar { Str8 1059079 LZMA_ra(16.9%), 351.1K } 333 | | | |--+ position { Float64 1059079 LZMA_ra(18.4%), 1.5M } 334 | | | |--+ ref_vcf { Str8 1059079 LZMA_ra(18.4%), 399.5K } 335 | | | |--+ alt_vcf { Str8 1059079 LZMA_ra(16.6%), 347.3K } 336 | | | |--+ aloft_value { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 337 | | | |--+ aloft_description { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 338 | | | |--+ apc_conservation { Float64 1059079 LZMA_ra(87.0%), 7.0M } 339 | | | |--+ apc_conservation_v2 { Float64 1059079 LZMA_ra(86.9%), 7.0M } 340 | | | |--+ apc_epigenetics_active { Float64 1059079 LZMA_ra(79.4%), 6.4M } 341 | | | |--+ apc_epigenetics { Float64 1059079 LZMA_ra(86.2%), 7.0M } 342 | | | |--+ apc_epigenetics_repressed { Float64 1059079 LZMA_ra(48.9%), 3.9M } 343 | | | |--+ apc_epigenetics_transcription { Float64 1059079 LZMA_ra(48.5%), 3.9M } 344 | | | |--+ apc_local_nucleotide_diversity { Float64 1059079 LZMA_ra(1.92%), 158.5K } 345 | | | |--+ 
apc_local_nucleotide_diversity_v2 { Float64 1059079 LZMA_ra(83.5%), 6.8M } 346 | | | |--+ apc_local_nucleotide_diversity_v3 { Float64 1059079 LZMA_ra(84.1%), 6.8M } 347 | | | |--+ apc_mappability { Float64 1059079 LZMA_ra(31.4%), 2.5M } 348 | | | |--+ apc_micro_rna { Float64 1059079 LZMA_ra(2.91%), 241.2K } 349 | | | |--+ apc_mutation_density { Float64 1059079 LZMA_ra(83.5%), 6.7M } 350 | | | |--+ apc_protein_function { Float64 1059079 LZMA_ra(2.57%), 212.7K } 351 | | | |--+ apc_protein_function_v2 { Float64 1059079 LZMA_ra(2.59%), 214.4K } 352 | | | |--+ apc_protein_function_v3 { Float64 1059079 LZMA_ra(2.58%), 213.5K } 353 | | | |--+ apc_proximity_to_coding { Float64 1059079 LZMA_ra(14.1%), 1.1M } 354 | | | |--+ apc_proximity_to_coding_v2 { Float64 1059079 LZMA_ra(6.53%), 540.0K } 355 | | | |--+ apc_proximity_to_tsstes { Float64 1059079 LZMA_ra(74.6%), 6.0M } 356 | | | |--+ apc_transcription_factor { Float64 1059079 LZMA_ra(8.40%), 694.7K } 357 | | | |--+ bravo_an { Float64 1059079 LZMA_ra(1.68%), 138.6K } 358 | | | |--+ bravo_af { Float64 1059079 LZMA_ra(26.3%), 2.1M } 359 | | | |--+ filter_status { Str8 1059079 LZMA_ra(4.39%), 205.2K } 360 | | | |--+ cage_enhancer { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 361 | | | |--+ cage_promoter { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 362 | | | |--+ cage_tc { Str8 1059079 LZMA_ra(5.61%), 98.1K } 363 | | | |--+ clnsig { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 364 | | | |--+ clnsigincl { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 365 | | | |--+ clndn { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 366 | | | |--+ clndnincl { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 367 | | | |--+ clnrevstat { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 368 | | | |--+ origin { Int32,logical 1059079 LZMA_ra(0.22%), 9.3K } * 369 | | | |--+ clndisdb { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 370 | | | |--+ clndisdbincl { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 371 | | | |--+ geneinfo { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 372 | | | |--+ polyphen2_hdiv_score { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 373 | | | |--+ polyphen2_hvar_score { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 374 | | | |--+ mutation_taster_score { Int32,logical 1059079 LZMA_ra(0.27%), 11.0K } * 375 | | | |--+ mutation_assessor_score { Int32,logical 1059079 LZMA_ra(0.05%), 2.2K } * 376 | | | |--+ metasvm_pred { Int32,logical 1059079 LZMA_ra(0.33%), 13.7K } * 377 | | | |--+ fathmm_xf { Float64 1059079 LZMA_ra(54.4%), 4.4M } 378 | | | |--+ funseq_value { Int32,logical 1059079 LZMA_ra(0.38%), 15.7K } * 379 | | | |--+ funseq_description { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 380 | | | |--+ genecode_comprehensive_category { Str8 1059079 LZMA_ra(0.64%), 66.9K } 381 | | | |--+ genecode_comprehensive_info { Str8 1059079 LZMA_ra(5.62%), 1.1M } 382 | | | |--+ genecode_comprehensive_exonic_category { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 383 | | | |--+ genecode_comprehensive_exonic_info { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 384 | | | |--+ genehancer { Str8 1059079 LZMA_ra(0.24%), 171.9K } 385 | | | |--+ af_total { Float64 1059079 LZMA_ra(56.3%), 4.6M } 386 | | | |--+ af_asj_female { Float64 1059079 LZMA_ra(9.65%), 798.1K } 387 | | | |--+ af_eas_female { Float64 1059079 LZMA_ra(9.85%), 815.3K } 388 | | | |--+ af_afr_male { Float64 1059079 LZMA_ra(34.1%), 2.8M } 389 | | | |--+ af_female { Float64 1059079 LZMA_ra(47.3%), 3.8M } 390 | | | |--+ af_fin_male { Float64 1059079 LZMA_ra(16.1%), 1.3M } 391 | | | |--+ 
af_oth_female { Float64 1059079 LZMA_ra(12.6%), 1.0M } 392 | | | |--+ af_ami { Float64 1059079 LZMA_ra(7.24%), 599.2K } 393 | | | |--+ af_oth { Float64 1059079 LZMA_ra(16.2%), 1.3M } 394 | | | |--+ af_male { Float64 1059079 LZMA_ra(48.7%), 3.9M } 395 | | | |--+ af_ami_female { Float64 1059079 LZMA_ra(6.35%), 525.5K } 396 | | | |--+ af_afr { Float64 1059079 LZMA_ra(42.0%), 3.4M } 397 | | | |--+ af_eas_male { Float64 1059079 LZMA_ra(10.3%), 855.4K } 398 | | | |--+ af_sas { Float64 1059079 LZMA_ra(16.0%), 1.3M } 399 | | | |--+ af_nfe_female { Float64 1059079 LZMA_ra(26.0%), 2.1M } 400 | | | |--+ af_asj_male { Float64 1059079 LZMA_ra(9.37%), 775.5K } 401 | | | |--+ af_raw { Float64 1059079 LZMA_ra(49.4%), 4.0M } 402 | | | |--+ af_oth_male { Float64 1059079 LZMA_ra(12.6%), 1.0M } 403 | | | |--+ af_nfe_male { Float64 1059079 LZMA_ra(24.4%), 2.0M } 404 | | | |--+ af_asj { Float64 1059079 LZMA_ra(11.5%), 947.8K } 405 | | | |--+ af_amr_male { Float64 1059079 LZMA_ra(22.3%), 1.8M } 406 | | | |--+ af_amr_female { Float64 1059079 LZMA_ra(21.0%), 1.7M } 407 | | | |--+ af_sas_female { Float64 1059079 LZMA_ra(8.97%), 742.4K } 408 | | | |--+ af_fin { Float64 1059079 LZMA_ra(16.9%), 1.4M } 409 | | | |--+ af_afr_female { Float64 1059079 LZMA_ra(35.8%), 2.9M } 410 | | | |--+ af_sas_male { Float64 1059079 LZMA_ra(14.8%), 1.2M } 411 | | | |--+ af_amr { Float64 1059079 LZMA_ra(26.8%), 2.2M } 412 | | | |--+ af_nfe { Float64 1059079 LZMA_ra(30.3%), 2.4M } 413 | | | |--+ af_eas { Float64 1059079 LZMA_ra(12.6%), 1.0M } 414 | | | |--+ af_ami_male { Float64 1059079 LZMA_ra(6.20%), 512.7K } 415 | | | |--+ af_fin_female { Float64 1059079 LZMA_ra(11.3%), 934.2K } 416 | | | |--+ linsight { Float64 1059079 LZMA_ra(25.8%), 2.1M } 417 | | | |--+ gc { Float64 1059079 LZMA_ra(9.14%), 756.2K } 418 | | | |--+ cpg { Float64 1059079 LZMA_ra(4.39%), 363.1K } 419 | | | |--+ min_dist_tss { Float64 1059079 LZMA_ra(18.5%), 1.5M } 420 | | | |--+ min_dist_tse { Float64 1059079 LZMA_ra(18.5%), 1.5M } 421 | | | |--+ sift_cat { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 422 | | | |--+ sift_val { Int32,logical 1059079 LZMA_ra(0.21%), 8.9K } * 423 | | | |--+ polyphen_cat { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 424 | | | |--+ polyphen_val { Int32,logical 1059079 LZMA_ra(0.15%), 6.1K } * 425 | | | |--+ priphcons { Float64 1059079 LZMA_ra(14.0%), 1.1M } 426 | | | |--+ mamphcons { Float64 1059079 LZMA_ra(9.34%), 773.0K } 427 | | | |--+ verphcons { Float64 1059079 LZMA_ra(9.14%), 755.9K } 428 | | | |--+ priphylop { Float64 1059079 LZMA_ra(14.7%), 1.2M } 429 | | | |--+ mamphylop { Float64 1059079 LZMA_ra(20.7%), 1.7M } 430 | | | |--+ verphylop { Float64 1059079 LZMA_ra(21.0%), 1.7M } 431 | | | |--+ bstatistic { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 432 | | | |--+ chmm_e1 { Float64 1059079 LZMA_ra(0.70%), 58.1K } 433 | | | |--+ chmm_e2 { Float64 1059079 LZMA_ra(0.63%), 52.2K } 434 | | | |--+ chmm_e3 { Float64 1059079 LZMA_ra(0.84%), 69.2K } 435 | | | |--+ chmm_e4 { Float64 1059079 LZMA_ra(0.98%), 81.3K } 436 | | | |--+ chmm_e5 { Float64 1059079 LZMA_ra(0.78%), 64.3K } 437 | | | |--+ chmm_e6 { Float64 1059079 LZMA_ra(0.65%), 53.7K } 438 | | | |--+ chmm_e7 { Float64 1059079 LZMA_ra(1.33%), 110.3K } 439 | | | |--+ chmm_e8 { Float64 1059079 LZMA_ra(0.97%), 80.0K } 440 | | | |--+ chmm_e9 { Float64 1059079 LZMA_ra(0.57%), 47.5K } 441 | | | |--+ chmm_e10 { Float64 1059079 LZMA_ra(0.69%), 57.4K } 442 | | | |--+ chmm_e11 { Float64 1059079 LZMA_ra(0.87%), 71.8K } 443 | | | |--+ chmm_e12 { Float64 1059079 LZMA_ra(1.37%), 113.7K } 444 | | 
| |--+ chmm_e13 { Float64 1059079 LZMA_ra(1.00%), 82.9K } 445 | | | |--+ chmm_e14 { Float64 1059079 LZMA_ra(0.99%), 81.6K } 446 | | | |--+ chmm_e15 { Float64 1059079 LZMA_ra(2.18%), 180.8K } 447 | | | |--+ chmm_e16 { Float64 1059079 LZMA_ra(0.55%), 45.6K } 448 | | | |--+ chmm_e17 { Float64 1059079 LZMA_ra(0.70%), 58.3K } 449 | | | |--+ chmm_e18 { Float64 1059079 LZMA_ra(0.69%), 57.5K } 450 | | | |--+ chmm_e19 { Float64 1059079 LZMA_ra(0.65%), 53.4K } 451 | | | |--+ chmm_e20 { Float64 1059079 LZMA_ra(0.63%), 52.4K } 452 | | | |--+ chmm_e21 { Float64 1059079 LZMA_ra(1.33%), 109.8K } 453 | | | |--+ chmm_e22 { Float64 1059079 LZMA_ra(1.03%), 85.6K } 454 | | | |--+ chmm_e23 { Float64 1059079 LZMA_ra(0.79%), 65.6K } 455 | | | |--+ chmm_e24 { Float64 1059079 LZMA_ra(1.31%), 108.6K } 456 | | | |--+ chmm_e25 { Float64 1059079 LZMA_ra(0.83%), 68.7K } 457 | | | |--+ gerp_rs { Float64 1059079 LZMA_ra(1.00%), 82.8K } 458 | | | |--+ gerp_rs_pval { Float64 1059079 LZMA_ra(1.41%), 117.1K } 459 | | | |--+ gerp_n { Float64 1059079 LZMA_ra(14.0%), 1.1M } 460 | | | |--+ gerp_s { Float64 1059079 LZMA_ra(18.9%), 1.5M } 461 | | | |--+ encodeh3k4me1_sum { Float64 1059079 LZMA_ra(18.2%), 1.5M } 462 | | | |--+ encodeh3k4me2_sum { Float64 1059079 LZMA_ra(17.4%), 1.4M } 463 | | | |--+ encodeh3k4me3_sum { Float64 1059079 LZMA_ra(17.4%), 1.4M } 464 | | | |--+ encodeh3k9ac_sum { Float64 1059079 LZMA_ra(17.5%), 1.4M } 465 | | | |--+ encodeh3k9me3_sum { Float64 1059079 LZMA_ra(17.7%), 1.4M } 466 | | | |--+ encodeh3k27ac_sum { Float64 1059079 LZMA_ra(17.9%), 1.4M } 467 | | | |--+ encodeh3k27me3_sum { Float64 1059079 LZMA_ra(18.5%), 1.5M } 468 | | | |--+ encodeh3k36me3_sum { Float64 1059079 LZMA_ra(17.4%), 1.4M } 469 | | | |--+ encodeh3k79me2_sum { Float64 1059079 LZMA_ra(17.6%), 1.4M } 470 | | | |--+ encodeh4k20me1_sum { Float64 1059079 LZMA_ra(18.0%), 1.5M } 471 | | | |--+ encodeh2afz_sum { Float64 1059079 LZMA_ra(17.9%), 1.4M } 472 | | | |--+ encode_dnase_sum { Float64 1059079 LZMA_ra(9.53%), 788.3K } 473 | | | |--+ encodetotal_rna_sum { Float64 1059079 LZMA_ra(5.74%), 474.7K } 474 | | | |--+ grantham { Int32,logical 1059079 LZMA_ra(0.04%), 1.8K } * 475 | | | |--+ freq100bp { Float64 1059079 LZMA_ra(1.96%), 162.4K } 476 | | | |--+ rare100bp { Float64 1059079 LZMA_ra(3.02%), 249.5K } 477 | | | |--+ sngl100bp { Float64 1059079 LZMA_ra(6.39%), 528.8K } 478 | | | |--+ freq1000bp { Float64 1059079 LZMA_ra(2.28%), 188.6K } 479 | | | |--+ rare1000bp { Float64 1059079 LZMA_ra(3.46%), 286.1K } 480 | | | |--+ sngl1000bp { Float64 1059079 LZMA_ra(7.47%), 617.8K } 481 | | | |--+ freq10000bp { Float64 1059079 LZMA_ra(2.64%), 218.1K } 482 | | | |--+ rare10000bp { Float64 1059079 LZMA_ra(4.01%), 332.0K } 483 | | | |--+ sngl10000bp { Float64 1059079 LZMA_ra(7.93%), 655.9K } 484 | | | |--+ remap_overlap_tf { Float64 1059079 LZMA_ra(4.28%), 354.0K } 485 | | | |--+ remap_overlap_cl { Float64 1059079 LZMA_ra(4.72%), 390.2K } 486 | | | |--+ cadd_rawscore { Float64 1059079 LZMA_ra(46.0%), 3.7M } 487 | | | |--+ cadd_phred { Float64 1059079 LZMA_ra(23.8%), 1.9M } 488 | | | |--+ k24_bismap { Float64 1059079 LZMA_ra(7.19%), 594.8K } 489 | | | |--+ k24_umap { Float64 1059079 LZMA_ra(3.94%), 325.6K } 490 | | | |--+ k36_bismap { Float64 1059079 LZMA_ra(4.25%), 351.3K } 491 | | | |--+ k36_umap { Float64 1059079 LZMA_ra(3.44%), 284.5K } 492 | | | |--+ k50_bismap { Float64 1059079 LZMA_ra(3.75%), 310.0K } 493 | | | |--+ k50_umap { Float64 1059079 LZMA_ra(2.69%), 222.6K } 494 | | | |--+ k100_bismap { Float64 1059079 LZMA_ra(1.97%), 162.7K } 495 | | | 
|--+ k100_umap { Float64 1059079 LZMA_ra(0.68%), 56.6K } 496 | | | |--+ nucdiv { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 497 | | | |--+ rdhs { Str8 1059079 LZMA_ra(2.52%), 112.2K } 498 | | | |--+ recombination_rate { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 499 | | | |--+ refseq_category { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 500 | | | |--+ refseq_info { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 501 | | | |--+ refseq_exonic_category { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 502 | | | |--+ refseq_exonic_info { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 503 | | | |--+ super_enhancer { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 504 | | | |--+ tg_afr { Int32,logical 1059079 LZMA_ra(0.08%), 3.2K } * 505 | | | |--+ tg_all { Int32,logical 1059079 LZMA_ra(0.05%), 2.3K } * 506 | | | |--+ tg_amr { Int32,logical 1059079 LZMA_ra(0.07%), 3.0K } * 507 | | | |--+ tg_eas { Int32,logical 1059079 LZMA_ra(0.13%), 5.5K } * 508 | | | |--+ tg_eur { Int32,logical 1059079 LZMA_ra(0.07%), 3.0K } * 509 | | | |--+ tg_sas { Int32,logical 1059079 LZMA_ra(0.08%), 3.4K } * 510 | | | |--+ ucsc_category { Str8 1059079 LZMA_ra(0.62%), 74.5K } 511 | | | |--+ ucsc_info { Str8 1059079 LZMA_ra(2.24%), 1.1M } 512 | | | |--+ ucsc_exonic_category { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 513 | | | \--+ ucsc_exonic_info { Int32,logical 1059079 LZMA_ra(0.02%), 777B } * 514 | | \--+ format [ ] 515 | \--+ sample.annotation [ ] 516 | [1] "time" 517 | Time difference of 1.045753 hours 518 | 519 | 520 | 521 | 522 | 523 | 524 | zhou@M1 Test % Rscript FAVORannotatorCSVEssentialDB.R All.chr22.27022019.GRCh38.phased.gds 22 525 | [1] "gds.file: All.chr22.27022019.GRCh38.phased.gds" 526 | [1] "chr: 22" 527 | [1] "use_compression: Yes" 528 | --2022-09-14 16:42:28-- https://dataverse.harvard.edu/api/access/datafile/6170504 529 | Resolving dataverse.harvard.edu (dataverse.harvard.edu)... 3.219.100.164, 3.226.192.24, 54.211.138.37 530 | Connecting to dataverse.harvard.edu (dataverse.harvard.edu)|3.219.100.164|:443... connected. 531 | HTTP request sent, awaiting response... 303 See Other 532 | Location: https://dvn-cloud.s3.amazonaws.com/10.7910/DVN/1VGTJI/17fe155b1d0-76967428f313?response-content-disposition=attachment%3B%20filename%2A%3DUTF-8%27%27chr22.tar.gz&response-content-type=application%2Fx-gzip&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20220914T204228Z&X-Amz-SignedHeaders=host&X-Amz-Expires=3600&X-Amz-Credential=AKIAIEJ3NV7UYCSRJC7A%2F20220914%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Signature=6be2b2b646fbb584c3a51af135b1558a0cb7d63bbffa6457da4ecde93573b489 [following] 533 | --2022-09-14 16:42:28-- https://dvn-cloud.s3.amazonaws.com/10.7910/DVN/1VGTJI/17fe155b1d0-76967428f313?response-content-disposition=attachment%3B%20filename%2A%3DUTF-8%27%27chr22.tar.gz&response-content-type=application%2Fx-gzip&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20220914T204228Z&X-Amz-SignedHeaders=host&X-Amz-Expires=3600&X-Amz-Credential=AKIAIEJ3NV7UYCSRJC7A%2F20220914%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Signature=6be2b2b646fbb584c3a51af135b1558a0cb7d63bbffa6457da4ecde93573b489 534 | Resolving dvn-cloud.s3.amazonaws.com (dvn-cloud.s3.amazonaws.com)... 52.217.171.57 535 | Connecting to dvn-cloud.s3.amazonaws.com (dvn-cloud.s3.amazonaws.com)|52.217.171.57|:443... connected. 536 | HTTP request sent, awaiting response... 
200 OK 537 | Length: 5574054308 (5.2G) [application/x-gzip] 538 | Saving to: ‘6170504’ 539 | 540 | 6170504 100%[================================================================================================================>] 5.19G 2.59MB/s in 32m 38s 541 | 542 | 2022-09-14 17:15:07 (2.72 MB/s) - ‘6170504’ saved [5574054308/5574054308] 543 | 544 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr22_1.csv 545 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr22_1.csv.idx 546 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr22_2.csv 547 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr22_2.csv.idx 548 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr22_3.csv 549 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr22_3.csv.idx 550 | Object of class "SeqVarGDSClass" 551 | File: /Users/zhou/Storage/Research/Projects/Test/All.chr22.27022019.GRCh38.phased.gds (87.4M) 552 | + [ ] * 553 | |--+ description [ ] * 554 | |--+ sample.id { Str8 2548 LZMA_ra(7.84%), 1.6K } * 555 | |--+ variant.id { Int32 1059079 LZMA_ra(6.20%), 256.6K } * 556 | |--+ position { Int32 1059079 LZMA_ra(27.0%), 1.1M } * 557 | |--+ chromosome { Str8 1059079 LZMA_ra(0.02%), 617B } * 558 | |--+ allele { Str8 1059079 LZMA_ra(15.4%), 665.6K } * 559 | |--+ genotype [ ] * 560 | | |--+ data { Bit2 2x2548x1059079 LZMA_ra(1.98%), 25.5M } * 561 | | |--+ extra.index { Int32 3x0 LZMA_ra, 18B } * 562 | | \--+ extra { Int16 0 LZMA_ra, 18B } 563 | |--+ phase [ ] 564 | | |--+ data { Bit1 2548x1059079 LZMA_ra(0.01%), 48.1K } * 565 | | |--+ extra.index { Int32 3x0 LZMA_ra, 18B } * 566 | | \--+ extra { Bit1 0 LZMA_ra, 18B } 567 | |--+ annotation [ ] 568 | | |--+ id { Str8 1059079 LZMA_ra(0.03%), 305B } * 569 | | |--+ qual { Float32 1059079 LZMA_ra(0.02%), 777B } * 570 | | |--+ filter { Int32,factor 1059079 LZMA_ra(0.02%), 777B } * 571 | | |--+ info [ ] 572 | | | |--+ AF { Float32 1059079 LZMA_ra(7.72%), 319.6K } * 573 | | | |--+ AC { Int32 1059079 LZMA_ra(19.0%), 788.0K } * 574 | | | |--+ NS { Int32 1059079 LZMA_ra(0.02%), 777B } * 575 | | | |--+ AN { Int32 1059079 LZMA_ra(0.02%), 777B } * 576 | | | |--+ EAS_AF { Float32 1059079 LZMA_ra(5.73%), 237.2K } * 577 | | | |--+ EUR_AF { Float32 1059079 LZMA_ra(6.18%), 255.7K } * 578 | | | |--+ AFR_AF { Float32 1059079 LZMA_ra(8.56%), 354.1K } * 579 | | | |--+ AMR_AF { Float32 1059079 LZMA_ra(6.70%), 277.2K } * 580 | | | |--+ SAS_AF { Float32 1059079 LZMA_ra(6.45%), 266.8K } * 581 | | | |--+ VT { Str8 1059079 LZMA_ra(2.06%), 88.0K } * 582 | | | |--+ EX_TARGET { Bit1 1059079 LZMA_ra(6.62%), 8.6K } * 583 | | | |--+ DP { Int32 1059079 LZMA_ra(45.0%), 1.8M } * 584 | | | |--+ FunctionalAnnotationJun1st2022 [ tbl_df,tbl,data.frame,list ] * 585 | | | \--+ FunctionalAnnotationAug1st2022 [ spec_tbl_df,tbl_df,tbl,data.frame,list ] * 586 | | | |--+ VarInfo { Str8 1059079 LZMA_ra(15.6%), 2.5M } 587 | | | |--+ apc_conservation { Float64 1059079 LZMA_ra(86.9%), 7.0M } 588 | | | |--+ apc_epigenetics { Float64 1059079 LZMA_ra(86.2%), 7.0M } 589 | | | |--+ apc_epigenetics_active { Float64 1059079 LZMA_ra(79.4%), 6.4M } 590 | | | |--+ apc_epigenetics_repressed { Float64 1059079 LZMA_ra(48.9%), 3.9M } 591 | | | |--+ apc_epigenetics_transcription { Float64 1059079 LZMA_ra(48.5%), 3.9M } 592 | | | |--+ apc_local_nucleotide_diversity { Float64 1059079 LZMA_ra(84.1%), 6.8M } 593 | | | |--+ apc_mappability { Float64 1059079 LZMA_ra(31.4%), 2.5M } 594 | | | |--+ apc_protein_function { Float64 1059079 LZMA_ra(2.58%), 213.5K } 595 | | | |--+ apc_transcription_factor { Float64 1059079 LZMA_ra(8.40%), 
694.7K } 596 | | | |--+ cage_tc { Str8 1059079 LZMA_ra(5.61%), 98.1K } 597 | | | |--+ metasvm_pred { Str8 1059079 LZMA_ra(1.25%), 13.1K } 598 | | | |--+ rsid { Str8 1059079 LZMA_ra(35.6%), 4.1M } 599 | | | |--+ fathmm_xf { Float64 1059079 LZMA_ra(54.4%), 4.4M } 600 | | | |--+ genecode_comprehensive_category { Str8 1059079 LZMA_ra(0.64%), 66.9K } 601 | | | |--+ genecode_comprehensive_info { Str8 1059079 LZMA_ra(5.62%), 1.1M } 602 | | | |--+ genecode_comprehensive_exonic_category { Str8 1059079 LZMA_ra(1.51%), 21.7K } 603 | | | |--+ genecode_comprehensive_exonic_info { Str8 1059079 LZMA_ra(7.62%), 330.8K } 604 | | | |--+ genehancer { Str8 1059079 LZMA_ra(0.24%), 171.9K } 605 | | | |--+ linsight { Float64 1059079 LZMA_ra(25.8%), 2.1M } 606 | | | |--+ cadd_phred { Float64 1059079 LZMA_ra(23.8%), 1.9M } 607 | | | \--+ rdhs { Str8 1059079 LZMA_ra(2.52%), 112.2K } 608 | | \--+ format [ ] 609 | \--+ sample.annotation [ ] 610 | [1] 1 611 | [1] 2 612 | [1] 3 613 | [1] 1 614 | [1] 2 615 | [1] 3 616 | [1] 1059079 22 617 | Warning messages: 618 | 1: In add.gdsn(ans, nm[i], val[[i]], compress = compress, closezip = closezip, : 619 | Missing characters are converted to "". 620 | 2: In add.gdsn(ans, nm[i], val[[i]], compress = compress, closezip = closezip, : 621 | Missing characters are converted to "". 622 | 3: In add.gdsn(ans, nm[i], val[[i]], compress = compress, closezip = closezip, : 623 | Missing characters are converted to "". 624 | 4: In add.gdsn(ans, nm[i], val[[i]], compress = compress, closezip = closezip, : 625 | Missing characters are converted to "". 626 | 5: In add.gdsn(ans, nm[i], val[[i]], compress = compress, closezip = closezip, : 627 | Missing characters are converted to "". 628 | 6: In add.gdsn(ans, nm[i], val[[i]], compress = compress, closezip = closezip, : 629 | Missing characters are converted to "". 630 | 7: In add.gdsn(ans, nm[i], val[[i]], compress = compress, closezip = closezip, : 631 | Missing characters are converted to "". 632 | 8: In add.gdsn(ans, nm[i], val[[i]], compress = compress, closezip = closezip, : 633 | Missing characters are converted to "". 634 | 9: In add.gdsn(ans, nm[i], val[[i]], compress = compress, closezip = closezip, : 635 | Missing characters are converted to "". 636 | [1] "time" 637 | Time difference of 14.08326 mins 638 | 639 | 640 | 641 | 642 | 643 | zhou@M1 Essential % Rscript FAVORannotatorCSVEssentialDB.R All.chr1.27022019.GRCh38.phased.gds 1 644 | [1] "gds.file: All.chr1.27022019.GRCh38.phased.gds" 645 | [1] "chr: 1" 646 | [1] "use_compression: Yes" 647 | --2022-09-14 16:24:39-- https://dataverse.harvard.edu/api/access/datafile/6170506 648 | Resolving dataverse.harvard.edu (dataverse.harvard.edu)... 54.211.138.37, 3.219.100.164, 3.226.192.24 649 | Connecting to dataverse.harvard.edu (dataverse.harvard.edu)|54.211.138.37|:443... connected. 650 | HTTP request sent, awaiting response... 
303 See Other 651 | Location: https://dvn-cloud.s3.amazonaws.com/10.7910/DVN/1VGTJI/17fe5944e75-2c901ebf815d?response-content-disposition=attachment%3B%20filename%2A%3DUTF-8%27%27chr1.tar.gz&response-content-type=application%2Fx-gzip&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20220914T202439Z&X-Amz-SignedHeaders=host&X-Amz-Expires=3600&X-Amz-Credential=AKIAIEJ3NV7UYCSRJC7A%2F20220914%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Signature=52a60668962768b84f03bc5e6f2084ddddb776848455215b7d7cfd9074e02fb8 [following] 652 | --2022-09-14 16:24:39-- https://dvn-cloud.s3.amazonaws.com/10.7910/DVN/1VGTJI/17fe5944e75-2c901ebf815d?response-content-disposition=attachment%3B%20filename%2A%3DUTF-8%27%27chr1.tar.gz&response-content-type=application%2Fx-gzip&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20220914T202439Z&X-Amz-SignedHeaders=host&X-Amz-Expires=3600&X-Amz-Credential=AKIAIEJ3NV7UYCSRJC7A%2F20220914%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Signature=52a60668962768b84f03bc5e6f2084ddddb776848455215b7d7cfd9074e02fb8 653 | Resolving dvn-cloud.s3.amazonaws.com (dvn-cloud.s3.amazonaws.com)... 52.217.133.185 654 | Connecting to dvn-cloud.s3.amazonaws.com (dvn-cloud.s3.amazonaws.com)|52.217.133.185|:443... connected. 655 | HTTP request sent, awaiting response... 200 OK 656 | Length: 33455185130 (31G) [application/x-gzip] 657 | Saving to: ‘6170506’ 658 | 659 | 6170506 100%[================================================================================================================>] 31.16G 5.23MB/s in 2h 2m 660 | 661 | 2022-09-14 18:27:07 (4.34 MB/s) - ‘6170506’ saved [33455185130/33455185130] 662 | 663 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_10.csv 664 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_10.csv.idx 665 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_11.csv 666 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_11.csv.idx 667 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_12.csv 668 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_12.csv.idx 669 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_13.csv 670 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_13.csv.idx 671 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_14.csv 672 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_14.csv.idx 673 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_1.csv 674 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_1.csv.idx 675 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_2.csv 676 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_2.csv.idx 677 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_3.csv 678 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_3.csv.idx 679 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_4.csv 680 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_4.csv.idx 681 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_5.csv 682 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_5.csv.idx 683 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_6.csv 684 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_6.csv.idx 685 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_7.csv 686 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_7.csv.idx 687 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_8.csv 688 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_8.csv.idx 689 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_9.csv 690 | x n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/chr1_9.csv.idx 691 | 
Object of class "SeqVarGDSClass" 692 | File: /Users/zhou/Storage/Research/Projects/Test/Essential/All.chr1.27022019.GRCh38.phased.gds (161.0M) 693 | + [ ] * 694 | |--+ description [ ] * 695 | |--+ sample.id { Str8 2548 LZMA_ra(7.84%), 1.6K } * 696 | |--+ variant.id { Int32 6191833 LZMA_ra(2.92%), 706.4K } * 697 | |--+ position { Int32 6191833 LZMA_ra(27.9%), 6.6M } * 698 | |--+ chromosome { Str8 6191833 LZMA_ra(0.02%), 1.9K } * 699 | |--+ allele { Str8 6191833 LZMA_ra(15.7%), 3.9M } * 700 | |--+ genotype [ ] * 701 | | |--+ data { Bit2 2x2548x6191833 LZMA_ra(1.66%), 124.8M } * 702 | | |--+ extra.index { Int32 3x0 LZMA_ra, 18B } * 703 | | \--+ extra { Int16 0 LZMA_ra, 18B } 704 | |--+ phase [ ] 705 | | |--+ data { Bit1 2548x6191833 LZMA_ra(0.01%), 280.4K } * 706 | | |--+ extra.index { Int32 3x0 LZMA_ra, 18B } * 707 | | \--+ extra { Bit1 0 LZMA_ra, 18B } 708 | |--+ annotation [ ] 709 | | |--+ id { Str8 6191833 LZMA_ra(0.02%), 1.0K } * 710 | | |--+ qual { Float32 6191833 LZMA_ra(0.02%), 3.7K } * 711 | | |--+ filter { Int32,factor 6191833 LZMA_ra(0.02%), 3.7K } * 712 | | |--+ info [ ] 713 | | | |--+ AF { Float32 6191833 LZMA_ra(7.39%), 1.7M } * 714 | | | |--+ AC { Int32 6191833 LZMA_ra(18.5%), 4.4M } * 715 | | | |--+ NS { Int32 6191833 LZMA_ra(0.02%), 3.7K } * 716 | | | |--+ AN { Int32 6191833 LZMA_ra(0.02%), 3.7K } * 717 | | | |--+ EAS_AF { Float32 6191833 LZMA_ra(5.54%), 1.3M } * 718 | | | |--+ EUR_AF { Float32 6191833 LZMA_ra(5.90%), 1.4M } * 719 | | | |--+ AFR_AF { Float32 6191833 LZMA_ra(8.20%), 1.9M } * 720 | | | |--+ AMR_AF { Float32 6191833 LZMA_ra(6.52%), 1.5M } * 721 | | | |--+ SAS_AF { Float32 6191833 LZMA_ra(6.17%), 1.5M } * 722 | | | |--+ VT { Str8 6191833 LZMA_ra(2.09%), 522.7K } * 723 | | | |--+ EX_TARGET { Bit1 6191833 LZMA_ra(4.82%), 36.4K } * 724 | | | \--+ DP { Int32 6191833 LZMA_ra(44.0%), 10.4M } * 725 | | \--+ format [ ] 726 | \--+ sample.annotation [ ] 727 | [1] 1 728 | [1] 2 729 | [1] 3 730 | [1] 4 731 | [1] 5 732 | [1] 6 733 | [1] 7 734 | [1] 8 735 | [1] 9 736 | [1] 10 737 | [1] 11 738 | [1] 12 739 | [1] 13 740 | [1] 14 741 | [1] 1 742 | 743 | [1] 2 744 | [1] 3 745 | [1] 4 746 | [1] 5 747 | [1] 6 748 | [1] 7 749 | [1] 8 750 | [1] 9 751 | [1] 10 752 | [1] 11 753 | [1] 12 754 | [1] 13 755 | [1] 14 756 | [1] 6191833 22 757 | Warning messages: 758 | 1: In add.gdsn(ans, nm[i], val[[i]], compress = compress, closezip = closezip, : 759 | Missing characters are converted to "". 760 | 2: In add.gdsn(ans, nm[i], val[[i]], compress = compress, closezip = closezip, : 761 | Missing characters are converted to "". 762 | 3: In add.gdsn(ans, nm[i], val[[i]], compress = compress, closezip = closezip, : 763 | Missing characters are converted to "". 764 | 4: In add.gdsn(ans, nm[i], val[[i]], compress = compress, closezip = closezip, : 765 | Missing characters are converted to "". 766 | 5: In add.gdsn(ans, nm[i], val[[i]], compress = compress, closezip = closezip, : 767 | Missing characters are converted to "". 768 | 6: In add.gdsn(ans, nm[i], val[[i]], compress = compress, closezip = closezip, : 769 | Missing characters are converted to "". 770 | 7: In add.gdsn(ans, nm[i], val[[i]], compress = compress, closezip = closezip, : 771 | Missing characters are converted to "". 772 | 8: In add.gdsn(ans, nm[i], val[[i]], compress = compress, closezip = closezip, : 773 | Missing characters are converted to "". 774 | 9: In add.gdsn(ans, nm[i], val[[i]], compress = compress, closezip = closezip, : 775 | Missing characters are converted to "". 
776 | Object of class "SeqVarGDSClass" 777 | File: /Users/zhou/Storage/Research/Projects/Test/Essential/All.chr1.27022019.GRCh38.phased.gds (485.6M) 778 | + [ ] * 779 | |--+ description [ ] * 780 | |--+ sample.id { Str8 2548 LZMA_ra(7.84%), 1.6K } * 781 | |--+ variant.id { Int32 6191833 LZMA_ra(2.92%), 706.4K } * 782 | |--+ position { Int32 6191833 LZMA_ra(27.9%), 6.6M } * 783 | |--+ chromosome { Str8 6191833 LZMA_ra(0.02%), 1.9K } * 784 | |--+ allele { Str8 6191833 LZMA_ra(15.7%), 3.9M } * 785 | |--+ genotype [ ] * 786 | | |--+ data { Bit2 2x2548x6191833 LZMA_ra(1.66%), 124.8M } * 787 | | |--+ extra.index { Int32 3x0 LZMA_ra, 18B } * 788 | | \--+ extra { Int16 0 LZMA_ra, 18B } 789 | |--+ phase [ ] 790 | | |--+ data { Bit1 2548x6191833 LZMA_ra(0.01%), 280.4K } * 791 | | |--+ extra.index { Int32 3x0 LZMA_ra, 18B } * 792 | | \--+ extra { Bit1 0 LZMA_ra, 18B } 793 | |--+ annotation [ ] 794 | | |--+ id { Str8 6191833 LZMA_ra(0.02%), 1.0K } * 795 | | |--+ qual { Float32 6191833 LZMA_ra(0.02%), 3.7K } * 796 | | |--+ filter { Int32,factor 6191833 LZMA_ra(0.02%), 3.7K } * 797 | | |--+ info [ ] 798 | | | |--+ AF { Float32 6191833 LZMA_ra(7.39%), 1.7M } * 799 | | | |--+ AC { Int32 6191833 LZMA_ra(18.5%), 4.4M } * 800 | | | |--+ NS { Int32 6191833 LZMA_ra(0.02%), 3.7K } * 801 | | | |--+ AN { Int32 6191833 LZMA_ra(0.02%), 3.7K } * 802 | | | |--+ EAS_AF { Float32 6191833 LZMA_ra(5.54%), 1.3M } * 803 | | | |--+ EUR_AF { Float32 6191833 LZMA_ra(5.90%), 1.4M } * 804 | | | |--+ AFR_AF { Float32 6191833 LZMA_ra(8.20%), 1.9M } * 805 | | | |--+ AMR_AF { Float32 6191833 LZMA_ra(6.52%), 1.5M } * 806 | | | |--+ SAS_AF { Float32 6191833 LZMA_ra(6.17%), 1.5M } * 807 | | | |--+ VT { Str8 6191833 LZMA_ra(2.09%), 522.7K } * 808 | | | |--+ EX_TARGET { Bit1 6191833 LZMA_ra(4.82%), 36.4K } * 809 | | | |--+ DP { Int32 6191833 LZMA_ra(44.0%), 10.4M } * 810 | | | \--+ FunctionalAnnotationJun1st2022 [ spec_tbl_df,tbl_df,tbl,data.frame,list ] * 811 | | | |--+ VarInfo { Str8 6191833 LZMA_ra(16.4%), 15.2M } 812 | | | |--+ apc_conservation { Float64 6191833 LZMA_ra(86.9%), 41.1M } 813 | | | |--+ apc_epigenetics { Float64 6191833 LZMA_ra(86.4%), 40.8M } 814 | | | |--+ apc_epigenetics_active { Float64 6191833 LZMA_ra(80.7%), 38.1M } 815 | | | |--+ apc_epigenetics_repressed { Float64 6191833 LZMA_ra(52.7%), 24.9M } 816 | | | |--+ apc_epigenetics_transcription { Float64 6191833 LZMA_ra(48.1%), 22.7M } 817 | | | |--+ apc_local_nucleotide_diversity { Float64 6191833 LZMA_ra(83.6%), 39.5M } 818 | | | |--+ apc_mappability { Float64 6191833 LZMA_ra(29.0%), 13.7M } 819 | | | |--+ apc_protein_function { Float64 6191833 LZMA_ra(2.17%), 1.0M } 820 | | | |--+ apc_transcription_factor { Float64 6191833 LZMA_ra(7.31%), 3.5M } 821 | | | |--+ cage_tc { Str8 6191833 LZMA_ra(4.99%), 451.4K } 822 | | | |--+ metasvm_pred { Str8 6191833 LZMA_ra(0.94%), 57.5K } 823 | | | |--+ rsid { Str8 6191833 LZMA_ra(35.7%), 24.2M } 824 | | | |--+ fathmm_xf { Float64 6191833 LZMA_ra(57.2%), 27.0M } 825 | | | |--+ genecode_comprehensive_category { Str8 6191833 LZMA_ra(0.57%), 360.1K } 826 | | | |--+ genecode_comprehensive_info { Str8 6191833 LZMA_ra(5.77%), 7.2M } 827 | | | |--+ genecode_comprehensive_exonic_category { Str8 6191833 LZMA_ra(1.17%), 89.4K } 828 | | | |--+ genecode_comprehensive_exonic_info { Str8 6191833 LZMA_ra(7.27%), 1.3M } 829 | | | |--+ genehancer { Str8 6191833 LZMA_ra(0.27%), 707.1K } 830 | | | |--+ linsight { Float64 6191833 LZMA_ra(22.9%), 10.8M } 831 | | | |--+ cadd_phred { Float64 6191833 LZMA_ra(23.8%), 11.2M } 832 | | | \--+ rdhs { Str8 
6191833 LZMA_ra(2.76%), 622.1K } 833 | | \--+ format [ ] 834 | \--+ sample.annotation [ ] 835 | 836 | -------------------------------------------------------------------------------- /Data/TestData/Input/._FAVOR.T2210k.gds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Data/TestData/Input/._FAVOR.T2210k.gds -------------------------------------------------------------------------------- /Data/TestData/Input/FAVOR.T2210k.gds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Data/TestData/Input/FAVOR.T2210k.gds -------------------------------------------------------------------------------- /Docs/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Docs/.DS_Store -------------------------------------------------------------------------------- /Docs/Tutorial/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Docs/Tutorial/.DS_Store -------------------------------------------------------------------------------- /Docs/Tutorial/Demos/FASRC.md: -------------------------------------------------------------------------------- 1 | # **Step-by-step tutorial of running FAVORannotator on FASRC Slurm Cluster** 2 | 3 | ## 4 | 5 | ## FAVORannotator runs smoothly on the FASRC Slurm cluster. Like many other Slurm clusters, FASRC has PostgreSQL installed, and PostgreSQL instances can be booted up quickly on different nodes, which vastly boosts performance and enables parallel computing. 6 | 7 | ## 1. Download the FAVORannotator data file from the FAVOR website: [http://favor.genohub.org](http://favor.genohub.org/). 8 | 9 | ## 2. Download the FAVORannotator data file, either the **whole genome** version (download [URL](https://drive.google.com/file/d/1izzKJliuouG2pCJ6MkcXd_oxoEwzx5RQ/view?usp=sharing)) or the **by chromosome** version (download [URL](https://drive.google.com/file/d/1Ccep9hmeWpIT_OH9IqS6p1MZbEonjG2z/view?usp=sharing)), or get it from the FAVOR website: [http://favor.genohub.org](http://favor.genohub.org/) 10 | ## 3. Set up the database on the Slurm cluster. 11 | 12 | ## 4. Install the FASRC VPN ([https://docs.rc.fas.harvard.edu/kb/vpn-setup/](https://docs.rc.fas.harvard.edu/kb/vpn-setup/)). To connect to the VPN, the Cisco AnyConnect client can be installed from the VPN portal ([https://downloads.rc.fas.harvard.edu](https://downloads.rc.fas.harvard.edu)). Note that you need to add @fasrc after your username in order to log in. 13 | 14 | 15 | ## 5. Once the VPN is connected, access the FASRC VDI ([https://docs.rc.fas.harvard.edu/kb/virtual-desktop/](https://docs.rc.fas.harvard.edu/kb/virtual-desktop/)). The following figure shows what the VDI interface looks like. 16 | 17 | ![VDI Interface](https://github.com/zhouhufeng/FAVORannotator/blob/main/Docs/Tutorial/Figures/FASRC1.jpg) 18 | 19 | _Figure 1. VDI Interface._ 20 | 21 | ## 7. Create a folder on FASRC where you would like to store the database ($ _mkdir /Directory/FAVORannotatorDataBase/_) 22 | 23 | ## 8. Then create a database server: 1. Click “My Interactive Sessions” at the top. 2. Click “Postgresql db” on the left. 3. 
Configure the server. 24 | ![My Interactive Sessions of postgreSQL](https://github.com/zhouhufeng/FAVORannotator/blob/main/Docs/Tutorial/Figures/postgreSQLdb.png) 25 | 26 | _Figure 2. My Interactive Sessions of postgreSQL._ 27 | 28 | ## 9. The configuration of the postgreSQL database server is shown in the following figure. In the example shown in Figure 3, we enter the folder directory in which we want postgreSQL to store the database, as well as the database name. 29 | ![postgreSQL configuration on VDI](https://github.com/zhouhufeng/FAVORannotator/blob/main/Docs/Tutorial/Figures/createDBinstance.png) 30 | 31 | _Figure 3. postgreSQL configuration on VDI._ 32 | 33 | ## 10. The postgreSQL database server is up and running a few minutes after its creation, as shown in the following figure. On that page you will find the assigned **host name** and **port number**. This information is needed for the FAVORannotator R program to find the database instance. In the example shown in Figure 4, the host name is holy7c04301 and the port number is 9011. 34 | 35 | ![Active running postgreSQL database](https://github.com/zhouhufeng/FAVORannotator/blob/main/Docs/Tutorial/Figures/runningInstance.png) 36 | 37 | _Figure 4. Active running postgreSQL database._ 38 | 39 | ## 11. Having configured and booted up postgreSQL through the VDI, we now know the following information about the running backend database: the DBName from step 9, the Host and Port from step 10, and the User and Password, which are your FASRC user name and password. This information is entered in the config.R file. 40 | 41 | 42 | ## 12. Once the config.R file is updated, FAVORannotator is ready to run if the database has already been imported. If this is the first time the database instance is booted up, import the database with the following commands. 43 | 44 | 45 | 46 | ## **Import Database into PostgreSQL and Run FAVORannotator** 47 | 48 | Once the PostgreSQL database is booted up and running, the backend database can be imported and FAVORannotator can be executed as follows. 49 | 50 | ### 1. Once the server is running, set up the database: 51 | 52 | 1) Load the postgres module 53 | 54 | i. On FASRC, the command is: _module load postgresql/12.2-fasrc01_ 55 | 56 | 2) Log into the database: psql -h hostname -p port -d databasename 57 | 58 | ii. e.g.: _psql -h holy2c14409 -p 8462 -d favor_ 59 | 60 | 3) Create the table 61 | 62 | iii. 
_CREATE TABLE MAIN( 63 | variant_vcf text, 64 | chromosome text, 65 | position integer, 66 | ref_vcf text, 67 | alt_vcf text, 68 | apc_conservation numeric, 69 | apc_conservation_v2 numeric, 70 | apc_epigenetics numeric, 71 | apc_epigenetics_active numeric, 72 | apc_epigenetics_repressed numeric, 73 | apc_epigenetics_transcription numeric, 74 | apc_local_nucleotide_diversity numeric, 75 | apc_local_nucleotide_diversity_v2 numeric, 76 | apc_local_nucleotide_diversity_v3 numeric, 77 | apc_mappability numeric, 78 | apc_micro_rna numeric, 79 | apc_mutation_density numeric, 80 | apc_protein_function numeric, 81 | apc_proximity_to_coding numeric, 82 | apc_proximity_to_coding_v2 numeric, 83 | apc_proximity_to_tsstes numeric, 84 | apc_transcription_factor numeric, 85 | cadd_phred numeric, 86 | cage text, 87 | fathmm_xf numeric, 88 | genecode_comprehensive_category text, 89 | genecode_comprehensive_info text, 90 | genecode_comprehensive_exonic_info text, 91 | genecode_comprehensive_exonic_category text, 92 | genehancer text, 93 | linsight numeric, 94 | metasvm_pred text, 95 | rdhs text, 96 | rsid text);_ 97 | 98 | iv. Load the data: _COPY main FROM '/path/to/offlineData.csv' CSV HEADER;_ (replace the path with the location of the downloaded CSV file). This command can take several hours to complete, up to a day. 99 | 100 | v. Create the index: _CREATE INDEX ON main USING HASH(variant\_vcf);_ This command can take several hours to complete, up to a day. 101 | 102 | vi. Create the view: _CREATE VIEW offline\_view AS SELECT \* FROM main_; 103 | 104 | ### 2. The PostgreSQL instance hosting the FAVORannotator backend database is now up and running, listening for queries from the FAVORannotator R program. 105 | ### 3. Update the config.R file with the PostgreSQL instance information (database name, port, host, user, password): 106 | 107 | • USER_G <- 'userID' 108 | • PASSWORD_G <- 'secretPassWord' 109 | • vcf.fn<-"/n/location/input.vcf" 110 | • gds.fn<-"/n/location/output.gds" 111 | • DBNAME_G <- 'favor' 112 | • HOST_G <- 'holy2c14409' 113 | • PORT_G <- 8462 114 | 115 | ### 4. First, create a GDS file from the input VCF file: 116 | • $ Rscript convertVCFtoGDS.r 117 | 118 | ### 5. Now FAVORannotator is ready to run using the following command: 119 | • $ Rscript FAVORannotatorGDS.r 120 | 121 | ### If using the FAVORannotator by chromosome version, import the database in the same way and run FAVORannotator exactly as above. The only difference is that config.R contains the instance information for all 22 chromosomes (VCF file, GDS file, database name, port, host, user, password). For many clusters, we also provide a submission script (submitJobs.sh) for submitting all 22 jobs to the cluster at the same time. For the by chromosome version, the R scripts take the chromosome number as an argument, and the commands above become the following. 122 | • $ Rscript convertVCFtoGDS.r 22 123 | • $ Rscript FAVORannotatorGDS.r 22 124 | ### To simplify the parallel computing process, we also provide an example submission script here ([submission.sh](https://github.com/zhouhufeng/FAVORannotator/blob/main/Scripts/ByChromosome/submitJobs.sh)). 125 | 126 | ## If you are interested in learning more about how to run FAVORannotator on the FASRC Slurm cluster, we have also prepared a recorded live demonstration here.
127 | [![Recorded Live Demo](https://github.com/zhouhufeng/FAVORannotator/blob/main/Docs/Tutorial/Figures/LiveDemo.png)](https://youtu.be/_FRQLsFY4qI) 128 | -------------------------------------------------------------------------------- /Docs/Tutorial/Demos/UKBB200KWESpreprocessVCF.md: -------------------------------------------------------------------------------- 1 | # **Step-by-step tutorial of turning raw VCFs into well-organized aGDS files** 2 | This is a tutorial for (1) preprocessing VCFs (2) generate GDS file from the preprocessed VCFs, with high priority in computing performance and speed. 3 | 4 | 5 | ### Preprocessing file using BCFtools. 6 | #### Prerequisites: 7 | **BCFTools** (version 1.16) Please install the **BCFTools**. 8 | 9 | #### Step 0: Check up for errors and inconsistencies. 10 | The following steps are important for the successful execution of BCFTools, the raw VCF files needs strictly follows the VCF format standard v4.2. 11 | 1. Fixed Headers [make sure all fields are defined in header].  12 | 2. Remove Duplicated VCFs [Make sure there is no duplicated VCF files]. Otherwise duplicated entries will cause issues for the following steps. 13 | 14 | Note: Most of the raw VCFs has issues with the header files that needs to be fixed, without this step BCFTools will not be able to process these VCF files. 15 | 16 | #### Step 1: Remove other FORMAT variables but only keep GT [multi-core]. 17 | ##### Script: 18 | - ```$ bcftools annotate -x ^FORMAT/GT ukb23156_c19_c12.vcf.gz -Oz -o ./CVCF/ukb23156_c19_c12.vcf.gz ``` 19 | ##### Input: All the raw VCF files ** ukb23156_c19_b0_v1.vcf.gz, ukb23156_c19_b2_v1.vcf.gz,..., ukb23156_c19_b64_v1.vcf.gz** 20 | ##### Output: The cleaned VCF files in the folder **./CVCF/** that within which has the same file name, but the FORMAT fields only contain GT. 21 | 22 | Note: This is computationally intensive, each smaller file is one multi-core instance, and multiple instances can be run in parallel to speed up the process. 23 | 24 | Finish 65 VCF processing in 12 core parallel, 130 mins 25 | 26 | Finish 65 VCF processing in 32 core parallel,49 mins 27 | 28 | #### Step 2: Break the multi-allelic sites into multiple rows of all the VCFs of each study. 29 | ##### Script: 30 | - ```$ bcftools norm -m -any ./ConcatVCF/ukb23156_c19_c12.vcf.gz -Oz -o ./ConcatVCF/ukb23156_c19_c12.bk.vcf.gz``` 31 | ##### Input: The VCF file contains multi-allelic sites ** ukb23156_c19_c12.vcf.gz ** 32 | ##### Output: The VCF file has multi-allelic sites break into multiple lines ** ukb23156_c19_c12.bk.vcf.gz**. 33 | 34 | Note: multi-allelic sites cause issues for the following analysis, we usually break them into multiple rows in the preprocessing steps. 35 | 36 | Finish 65 VCF processing in 12 core parallel, 33 mins 37 | 38 | Finish 65 VCF processing in 32 core parallel, 12 mins 39 | 40 | #### Step 3: Concat the smaller VCFs (sliced by variants) within each study into one VCF file. [Benchmark in UKBB 200k WES 24 mins] 41 | ##### Script: 42 | - ```$ bcftools concat --threads 12 ./CVCF/ukb23156_c19_b*_v1.vcf.gz -Oz -o ./ConcatVCF/ukb23156_c19_c12.vcf.gz``` 43 | ##### Input: All the cleaned VCF files in the folder **./CVCF/** with the VCF files name: ** ukb23156_c19_b0_v1.vcf.gz, ukb23156_c19_b2_v1.vcf.gz,..., ukb23156_c19_b64_v1.vcf.gz** 44 | ##### Output: The concatenated VCF files in the folder **./ConcatVCF/ukb23156_c19_c12.vcf.gz** that is the results of all the input VCFs concatenated by rows into one big VCF that has the same columns. 
45 | 46 | Note: This is computationally intensive, multi-core function enabled to speed up the process. Concat is only for VCFs has same samples [columns] just need to concat the variants [rows], if VCF is sliced by samples, you should refer to the following steps using the merge function. 47 | 48 | Finish concat 65 VCF processing in 12 core parallel,36 mins 49 | 50 | Finish concat 65 VCF processing in 32 core parallel,24 mins 51 | 52 | 53 | 54 | #### Step 4: Convert the merged VCFs per chromosomes into GDSs (per chromosome) [Benchmarked using UKBB 200k WES chr19 VCF takes 72 mins]. 55 | ##### Script: 56 | - ```$ Rscripts ./convertVCFtoGDS.r ./MergedVCF/ukbb.merged.bk.nm.vcf.gz ./MergedGDS/ukbb.merged.bk.nm.gds``` 57 | Script: **convertVCFtoGDS.r** 58 | ##### Input: The preprocessed VCF file,**ukbb.merged.bk.nm.vcf.gz**. 59 | ##### Output: The generated GDS file **ukbb.merged.bk.nm.gds**. 60 | 61 | Note: This is computationally intensive multi-core option enabled, by default it is 12 core,parallel=10, users can modify based on computing platforms. Since this multi-core convertVCFtoGDS.r involes 3 steps: (1)count variants, (2)generate smaller GDS intermediate files, (3)merge intermediate files into one GDS. Only Step (2) is running in parallel R sessions. Therefore, small VCF file (<10GB) will not see significant computing time reduce with too many cores(>10 cores). We recommend, small VCF file (<10GB) parallel=6, medium VCF file (10GB~50GB) parallel=12,large VCF file (>50GB) parallel=32. 62 | 63 | Finish VCF to GDS processing in 12 core parallel, 90 mins 64 | 65 | Finish VCF to GDS processing in 32 core parallel, 72 mins 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | #### Step additional: Index VCFs. 74 | ##### Script: 75 | - ```$ bcftools index ./ConcatVCF/ukb23156_c19_c12.bk.nm.vcf.gz``` 76 | ##### Input: The VCF file needs index ** ukb23156_c19_c12.bk.nm.vcf.gz ** 77 | ##### Output: The VCF file index file ** ukb23156_c19_c12.bk.nm.vcf.gz.csi ** 78 | 79 | Note: Many processes needs indexed VCFs, e.g. view range, merge, etc. 80 | 81 | #### Step additional: Normalize (left) the broken multi-allelic VCFs. 82 | ##### Script: 83 | - ```$ bcftools norm -f --threads 12 hg38.p13.fa ./ConcatVCF/ukb23156_c19_c12.bk.vcf.gz -Oz -o ./ConcatVCF/ukb23156_c19_c12.bk.nm.vcf.gz``` 84 | ##### Input: The VCF file has multi-allelic sites break into multiple lines ** ukb23156_c19_c12.bk.vcf.gz**, and the reference genome fasta file **hg38.p13.fa**. 85 | ##### Output: The left-normalized VCF file has multi-allelic sites break into multiple lines ** ukb23156_c19_c12.bk.nm.vcf.gz**. 86 | 87 | Note: left-normalization is critical for the indels to have the correct and most commonly accepted formats and representation. 88 | 89 | 90 | -------------------------------------------------------------------------------- /Docs/Tutorial/Demos/preprocessVCF.md: -------------------------------------------------------------------------------- 1 | # **Step-by-step tutorial of turning raw VCFs into well-organized aGDS files** 2 | This is a tutorial for (1) preprocessing VCFs (2) generate GDS file from the preprocessed VCFs, with high priority in computing performance and speed. 3 | 4 | 5 | ### Preprocessing file using BCFtools. 6 | #### Prerequisites: 7 | **BCFTools** (version 1.16) Please install the **BCFTools**. 8 | 9 | #### Step 0: Check up for errors and inconsistencies. 10 | The following steps are important for the successful execution of BCFTools, the raw VCF files needs strictly follows the VCF format standard v4.2. 11 | 1. 
Fixed Headers [make sure all fields are defined in the header]. 12 | 2. Remove Duplicated VCFs [make sure there are no duplicated VCF files]. Otherwise, duplicated entries will cause issues in the following steps. 13 | 14 | Note: Most raw VCFs have header issues that need to be fixed; without this step, BCFTools will not be able to process these VCF files. 15 | 16 | #### Step 1: Remove all FORMAT fields except GT [multi-core]. 17 | ##### Script: 18 | - ```$ for fl in ukb23156_c19_b*_v1.vcf.gz; do bcftools annotate -x ^FORMAT/GT $fl --threads 12 -Oz -o ./CVCF/$fl & done``` 19 | ##### Input: All the raw VCF files **ukb23156_c19_b0_v1.vcf.gz, ukb23156_c19_b2_v1.vcf.gz, ..., ukb23156_c19_b64_v1.vcf.gz** 20 | ##### Output: The cleaned VCF files in the folder **./CVCF/**, with the same file names but with GT as the only FORMAT field. 21 | 22 | Note: This is computationally intensive; each smaller file is one multi-core instance, and multiple instances can be run in parallel to speed up the process. 23 | 24 | #### Step 2: Concatenate the smaller VCFs (sliced by variants) within each study into one VCF file. [Benchmark in UKBB 200k WES: 24 mins] 25 | ##### Script: 26 | - ```$ bcftools concat --threads 12 ./CVCF/ukb23156_c19_b*_v1.vcf.gz -Oz -o ./ConcatVCF/ukb23156_c19_c12.vcf.gz``` 27 | ##### Input: All the cleaned VCF files in the folder **./CVCF/**, named **ukb23156_c19_b0_v1.vcf.gz, ukb23156_c19_b2_v1.vcf.gz, ..., ukb23156_c19_b64_v1.vcf.gz** 28 | ##### Output: The concatenated VCF file **./ConcatVCF/ukb23156_c19_c12.vcf.gz**, which is the result of concatenating all the input VCFs by rows into one big VCF with the same columns. 29 | 30 | Note: This is computationally intensive; the multi-core option is enabled to speed up the process. Concat is only for VCFs that share the same samples [columns] and just need their variants [rows] concatenated; if the VCF is sliced by samples, refer to the later step using the merge function. 31 | 32 | #### Step 3: Break the multi-allelic sites into multiple rows in all the VCFs of each study. 33 | ##### Script: 34 | - ```$ bcftools norm -m -any --threads 12 ./ConcatVCF/ukb23156_c19_c12.vcf.gz -Oz -o ./ConcatVCF/ukb23156_c19_c12.bk.vcf.gz``` 35 | ##### Input: The VCF file containing multi-allelic sites **ukb23156_c19_c12.vcf.gz** 36 | ##### Output: The VCF file with multi-allelic sites broken into multiple lines **ukb23156_c19_c12.bk.vcf.gz**. 37 | 38 | Note: Multi-allelic sites cause issues in downstream analysis, so we usually break them into multiple rows during preprocessing. 39 | 40 | #### Step 4: Left-normalize the broken multi-allelic VCFs. 41 | ##### Script: 42 | - ```$ bcftools norm -f hg38.p13.fa --threads 12 ./ConcatVCF/ukb23156_c19_c12.bk.vcf.gz -Oz -o ./ConcatVCF/ukb23156_c19_c12.bk.nm.vcf.gz``` 43 | ##### Input: The VCF file with multi-allelic sites broken into multiple lines **ukb23156_c19_c12.bk.vcf.gz**, and the reference genome FASTA file **hg38.p13.fa**. 44 | ##### Output: The left-normalized VCF file with multi-allelic sites broken into multiple lines **ukb23156_c19_c12.bk.nm.vcf.gz**. 45 | 46 | Note: Left-normalization is critical for indels to have the correct and most commonly accepted format and representation. 47 | 48 | #### Step 5: Index VCFs.
49 | ##### Script: 50 | - ```$ bcftools index ./ConcatVCF/ukb23156_c19_c12.bk.nm.vcf.gz``` 51 | ##### Input: The VCF file to be indexed **ukb23156_c19_c12.bk.nm.vcf.gz** 52 | ##### Output: The VCF index file **ukb23156_c19_c12.bk.nm.vcf.gz.csi** 53 | 54 | Note: Many operations need indexed VCFs, e.g. viewing a range, merging, etc. 55 | 56 | #### Step 6: Slice the normalized VCFs by chromosome [if needed]. 57 | ##### Script: 58 | - ```$ bcftools view -r chr19 ./ConcatVCF/ukb23156_c12.bk.nm.vcf.gz -Oz -o ./ConcatVCF/ukb23156_c19_c12.bk.nm.vcf.gz``` 59 | ##### Input: The VCF file containing all chromosomes **ukb23156_c12.bk.nm.vcf.gz** 60 | ##### Output: The VCF file containing only chr19 (the chosen chromosome or range) **ukb23156_c19_c12.bk.nm.vcf.gz**. 61 | 62 | Note: Sliced VCFs have many computing performance advantages. 63 | 64 | #### Step 7: Merge the normalized VCFs (sliced by different samples) of each study into one VCF (per chromosome). 65 | ##### Script: 66 | - ```$ bcftools merge -m all --threads 6 ./DifferentStudies/ukbb*.bk.nm.vcf.gz -Oz -o ./MergedVCF/ukbb.merged.bk.nm.vcf.gz``` 67 | ##### Input: VCF files that have the same set of variants (rows) but different samples (columns) **/DifferentStudies/ukbb*.bk.nm.vcf.gz** 68 | ##### Output: One big VCF file with the same set of variants (rows) and all the samples (columns) **./MergedVCF/ukbb.merged.bk.nm.vcf.gz**. 69 | 70 | Note: This is computationally intensive; the multi-core option is enabled. The merge function is only for VCFs with the same set of variants (rows), merging different samples (columns) together. 71 | 72 | #### Step 8: Convert the merged VCFs into GDS files (per chromosome) [Benchmark: the UKBB 200k WES chr19 VCF takes 72 mins]. 73 | ##### Script: 74 | - ```$ Rscript ./convertVCFtoGDS.r ./MergedVCF/ukbb.merged.bk.nm.vcf.gz ./MergedGDS/ukbb.merged.bk.nm.gds``` 75 | Script: **convertVCFtoGDS.r** 76 | ##### Input: The preprocessed VCF file, **ukbb.merged.bk.nm.vcf.gz**. 77 | ##### Output: The generated GDS file **ukbb.merged.bk.nm.gds**. 78 | 79 | Note: This is computationally intensive; the multi-core option is enabled, and by default it uses 12 cores (parallel=10); users can modify this based on their computing platform. This multi-core convertVCFtoGDS.r involves 3 steps: (1) count variants, (2) generate smaller intermediate GDS files, (3) merge the intermediate files into one GDS. Only step (2) runs in parallel R sessions, so a small VCF file (<10GB) will not see a significant reduction in computing time from using many cores (>10). We recommend parallel=6 for small VCF files (<10GB), parallel=12 for medium VCF files (10GB~50GB), and parallel=32 for large VCF files (>50GB). A combined shell sketch chaining the preprocessing steps above is shown below.
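For convenience, the per-step commands above can be chained into a single shell script per chromosome. The sketch below is illustrative only: it assumes the example file names, thread counts, and the hg38.p13.fa reference used in the steps above, and it is not a script shipped with FAVORannotator; adapt the paths and settings to your own data layout.

```bash
#!/bin/bash
# Illustrative sketch: chains Steps 2-5 and Step 8 above for one chromosome.
# File names, thread counts, and the reference FASTA are example placeholders.
set -euo pipefail

THREADS=12
REF=hg38.p13.fa            # reference FASTA used for left-normalization (Step 4)
PREFIX=ukb23156_c19_c12    # example per-chromosome file prefix from the steps above

mkdir -p ./ConcatVCF ./MergedGDS

# Step 2: concatenate per-batch VCFs (same samples, different variant slices)
bcftools concat --threads "$THREADS" ./CVCF/ukb23156_c19_b*_v1.vcf.gz -Oz -o ./ConcatVCF/"$PREFIX".vcf.gz

# Step 3: split multi-allelic sites into separate rows
bcftools norm -m -any --threads "$THREADS" ./ConcatVCF/"$PREFIX".vcf.gz -Oz -o ./ConcatVCF/"$PREFIX".bk.vcf.gz

# Step 4: left-normalize indels against the reference genome
bcftools norm -f "$REF" --threads "$THREADS" ./ConcatVCF/"$PREFIX".bk.vcf.gz -Oz -o ./ConcatVCF/"$PREFIX".bk.nm.vcf.gz

# Step 5: index the normalized VCF
bcftools index ./ConcatVCF/"$PREFIX".bk.nm.vcf.gz

# Step 8: convert the final VCF into a GDS file
Rscript ./convertVCFtoGDS.r ./ConcatVCF/"$PREFIX".bk.nm.vcf.gz ./MergedGDS/"$PREFIX".gds
```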
81 | 82 | -------------------------------------------------------------------------------- /Docs/Tutorial/Detailed-Explanation/FAVORFullDB.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Docs/Tutorial/Detailed-Explanation/FAVORFullDB.xlsx -------------------------------------------------------------------------------- /Docs/Tutorial/Figures/FASRC1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Docs/Tutorial/Figures/FASRC1.jpg -------------------------------------------------------------------------------- /Docs/Tutorial/Figures/FAVORannotatorOnTerra.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Docs/Tutorial/Figures/FAVORannotatorOnTerra.png -------------------------------------------------------------------------------- /Docs/Tutorial/Figures/Figure2A.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Docs/Tutorial/Figures/Figure2A.png -------------------------------------------------------------------------------- /Docs/Tutorial/Figures/Figure2B.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Docs/Tutorial/Figures/Figure2B.png -------------------------------------------------------------------------------- /Docs/Tutorial/Figures/Figure2C.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Docs/Tutorial/Figures/Figure2C.png -------------------------------------------------------------------------------- /Docs/Tutorial/Figures/HarvardDataVerse.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Docs/Tutorial/Figures/HarvardDataVerse.png -------------------------------------------------------------------------------- /Docs/Tutorial/Figures/LiveDemo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Docs/Tutorial/Figures/LiveDemo.png -------------------------------------------------------------------------------- /Docs/Tutorial/Figures/createDBinstance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Docs/Tutorial/Figures/createDBinstance.png -------------------------------------------------------------------------------- /Docs/Tutorial/Figures/figure1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Docs/Tutorial/Figures/figure1.png -------------------------------------------------------------------------------- 
/Docs/Tutorial/Figures/figure4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Docs/Tutorial/Figures/figure4.png -------------------------------------------------------------------------------- /Docs/Tutorial/Figures/postgreSQLdb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Docs/Tutorial/Figures/postgreSQLdb.png -------------------------------------------------------------------------------- /Docs/Tutorial/Figures/runningInstance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Docs/Tutorial/Figures/runningInstance.png -------------------------------------------------------------------------------- /Docs/Tutorial/Figures/versions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Docs/Tutorial/Figures/versions.png -------------------------------------------------------------------------------- /Docs/Tutorial/Figures/versions1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Docs/Tutorial/Figures/versions1.png -------------------------------------------------------------------------------- /Docs/Tutorial/Tables/table 1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Docs/Tutorial/Tables/table 1.png -------------------------------------------------------------------------------- /Docs/Tutorial/Tables/table1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Docs/Tutorial/Tables/table1.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0) 2 | 3 | # **FAVORannotator** 4 | FAVORannotator is an R program for performing functional annotation of any genetic study (e.g. Whole-Genome/Whole-Exome Sequencing/Genome-Wide Association Studies) using the [FAVOR backend database](https://favor.genohub.org) to create an annotated Genomic Data Structure (aGDS) file by storing the genotype data (in VCF or GDS format) and their functional annotation data in an all-in-one file. 5 | 6 | **For generating GDS/aGDS from raw VCF files, please refer to the detailed tutorial [here](https://github.com/zhouhufeng/FAVORannotator/blob/main/Docs/Tutorial/Demos/preprocessVCF.md).** 7 | 8 | ## 1.Introduction 9 | 10 | FAVORannotator is an open-source pipeline for functionally annotating and efficiently storing the genotype and variant functional annotation data of any genetic study (e.g. GWAS/WES/WGS). Functional annotation data is stored alongside with genotype data in an all-in-one aGDS file, through using the FAVORannotator. 
It then facilitates a wide range of functionally-informed downstream analyses (Figure 1). 11 | 12 | FAVORannotator first converts a genotype VCF input file to a GDS file, searches the variants in the GDS file against the FAVOR database for their functional annotations, and then integrates these annotations into the GDS file to create an aGDS file. This aGDS file allows both genotype and functional annotation data to be stored in a single unified file (Figure 1). Furthermore, FAVORannotator can be conveniently integrated into [STAARpipeline](https://github.com/xihaoli/STAARpipeline), a rare variant association analysis tool, to perform association analysis of large-scale WGS/WES studies. 13 | 14 | ![FAVORannotator workflow](https://github.com/zhouhufeng/FAVORannotator/blob/main/Docs/Tutorial/Figures/figure1.png) 15 | 16 | _Figure 1. FAVORannotator workflow._ 17 | 18 | ## 2. FAVORannotator different versions (SQL, CSV and Cloud Versions) 19 | 20 | There are three main versions of FAVORannotator: **SQL**, **CSV** and **Cloud**. 21 | 22 | All versions of FAVORannotator require the same set of R libraries. The SQL version additionally requires a PostgreSQL installation, while the CSV and Cloud versions require the xsv software dependency. 23 | 24 | All FAVORannotator versions produce identical results and have similar performance; they differ only in the computing environments where FAVORannotator is deployed. Users can choose the version of FAVORannotator that suits their computing platform and use case. 25 | 26 | FAVORannotator accomplishes both high query speed and storage efficiency due to its optimized configurations and indices. Its offline nature avoids the excessive waiting time and file size restrictions of the FAVOR online service. 27 | 28 | ### 2.1 FAVORannotator SQL version 29 | 30 | It is important to note that the FAVORannotator SQL version PostgreSQL database differs from other storage in that it needs to be running in order to be accessed. Thus, users must ensure the database is running before running annotations. 31 | 32 | Once the FAVORannotator database is booted up and running, the following connection information must be specified for the FAVORannotator R program to access the database: DBName, Host, Port, User, and Password. 33 | 34 | This specialized database setup ensures high query speed. The figure below shows the features described above. 35 | 36 | ![FAVORannotator SQL version Tech Features](https://github.com/zhouhufeng/FAVORannotator/blob/main/Docs/Tutorial/Figures/Figure2A.png) 37 | 38 | _Figure 2. FAVORannotator SQL version workflow and differences highlights._ 39 | 40 | ### 2.2 FAVORannotator CSV version 41 | 42 | The FAVORannotator CSV version database adopts a similar strategy of slicing both the database and the query inputs into smaller pieces and creating an index for each of the smaller database chunks, so as to achieve the same high performance and fast query speed as the SQL version. 43 | 44 | Unlike the SQL version, the CSV version database is static and queries rely on the xsv software, so there is no need to ensure a database server is running before running annotations. Because the static CSV database is accessed through xsv rather than by supplying the connection details of a running postgreSQL instance, it widens the applicability of FAVORannotator to computing platforms that do not support a postgreSQL installation.
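To make the chunked lookup concrete, the shell sketch below mirrors the kind of xsv calls the FAVORannotator CSV scripts issue internally (via `system()` in R): the query variants are written out as a one-column CSV of chr-pos-ref-alt identifiers, joined against each sliced database chunk on its `variant_vcf` column, and the per-chunk results are stacked back together. The chunk and output file names here are illustrative placeholders, not fixed file names required by FAVORannotator.

```bash
# Illustrative sketch of the chunked xsv lookup used by the CSV version (chr22 example).
# VarInfo_chr22_1.csv / VarInfo_chr22_2.csv hold query variants (header: VarInfo, one
# "chr-pos-ref-alt" ID per row); chr22_1.csv / chr22_2.csv are sliced FAVOR database chunks.
xsv join --left VarInfo VarInfo_chr22_1.csv variant_vcf chr22_1.csv > Anno_chr22_1.csv
xsv join --left VarInfo VarInfo_chr22_2.csv variant_vcf chr22_2.csv > Anno_chr22_2.csv

# Stack the per-chunk results into one annotation table for the chromosome
xsv cat rows Anno_chr22_1.csv Anno_chr22_2.csv > Anno_chr22.csv

# Keep only the annotation columns needed downstream (column indices are illustrative)
xsv select 1,8,9,10,11,12 Anno_chr22.csv > Anno_chr22_selected.csv
```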
45 | 46 | ![FAVORannotator CSV version Tech Features](https://github.com/zhouhufeng/FAVORannotator/blob/main/Docs/Tutorial/Figures/Figure2B.png) 47 | 48 | _Figure 3. FAVORannotator CSV version workflow and differences highlights._ 49 | 50 | ### 2.3 FAVORannotator Cloud version 51 | 52 | The FAVORannotator Cloud version is developed based on the CSV version (no pre-installed database) and adopts the same strategy of slicing both the database and the query inputs into smaller pieces and creating an index for each of the smaller database chunks, achieving the same high performance and fast query speed as the SQL/CSV versions. In addition, the Cloud version downloads the FAVOR databases (Full Database or Essential Database) on the fly, so no FAVOR database needs to be pre-installed on the computing platform. 53 | 54 | The Cloud version database is downloaded from [FAVOR on Harvard Dataverse](https://dataverse.harvard.edu/dataverse/favor) when FAVORannotator is executed, and after the download finishes the database is decompressed. The downloaded database is the CSV version, which is static; queries rely on the xsv software, so only minimal dependencies are needed and no database management system has to be kept running. 55 | 56 | ![FAVORannotator Cloud version Tech Features](https://github.com/zhouhufeng/FAVORannotator/blob/main/Docs/Tutorial/Figures/Figure2C.png) 57 | 58 | _Figure 4. FAVORannotator Cloud version workflow and differences highlights._ 59 | 60 | 61 | ## 3. Obtain the FAVOR Database 62 | ### 3.1 Obtain the database through direct downloading 63 | 1. Download the FAVORannotator data file from here ([download URL](http://favor.genohub.org), under the "FAVORannotator" tab). 64 | 2. Decompress the downloaded data. 65 | 3. Move the decompressed database to its location and update the location info in ```config.R```. 66 | 67 | ### 3.2 FAVOR databases hosted on Harvard Dataverse 68 | The FAVOR databases (Essential Database and Full Database) are hosted on the [Harvard Dataverse](https://dataverse.harvard.edu/dataverse/favor). 69 | 70 | 71 | ![FAVOR Databases on Harvard Dataverse](https://github.com/zhouhufeng/FAVORannotator/blob/main/Docs/Tutorial/Figures/HarvardDataVerse.png) 72 | 73 | _Figure 5. FAVOR Databases on Harvard Dataverse (both Essential Database and Full Database)._ 74 | 75 | 76 | ### 3.3 FAVOR Essential Database 77 | The [FAVOR Essential Database](https://doi.org/10.7910/DVN/1VGTJI) contains 20 essential annotation scores. It is comprised of a collection of essential annotation scores for all possible SNVs (8,812,917,339) and observed indels (79,997,898) in Build GRCh38/hg38. 78 | 79 | ### 3.4 FAVOR Full Database 80 | The [FAVOR Full Database](https://doi.org/10.7910/DVN/KFUBKG) contains 160 annotation scores. It is comprised of a collection of full annotation scores for all possible SNVs (8,812,917,339) and observed indels (79,997,898) in Build GRCh38/hg38. 81 | 82 | 83 | ## 4. Resource requirements 84 | 85 | The resources utilized by the FAVORannotator R program and PostgreSQL instance are largely dependent upon the size of the input variant set. 86 | 87 | For both the SQL and CSV versions of FAVORannotator, WGS variant sets from 60,000 samples were tested. The whole functional annotation finished in parallel in 1 hour using 24 computing cores (Intel Cascade Lake at 2.9 GHz). The memory consumed by each instance varies (usually within 18 GB), as different chromosomes carry different numbers of variants. 88 | 89 | ## 5.
Resource requirements 90 | 91 | The resources utilized by the FAVORannotator R program and PostgreSQL instance are largely dependent upon the size of the input variant set. 92 | 93 | For both the SQL and CSV versions of FAVORannotator, WGS variant sets from 60,000 samples were tested. The whole functional annotation finished in parallel in 1 hour using 24 computing cores (Intel Cascade Lake at 2.9 GHz). The memory consumed by each instance varies (usually within 18 GB), as different chromosomes carry different numbers of variants. 94 | 95 | 96 | 97 | ## 6. How to Use FAVORannotator 98 | 99 | ### 6.1 SQL/CSV versions 100 | 101 | Installing and running FAVORannotator to perform functional annotation requires only 2 major steps: 102 | 103 | **I. Install software dependencies and prepare the database (process varies between systems).** 104 | 105 | **II. Run FAVORannotator (CSV or SQL version).** 106 | 107 | The first step depends on whether the SQL or CSV version of FAVORannotator is used, and on the computing platform. The following sections detail the process for major platforms. The second step (running FAVORannotator) is described first, as it is consistent across platforms. 108 | 109 | 110 | ### 6.2 No pre-install databases version 111 | There are some use cases where downloading and configuring the database can be difficult. We therefore simplify FAVORannotator by folding the database download, decompression, and config.R updates (database location and output location) into FAVORannotator itself (the no pre-install database version); users only need to put the R scripts into a directory with enough storage and run the program. 112 | 113 | **I. Install software dependencies.** 114 | 115 | **II. Run FAVORannotator (no pre-install database version).** 116 | 117 | ### 6.3 Cloud version 118 | Based on the FAVORannotator no pre-install database version, we developed the FAVORannotator cloud-native app, which runs on cloud platforms like Terra and DNAnexus, or on virtual machines of Google Cloud Platform (GCP), Amazon Web Services (AWS), and Microsoft Azure. With dockerized images and workflow languages, FAVORannotator can be executed through a user-friendly, drag-and-drop graphical interface, with no scripting or programming skills required from the users. 119 | 120 | 121 | ![FAVORannotator Versions](https://github.com/zhouhufeng/FAVORannotator/blob/main/Docs/Tutorial/Figures/versions.png) 122 | 123 | _Figure 6. FAVORannotator Different Versions._ 124 | 125 | 126 | ## 7. SQL version 127 | ### 7.1 Run FAVORannotator SQL version 128 | 129 | Once PostgreSQL is running, the database can be imported and FAVORannotator can be executed as follows. Please find the R scripts in the ```Scripts/SQL/``` folder. 130 | 131 | **Important: Before running the FAVORannotator SQL version, please update the file locations and database info in the ```config.R``` file. FAVORannotator relies on these file locations and database info for the annotation.** 132 | 133 | 1. Create a GDS file from the input VCF file: 134 | 135 | - ``` $ Rscript convertVCFtoGDS.r chrnumber ``` 136 | 137 | 2. Run FAVORannotator: 138 | 139 | - ``` $ Rscript FAVORannotatorv2aGDS.r chrnumber ``` 140 | 141 | chrnumber is the number indicating which chromosome the database is read for; chrnumber can be 1, 2, ..., 22. 142 | 143 | Scripts for submitting jobs for all chromosomes simultaneously have been provided.
They use SLURM, which is supported by many high-performance clusters, and utilize parallel jobs to boost performance. 144 | 145 | A SLURM script to simplify the process can be found here: ([submission.sh](https://github.com/zhouhufeng/FAVORannotator/blob/main/Scripts/SQL/submitJobs.sh)). 146 | 147 | ### 7.2 Install and prepare the database for SQL version 148 | 149 | The FAVORannotator SQL version relies upon the PostgreSQL Database Management System (DBMS). PostgreSQL is a free and open-source application which emphasizes extensibility and SQL compliance. It is a highly stable DBMS, backed by more than 20 years of community development. PostgreSQL is used to manage data for many web, mobile, geospatial, and analytics applications. Its advanced features, including diverse index types and configuration options, have been carefully selected for FAVORannotator so that end users do not need to worry about the implementation. 150 | 151 | How to use FAVORannotator will be explained from the following steps. PostgreSQL is available in most platforms. Each of these platforms has a different process for installing software, which affects the first step of installing FAVORannotator. 152 | 153 | Once PostgreSQL is running, the database can be imported and FAVORannotator can be executed as follows: 154 | 155 | 1. Once the server is running, Load the database: ```$ psql -h hostname -p port_number -U username -f your_file.sql databasename ``` 156 | 157 | e.g. ```$ psql -h c02510 -p 582  -f /n/SQL/ByChr7FAVORDBxO.sql Chr7``` 158 | 159 | 2. Now the PostgreSQL hosting FAVORannotator backend database is up and running it is listening for the query from FAVORannotator R program. 160 | 161 | 3. Update the config.R file with the PostgreSQL instance information (database name, port, host, user, password): 162 | 163 | ### 7.3 Install PostgreSQL (FAVORannotator SQL version) 164 | 165 | The following steps have been written for major computing environments in order to best account for all possibilities. The following steps are for the widely used operating system (Ubuntu) on a virtual machine. 166 | 167 | 1. Install the required software: 168 | - ```$ sudo apt install postgresql postgresql-contrib``` 169 | 2. Start and run PostgreSQL: 170 | - ```$ sudo -i -u postgres``` 171 | - ```$ psql``` 172 | 173 | 3. [Optional] For installing the database on external storage (Edit the configuration file): 174 | - The file is located at ```/etc/postgresql/12/main/postgresql.conf``` 175 | - Change the line in file “postgresql.conf”, data_directory = 'new directory of external storage' 176 | - Reboot the data directory, ```$ sudo systemctl start postgresql``` 177 | 178 | 179 | **For more detailed instructions on how to use FAVORannotator (SQL version) on the Harvard FASRC Slurm Cluster, please refer to the detailed tutorial [here](https://github.com/zhouhufeng/FAVORannotator/blob/main/Docs/Tutorial/Demos/FASRC.md).** 180 | 181 | 182 | ## 8. CSV version 183 | 184 | ### 8.1 Run FAVORannotator CSV version 185 | 186 | Once CSV database is downloaded and decompressed, the database is readable by FAVORannotator can be executed as follows. Please find the R scripts in the ```Scripts/CSV/``` folder. 187 | 188 | **Important: Before run FAVORannotator CSV version, please update the file locations and database info on the ```config.R``` file. FAVORannotator relies on the file locations and database info for the annotation.** 189 | 190 | 1. 
Create GDS file from the input VCF file: 191 | 192 | - ``` $ Rscript convertVCFtoGDS.r chrnumber ``` 193 | 194 | 2. Run FAVORannotator: 195 | 196 | - ``` $ Rscript FAVORannotatorv2aGDS.r chrnumber ``` 197 | 198 | Scripts for submitting jobs for all chromosomes simultaneously have been provided. They use SLURM, which is supported by many high-performance clusters, and utilize parallel jobs to boost performance. 199 | 200 | A SLURM script to simplify the process can be found here: ([submission.sh](https://github.com/zhouhufeng/FAVORannotator/blob/main/Scripts/SQL/submitJobs.sh)). 201 | 202 | chrnumber are the numeric number indicating which chromosome this database is reading from, chrnumber can be 1, 2, ..., 22. 203 | 204 | ### 8.2 Install and prepare the database for CSV version 205 | 206 | **FAVORannotator** (CSV version) depends on the **xsv software** and the **FAVOR database** in CSV format. Please install the **xsv software** and 207 | download the **FAVOR database** CSV files (under the "FAVORannotator" tab) before using **FAVORannotator** (CSV version). 208 | 209 | ### 8.3 Install xsv (FAVORannotator CSV version) 210 | 211 | The following steps have been written for major computing environments in order to best account for all possibilities. The following steps are for the widely used operating system (Ubuntu) on a virtual machine. 212 | 213 | 1. Install Rust and Cargo: 214 | - ```$ curl https://sh.rustup.rs -sSf | sh``` 215 | 2. Source the environment: 216 | - ```$ source $HOME/.cargo/env``` 217 | 3. Install xsv using Cargo: 218 | - ```$ cargo install xsv``` 219 | 220 | 221 | 222 | ## 9 No pre-install databases version 223 | 224 | ### 9.1 Install xsv (No need to pre-install database but xsv need to be installed) 225 | 226 | The following steps have been written for major computing environments in order to best account for all possibilities. The following steps are for the widely used operating system (Ubuntu) on a virtual machine. 227 | 228 | 1. Install Rust and Cargo: 229 | - ```$ curl https://sh.rustup.rs -sSf | sh``` 230 | 2. Source the environment: 231 | - ```$ source $HOME/.cargo/env``` 232 | 3. Install xsv using Cargo: 233 | - ```$ cargo install xsv``` 234 | 235 | 236 | 237 | ### 9.2 Run FAVORannotator no pre-install databases version 238 | 239 | FAVOR database can be downloaded on the fly and decompressed automatically in the scripts, this version of FAVORannotator will remove the burden of download the backend database and update the ```config.R```. The database is downloaded and decompressed automatically and is readable by FAVORannotator can be executed as follows. 240 | 241 | Please find the R scripts in the ```Scripts/SQL/``` folder. 242 | 243 | **Important: This version of FAVORannotator no pre-install version does not need to update ```config.R``` file. This version of FAVORannotator directly download FAVORdatabase (Full or Essential versions) from the Harvard Dataverse to the default file locations and database info for the annotation. Just put the FAVORannotator script in the directory with ample storage all the database and index and intermediate files will be generated in the directory.** 244 | 245 | 1. Create GDS file from the input VCF file: 246 | 247 | - ``` $ Rscript convertVCFtoGDS.r input.vcf output.gds ``` 248 | 249 | 2. Run FAVORannotator for the FAVOR Essential Database: 250 | 251 | - ``` $ Rscript FAVORannotatorCSVEssentialDB.R output.gds chrnumber ``` 252 | 253 | 3. 
Run FAVORannotator for the FAVOR Full Database: 254 | 255 | - ``` $ Rscript FAVORannotatorCSVFullDB.R output.gds chrnumber ``` 256 | 257 | chrnumber are the numeric number indicating which chromosome this database is reading from, chrnumber can be 1, 2, ..., 22. 258 | 259 | Scripts for submitting jobs for all chromosomes simultaneously have been provided. They use SLURM, which is supported by many high-performance clusters, and utilize parallel jobs to boost performance. 260 | 261 | A SLURM script to simplify the process can be found here: ([submission.sh](https://github.com/zhouhufeng/FAVORannotator/blob/main/Scripts/SQL/submitJobs.sh)). 262 | 263 | 264 | ## 10. Cloud Version 265 | ### 10.1 Run FAVORannotator Cloud Version 266 | 267 | For Cloud environment, we simplified the process of database set up and remove the configration files. FAVOR database can be downloaded on the fly and decompressed automatically in the scripts, this version of FAVORannotator will remove the burden of download the backend database and update the ```config.R```. The database is downloaded and decompressed automatically and is capable of seamless integration to the workflow languages of the cloud platform. It currently works for cloud platforms like Terra, DNAnexus, etc. This tutorial uses Terra as an example to illustrate the functional annotation process. 268 | 269 | Please find the R scripts in the ```Scripts/Cloud/``` folder. 270 | 271 | **Important: This version of FAVORannotator based on the no pre-install version does not need ```config.R``` file. This version of FAVORannotator directly download FAVORdatabase (Full or Essential versions) from the Harvard Dataverse to the default file locations and database info for the annotation. Just put the FAVORannotator script in the directory with ample storage all the database and index and intermediate files will be generated in the directory. These database files and intermediate files in the working directories will be removed in most cloud platforms.** 272 | 273 | 1. Create GDS file from the input VCF file: 274 | 275 | - ``` $ Rscript convertVCFtoGDS.r input.vcf output.gds ``` 276 | 277 | 2.1 Run FAVORannotator for the FAVOR Essential Database: 278 | 279 | - ``` $ Rscript FAVORannotatorTerraEssentialDB.R output.gds chrnumber ``` 280 | 281 | 2.2. Run FAVORannotator for the FAVOR Essential Database workflow: 282 | 283 | - ``` $ java -jar cromwell-30.2.jar run FAVORannotatorEssentialDB.wdl --inputs file.json ``` 284 | 285 | 286 | 3.1 Run FAVORannotator for the FAVOR Full Database: 287 | 288 | - ``` $ Rscript FAVORannotatorTerraEssentialDB.R output.gds chrnumber ``` 289 | 290 | chrnumber are the numeric number indicating which chromosome this database is reading from, chrnumber can be 1, 2, ..., 22. 291 | 292 | 3.2. Run FAVORannotator for the FAVOR Full Database workflow: 293 | 294 | - ``` $ java -jar cromwell-30.2.jar run FAVORannotatorFullDB.wdl --inputs file.json ``` 295 | 296 | 297 | 298 | 299 | ![FAVORannotator Cloud Version](https://github.com/zhouhufeng/FAVORannotator/blob/main/Docs/Tutorial/Figures/FAVORannotatorOnTerra.png) 300 | 301 | _Figure 7. FAVORannotator Cloud Native Workflow on Terra._ 302 | 303 | ## 11. Other Functions and Utilities 304 | 305 | ### 11.1 Convert VCF to aGDS 306 | 307 | The following functions have been written for the purpose of converting VCF files to GDS/aGDS files. Please find the R scripts in the ```Scripts/UTL/``` folder. 308 | 309 | 1. 
If users wish to convert VCF files that only contain genotype data into GDS files for the following annoation process: 310 | - ```$ Rscript convertVCFtoGDS.r input.vcf output.agds``` 311 | 312 | 313 | 2. If users wish to convert Variant List that does not contain genotype data into GDS files for the following annoation process, after formatting the varaint list into the same VCF format, following R scripts can generate the empty GDS file that do not have genotype data just the varaint info: 314 | - ```$ Rscript convertVCFtoGDS.r inputVariantList.vcf output.agds``` 315 | 316 | 3. If users already annotated VCF files using SpnEff,BCFTools, VarNote, Vcfanno and just wish to use aGDS for the following analysis, running the followign R script to convert annotated VCF files into aGDS file 317 | - ```$ Rscript convertVCFtoGDS.r annotated.vcf output.agds``` 318 | 319 | 320 | 321 | ### 11.2 Add In Functional Annotations to aGDS 322 | 1. If users have external annotation sources or annotation in text tables that containing varaint sets, this function will be able to add in the new functional annotations into the new node of aGDS files: 323 | - ```$ Rscript FAVORannotatorAddIn.R input.agds AnnotationFile.tsv``` 324 | 325 | 326 | ### 11.3 Extract Variant Functional Annotation to Text Tables from aGDS 327 | 328 | 1. If users prefer to have the Variant Functional Annotation results write into Text Tables, this Rscripts will be able to extract the functional annotation from aGDS and write into the text tables: 329 | - ```$ Rscript FAVORaGDSToText.R annotated.agds AnnotationTextTable.tsv``` 330 | 331 | 332 | 333 | ## 12 Demo Using Real Example (1000 Genomes Project Data) 334 | 335 | The following steps are the demo of how to FAVORannotato through using real genotype data from 1000 Genomes Project. From the step of obtaining the genotype data to the end point of creating aGDS are illustrated here below in the step by step process. 336 | 337 | 338 | ### 12.1 Download the 1000G VCF 339 | 340 | If users can use command line below to obtain the ([1000G](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7059836/)) from the FTP ([1000 Genomes official website](http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/)), for the following process. 341 | 342 | Change the directory: 343 | - ```$ cd ../../Data/TestData/1000G/ ``` 344 | 345 | Download VCF to the directory (chr22): 346 | 347 | - ```$ wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chr22.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz ``` 348 | 349 | Additionally if download chr1: 350 | - ``` $ wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chr22.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz``` 351 | 352 | 353 | ### 12.2 Convert VCF to GDS (chr22) 354 | 355 | Users can use command line below to convert the VCF to GDS. 
356 | 357 | Change the directory: 358 | - ```$ cd ../../../Scripts/UTL ``` 359 | 360 | Run program to create GDS: 361 | - ```$ Rscript convertVCFtoGDS.r ../../Data/TestData/Input/ALL.chr22.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz ../../Data/1000G/All.chr22.27022019.GRCh38.phased.gds ``` 362 | 363 | And you will get the following output on terminal: 364 | 365 | ``` 366 | ALL.chr22.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz (176.9M) 367 | file format: VCFv4.3 368 | the number of sets of chromosomes (ploidy): 2 369 | the number of samples: 2,548 370 | genotype storage: bit2 371 | compression method: LZMA_RA 372 | of samples: 2548 373 | ... 374 | ``` 375 | 376 | 377 | ### 12.3 Annotate GDS using FAVORannotator to create aGDS (no pre-install version) 378 | 379 | Users can use following command to annotate GDS using FAVORannotator to create aGDS . 380 | 381 | Change the directory: 382 | - ```$ cd ../../Data/1000G/ ``` 383 | 384 | Copy FAVORannotator program to the current directory: 385 | - ```$ cp ../../../Scripts/CSV/FAVORannotatorCSVEssentialDB.R .``` 386 | - ```$ cp ../../../Scripts/CSV/FAVORannotatorCSVFullDB.R . ``` 387 | 388 | Run program to annotate GDS using FAVORannotator reading FAVOR Essential Database to create aGDS(chr22): 389 | - ```$ Rscript FAVORannotatorCSVEssentialDB.R All.chr22.27022019.GRCh38.phased.gds 22 ``` 390 | 391 | And you will get the following output on terminal: 392 | ``` 393 | [1] gds.file: All.chr22.27022019.GRCh38.phased.gds 394 | [1] chr: 22 395 | [1] use_compression Yes 396 | --2022-09-14 16:42:28-- https://dataverse.harvard.edu/api/access/datafile/6170504 397 | 398 | ``` 399 | 400 | 401 | Run program to annotate GDS using FAVORannotator reading FAVOR Full Database to create aGDS(chr22): 402 | - ```$ Rscript FAVORannotatorCSVFullDB.R All.chr22.27022019.GRCh38.phased.gds 22 ``` 403 | 404 | And you will get the following output on terminal: 405 | ``` 406 | [1] gds.file: All.chr22.27022019.GRCh38.phased.gds 407 | [1] chr: 22 408 | [1] use_compression: Yes 409 | --2022-09-14 16:39:31-- https://dataverse.harvard.edu/api/access/datafile/6358299 410 | 411 | 412 | ``` 413 | 414 | 415 | 416 | 417 | 418 | 419 | 420 | 421 | ## 13 Dependencies 422 | FAVORannotator imports R packages: dplyr, SeqArray, gdsfmt, RPostgreSQL, stringr, readr, stringi. These dependencies should be installed before running FAVORannotator. 423 | 424 | FAVORannotator (SQL version) depends upon PostgreSQL software. 425 | 426 | FAVORannotator (CSV version) depends upon xsv software. 427 | 428 | ## Data Availability 429 | The whole-genome individual functional annotation data assembled from a variety of sources and the computed annotation principal components are available at the [Functional Annotation of Variant - Online Resource (FAVOR)](https://favor.genohub.org) site. 430 | 431 | ## Version 432 | The current version is 1.1.1 (August 30th, 2022). 433 | ## License 434 | This software is licensed under GPLv3. 
435 | 436 | ![GPLv3](http://www.gnu.org/graphics/gplv3-127x51.png) 437 | [GNU General Public License, GPLv3](http://www.gnu.org/copyleft/gpl.html) 438 | -------------------------------------------------------------------------------- /Scripts/CSV/Dockerfile.txt: -------------------------------------------------------------------------------- 1 | # Base image https://hub.docker.com/u/rocker/ 2 | FROM rocker/r-base:latest 3 | 4 | ## create directories 5 | RUN mkdir -p /01_data 6 | RUN mkdir -p /02_code 7 | RUN mkdir -p /03_output 8 | 9 | ## copy files 10 | #COPY install_packages.R 11 | #COPY /02_code/myScript.R 12 | 13 | ## install R-packages 14 | RUN Rscript install_packages.R 15 | 16 | -------------------------------------------------------------------------------- /Scripts/CSV/FAVORannotatorCSVEssentialDB.R: -------------------------------------------------------------------------------- 1 | ############################################################################# 2 | #Title: FAVORannotatorCSVEssentialDB 3 | #Function: 4 | # * Build the aGDS file thorugh performing functional annotation, 5 | # * without pre-install the FAVOR Full Database, download database on the fly. 6 | #Author: Hufeng Zhou 7 | #Time: Sept 27th 2022 8 | ############################################################################# 9 | 10 | 11 | args <- commandArgs(TRUE) 12 | ### mandatory 13 | 14 | gds.file <- args[1] 15 | print(paste0("gds.file: ",gds.file)) 16 | 17 | chr <- as.numeric(args[2]) 18 | print(paste0("chr: ",chr)) 19 | #chr<-19 20 | 21 | use_compression <- "Yes" 22 | print(paste0("use_compression: ",use_compression)) 23 | 24 | ### R package 25 | library(gdsfmt) 26 | library(SeqArray) 27 | library(readr) 28 | 29 | ### xsv directory 30 | xsv <- "~/.cargo/bin/xsv" 31 | 32 | ## read info 33 | DB_info <- read.csv(url("https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/main/Scripts/SQL/FAVORdatabase_chrsplit.csv"),header=TRUE) 34 | DB_info_chr <- DB_info[DB_info$Chr==chr,] 35 | chr_splitnum <- sum(DB_info$Chr==chr) 36 | 37 | ### DB file 38 | DB_path <- "n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/" 39 | 40 | ### output 41 | output_path <- "./" 42 | 43 | ### annotation file 44 | anno_file_name_1 <- "Anno_chr" 45 | anno_file_name_2 <- "_STAARpipeline.csv" 46 | 47 | 48 | ########################################################################## 49 | ### Step 0 (Download FAVOR Database) 50 | ########################################################################## 51 | URLs <- data.frame(chr = c(1:22), 52 | URL = c("https://dataverse.harvard.edu/api/access/datafile/6170506", 53 | "https://dataverse.harvard.edu/api/access/datafile/6170501", 54 | "https://dataverse.harvard.edu/api/access/datafile/6170502", 55 | "https://dataverse.harvard.edu/api/access/datafile/6170521", 56 | "https://dataverse.harvard.edu/api/access/datafile/6170511", 57 | "https://dataverse.harvard.edu/api/access/datafile/6170516", 58 | "https://dataverse.harvard.edu/api/access/datafile/6170505", 59 | "https://dataverse.harvard.edu/api/access/datafile/6170513", 60 | "https://dataverse.harvard.edu/api/access/datafile/6165867", 61 | "https://dataverse.harvard.edu/api/access/datafile/6170507", 62 | "https://dataverse.harvard.edu/api/access/datafile/6170517", 63 | "https://dataverse.harvard.edu/api/access/datafile/6170520", 64 | "https://dataverse.harvard.edu/api/access/datafile/6170503", 65 | "https://dataverse.harvard.edu/api/access/datafile/6170509", 66 | "https://dataverse.harvard.edu/api/access/datafile/6170515", 67 | 
"https://dataverse.harvard.edu/api/access/datafile/6170518", 68 | "https://dataverse.harvard.edu/api/access/datafile/6170510", 69 | "https://dataverse.harvard.edu/api/access/datafile/6170508", 70 | "https://dataverse.harvard.edu/api/access/datafile/6170514", 71 | "https://dataverse.harvard.edu/api/access/datafile/6170512", 72 | "https://dataverse.harvard.edu/api/access/datafile/6170519", 73 | "https://dataverse.harvard.edu/api/access/datafile/6170504")) 74 | 75 | URL <- URLs[chr, "URL"] 76 | system(paste0("wget --progress=bar:force:noscroll ", URLs[chr, "URL"])) 77 | system(paste0("tar -xvf ", gsub(".*?([0-9]+).*", "\\1", URL))) 78 | 79 | ########################################################################## 80 | ### Step 1 (Varinfo_gds) 81 | ########################################################################## 82 | 83 | start_time <- Sys.time() 84 | ### make directory 85 | system(paste0("mkdir ",output_path,"chr",chr)) 86 | 87 | ### chromosome number 88 | 89 | ## open GDS 90 | genofile <- seqOpen(gds.file, readonly = FALSE) 91 | 92 | genofile 93 | 94 | CHR <- as.numeric(seqGetData(genofile, "chromosome")) 95 | position <- as.integer(seqGetData(genofile, "position")) 96 | REF <- as.character(seqGetData(genofile, "$ref")) 97 | ALT <- as.character(seqGetData(genofile, "$alt")) 98 | 99 | VarInfo_genome <- paste0(CHR,"-",position,"-",REF,"-",ALT) 100 | 101 | ## Generate VarInfo 102 | for(kk in 1:dim(DB_info_chr)[1]) 103 | { 104 | print(kk) 105 | 106 | VarInfo <- VarInfo_genome[(position>=DB_info_chr$Start_Pos[kk])&(position<=DB_info_chr$End_Pos[kk])] 107 | VarInfo <- data.frame(VarInfo) 108 | 109 | write.csv(VarInfo,paste0(output_path,"chr",chr,"/VarInfo_chr",chr,"_",kk,".csv"),quote=FALSE,row.names = FALSE) 110 | } 111 | 112 | ########################################################################## 113 | ### Step 2 (Annotate) 114 | ########################################################################## 115 | ### anno channel (subset) 116 | anno_colnum <- c(1,8:12,15,16,19,23,25:36) 117 | 118 | 119 | for(kk in 1:chr_splitnum) 120 | { 121 | print(kk) 122 | 123 | system(paste0(xsv," join --left VarInfo ",output_path,"chr",chr,"/VarInfo_chr",chr,"_",kk,".csv variant_vcf ",DB_path,"/chr",chr,"_",kk,".csv > ",output_path,"chr",chr,"/Anno_chr",chr,"_",kk,".csv")) 124 | } 125 | 126 | ## merge info 127 | Anno <- paste0(output_path,"chr",chr,"/Anno_chr",chr,"_",seq(1:chr_splitnum),".csv ") 128 | merge_command <- paste0(xsv," cat rows ",Anno[1]) 129 | 130 | for(kk in 2:chr_splitnum) 131 | { 132 | merge_command <- paste0(merge_command,Anno[kk]) 133 | } 134 | 135 | merge_command <- paste0(merge_command,"> ",output_path,"chr",chr,"/Anno_chr",chr,".csv") 136 | 137 | system(merge_command) 138 | 139 | ## subset 140 | anno_colnum_xsv <- c() 141 | for(kk in 1:(length(anno_colnum)-1)) 142 | { 143 | anno_colnum_xsv <- paste0(anno_colnum_xsv,anno_colnum[kk],",") 144 | } 145 | anno_colnum_xsv <- paste0(anno_colnum_xsv,anno_colnum[length(anno_colnum)]) 146 | 147 | system(paste0(xsv," select ",anno_colnum_xsv," ",output_path,"chr",chr,"/Anno_chr",chr,".csv > ",output_path,"chr",chr,"/Anno_chr",chr,"_STAARpipeline.csv")) 148 | 149 | ########################################################################## 150 | ### Step 3 (gds2agds) 151 | ########################################################################## 152 | 153 | ### read annotation data 154 | FunctionalAnnotation <- read_csv(paste0(output_path,"chr",chr,"/",anno_file_name_1,chr,anno_file_name_2), 155 | 
col_types=list(col_character(),col_double(),col_double(),col_double(),col_double(), 156 | col_double(),col_double(),col_double(),col_double(),col_double(), 157 | col_character(),col_character(),col_character(),col_double(),col_character(), 158 | col_character(),col_character(),col_character(),col_character(),col_double(), 159 | col_double(),col_character())) 160 | 161 | dim(FunctionalAnnotation) 162 | 163 | ## rename colnames 164 | colnames(FunctionalAnnotation)[2] <- "apc_conservation" 165 | colnames(FunctionalAnnotation)[7] <- "apc_local_nucleotide_diversity" 166 | colnames(FunctionalAnnotation)[9] <- "apc_protein_function" 167 | 168 | Anno.folder <- index.gdsn(genofile, "annotation/info") 169 | add.gdsn(Anno.folder, "FunctionalAnnotation", val=FunctionalAnnotation, compress="LZMA_ra", closezip=TRUE) 170 | 171 | genofile 172 | 173 | seqClose(genofile) 174 | 175 | end_time <- Sys.time() 176 | 177 | print("time") 178 | end_time - start_time 179 | -------------------------------------------------------------------------------- /Scripts/CSV/FAVORannotatorCSVFullDB.R: -------------------------------------------------------------------------------- 1 | ############################################################################# 2 | #Title: FAVORannotatorCSVFullDB 3 | #Function: 4 | # * Build the aGDS file thorugh performing functional annotation, 5 | # * without pre-install the FAVOR Full Database, download database on the fly. 6 | #Author: Hufeng Zhou 7 | #Time: Sept 27th 2022 8 | ############################################################################# 9 | 10 | 11 | args <- commandArgs(TRUE) 12 | ### mandatory 13 | 14 | gds.file <- args[1] 15 | print(paste0("gds.file: ",gds.file)) 16 | 17 | #outfile <- args[2] 18 | #print(paste0("outfile: ",outfile)) 19 | 20 | chr <- as.numeric(args[2]) 21 | print(paste0("chr: ",chr)) 22 | #chr<-19 23 | 24 | use_compression <- "Yes" 25 | print(paste0("use_compression: ",use_compression)) 26 | 27 | ### output 28 | output_path <- "./" 29 | 30 | ### make directory 31 | system(paste0("mkdir ",output_path,"chr",chr)) 32 | 33 | ### annotation file 34 | dir_anno <- "./" 35 | 36 | 37 | ### load required package 38 | library(gdsfmt) 39 | library(SeqArray) 40 | library(readr) 41 | 42 | 43 | ### chromosome number 44 | ## read info 45 | DB_info <- read.csv(url("https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/main/Scripts/SQL/FAVORdatabase_chrsplit.csv"),header=TRUE) 46 | DB_info <- DB_info[DB_info$Chr==chr,] 47 | 48 | ### DB file 49 | DB_path <- "./" 50 | 51 | ### xsv directory 52 | xsv <- "~/.cargo/bin/xsv" 53 | 54 | 55 | 56 | 57 | ########################################################################## 58 | ### Step 0 (Download FAVOR Database) 59 | ########################################################################## 60 | URLs <- data.frame(chr = c(1:22), 61 | URL = c("https://dataverse.harvard.edu/api/access/datafile/6380374", #1 62 | "https://dataverse.harvard.edu/api/access/datafile/6380471", #2 63 | "https://dataverse.harvard.edu/api/access/datafile/6380732", #3 64 | "https://dataverse.harvard.edu/api/access/datafile/6381512", #4 65 | "https://dataverse.harvard.edu/api/access/datafile/6381457", #5 66 | "https://dataverse.harvard.edu/api/access/datafile/6381327", #6 67 | "https://dataverse.harvard.edu/api/access/datafile/6384125", #7 68 | "https://dataverse.harvard.edu/api/access/datafile/6382573", #8 69 | "https://dataverse.harvard.edu/api/access/datafile/6384268", #9 70 | "https://dataverse.harvard.edu/api/access/datafile/6380273", #10 71 
| "https://dataverse.harvard.edu/api/access/datafile/6384154", #11 72 | "https://dataverse.harvard.edu/api/access/datafile/6384198", #12 73 | "https://dataverse.harvard.edu/api/access/datafile/6388366", #13 74 | "https://dataverse.harvard.edu/api/access/datafile/6388406", #14 75 | "https://dataverse.harvard.edu/api/access/datafile/6388427", #15 76 | "https://dataverse.harvard.edu/api/access/datafile/6388551", #16 77 | "https://dataverse.harvard.edu/api/access/datafile/6388894", #17 78 | "https://dataverse.harvard.edu/api/access/datafile/6376523", #18 79 | "https://dataverse.harvard.edu/api/access/datafile/6376522", #19 80 | "https://dataverse.harvard.edu/api/access/datafile/6376521", #20 81 | "https://dataverse.harvard.edu/api/access/datafile/6358305", #21 82 | "https://dataverse.harvard.edu/api/access/datafile/6358299")) #22 83 | 84 | URL <- URLs[chr, "URL"] 85 | system(paste0("wget --progress=bar:force:noscroll ", URLs[chr, "URL"])) 86 | system(paste0("tar -xvf ", gsub(".*?([0-9]+).*", "\\1", URL))) 87 | 88 | ########################################################################## 89 | ### Step 1 (Varinfo_gds) 90 | ########################################################################## 91 | 92 | ## open GDS 93 | genofile <- seqOpen(gds.file, readonly = FALSE) 94 | 95 | genofile 96 | 97 | CHR <- as.numeric(seqGetData(genofile, "chromosome")) 98 | position <- as.integer(seqGetData(genofile, "position")) 99 | REF <- as.character(seqGetData(genofile, "$ref")) 100 | ALT <- as.character(seqGetData(genofile, "$alt")) 101 | 102 | VarInfo_genome <- paste0(CHR,"-",position,"-",REF,"-",ALT) 103 | 104 | ## Generate VarInfo 105 | for(kk in 1:dim(DB_info)[1]) 106 | { 107 | print(kk) 108 | VarInfo <- VarInfo_genome[(position>=DB_info$Start_Pos[kk])&(position<=DB_info$End_Pos[kk])] 109 | VarInfo <- data.frame(VarInfo) 110 | write.csv(VarInfo,paste0(output_path,"chr",chr,"/VarInfo_chr",chr,"_",kk,".csv"),quote=FALSE,row.names = FALSE) 111 | } 112 | 113 | ########################################################################## 114 | ### Step 2 (Annotate) 115 | ########################################################################## 116 | start_time <- Sys.time() 117 | chr_splitnum <- sum(DB_info$Chr==chr) 118 | 119 | for(kk in 1:chr_splitnum) 120 | { 121 | print(kk) 122 | #system(paste0(xsv," index ",DB_path,"/chr",chr,"_",kk,".csv)) 123 | system(paste0(xsv," join --left VarInfo ",output_path,"chr",chr,"/VarInfo_chr",chr,"_",kk,".csv variant_vcf ",DB_path,"/chr",chr,"_",kk,".csv > ",output_path,"chr",chr,"/Anno_chr",chr,"_",kk,".csv")) 124 | } 125 | 126 | ## merge info 127 | Anno <- paste0(output_path,"chr",chr,"/Anno_chr",chr,"_",seq(1:chr_splitnum),".csv ") 128 | merge_command <- paste0(xsv," cat rows ",Anno[1]) 129 | for(kk in 2:chr_splitnum) 130 | { 131 | merge_command <- paste0(merge_command,Anno[kk]) 132 | } 133 | merge_command <- paste0(merge_command,"> ",output_path,"chr",chr,"/Anno_chr",chr,".csv") 134 | system(merge_command) 135 | 136 | ########################################################################## 137 | ### Step 3 (gds2agds) 138 | ########################################################################## 139 | ### read annotation data 140 | FunctionalAnnotation <- read_csv(paste0(dir_anno,"chr",chr,"/Anno_chr",chr,".csv")) 141 | dim(FunctionalAnnotation) 142 | 143 | Anno.folder <- index.gdsn(genofile, "annotation/info") 144 | add.gdsn(Anno.folder, "FAVORFullDB", val=FunctionalAnnotation, compress="LZMA_ra", closezip=TRUE) 145 | genofile 146 | 147 | seqClose(genofile) 148 | 
end_time <- Sys.time() 149 | 150 | print("time") 151 | end_time - start_time 152 | 153 | #system(paste0("mv ", gds.file, " ", outfile, ".gds")) 154 | 155 | -------------------------------------------------------------------------------- /Scripts/CSV/FAVORannotatorv2aGDS.r: -------------------------------------------------------------------------------- 1 | rm(list=ls()) 2 | gc() 3 | ### R package 4 | library(gdsfmt) 5 | library(SeqArray) 6 | library(readr) 7 | source('config.R') 8 | 9 | CHRN <- as.numeric(commandArgs(TRUE)[1]) 10 | 11 | ### make directory 12 | system(paste0("mkdir ",output_path,"/chr",CHRN)) 13 | start_time<-Sys.time() 14 | 15 | ### chromosome number 16 | ## read info 17 | DB_info <- read.csv(file_DBsplit,header=TRUE) 18 | chr_splitnum <- sum(DB_info$Chr==CHRN) 19 | DB_info_chr <- DB_info[DB_info$Chr==CHRN,] 20 | 21 | ## open GDS 22 | genofile<-seqOpen(eval(parse(text = paste0("gds.chr",CHRN,".fn"))), readonly = FALSE) 23 | 24 | CHR <- as.numeric(seqGetData(genofile, "chromosome")) 25 | position <- as.integer(seqGetData(genofile, "position")) 26 | REF <- as.character(seqGetData(genofile, "$ref")) 27 | ALT <- as.character(seqGetData(genofile, "$alt")) 28 | 29 | VarInfo_genome <- paste0(CHR,"-",position,"-",REF,"-",ALT) 30 | 31 | ## Generate VarInfo 32 | for(kk in 1:dim(DB_info_chr)[1]) 33 | { 34 | print(kk) 35 | VarInfo <- VarInfo_genome[(position>=DB_info_chr$Start_Pos[kk])&(position<=DB_info_chr$End_Pos[kk])] 36 | VarInfo <- data.frame(VarInfo) 37 | write.csv(VarInfo,paste0(output_path,"/chr",CHRN,"/VarInfo_chr",CHRN,"_",kk,".csv"),quote=FALSE,row.names = FALSE) 38 | } 39 | gc() 40 | 41 | for(kk in 1:chr_splitnum) 42 | { 43 | print(kk) 44 | system(paste0(xsv," join --left VarInfo ",output_path,"/chr",CHRN,"/VarInfo_chr",CHRN,"_",kk,".csv variant_vcf ",DB_path,"/chr",CHRN,"_",kk,".csv > ",output_path,"/chr",CHRN,"/Anno_chr",CHRN,"_",kk,".csv")) 45 | } 46 | 47 | ## merge info 48 | Anno <- paste0(output_path,"/chr",CHRN,"/Anno_chr",CHRN,"_",seq(1:chr_splitnum),".csv ") 49 | merge_command <- paste0(xsv," cat rows ",Anno[1]) 50 | 51 | for(kk in 2:chr_splitnum) 52 | { 53 | merge_command <- paste0(merge_command,Anno[kk]) 54 | } 55 | 56 | merge_command <- paste0(merge_command,"> ",output_path,"/chr",CHRN,"/Anno_chr",CHRN,".csv") 57 | system(merge_command) 58 | 59 | gc() 60 | ### read annotation data 61 | FunctionalAnnotation <- read_csv(paste0(output_path,"/chr",CHRN,"/Anno_chr",CHRN,".csv")) 62 | 63 | dim(FunctionalAnnotation) 64 | 65 | 66 | ## open GDS 67 | Anno.folder <- addfolder.gdsn(index.gdsn(genofile, "annotation/info"), "FunctionalAnnotation") 68 | add.gdsn(Anno.folder, "FunctionalAnnotation", val=FunctionalAnnotation, compress="LZMA_ra", closezip=TRUE) 69 | genofile 70 | 71 | seqClose(genofile) 72 | end_time<-Sys.time() 73 | 74 | print("time:") 75 | end_time - start_time 76 | -------------------------------------------------------------------------------- /Scripts/CSV/config.R: -------------------------------------------------------------------------------- 1 | ### DB split information 2 | file_DBsplit <- "../SQL/FAVORdatabase_chrsplit.csv" 3 | 4 | ### output 5 | output_path <- "../../../Output" 6 | 7 | ### xsv directory 8 | xsv <- "~/.cargo/bin/xsv" 9 | 10 | ### DB file 11 | DB_path <- "../../../FullDB/FAVORDB/" 12 | 13 | #---------chr1----------------------- 14 | vcf.chr1.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr1.vcf" 15 | gds.chr1.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr1.agds" 16 | #---------chr2----------------------- 17 | 
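## Note on how these entries are consumed (illustrative, commented out): the
## vcf.chrN.fn / gds.chrN.fn names defined in this file are looked up by name at
## run time, e.g. convertVCFtoGDS.r and FAVORannotatorv2aGDS.r call
## eval(parse(text = paste0("gds.chr", CHRN, ".fn"))). An equivalent lookup with
## get() is sketched below; CHRN is only an example value here, not something this
## config defines.
# CHRN <- 22
# vcf.fn <- get(paste0("vcf.chr", CHRN, ".fn"))   # same value eval(parse(...)) would return
# gds.fn <- get(paste0("gds.chr", CHRN, ".fn"))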
vcf.chr2.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr2.vcf" 18 | gds.chr2.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr2.agds" 19 | #---------chr3----------------------- 20 | vcf.chr3.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr3.vcf" 21 | gds.chr3.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr3.agds" 22 | #---------chr4----------------------- 23 | vcf.chr4.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr4.vcf" 24 | gds.chr4.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr4.agds" 25 | #---------chr5----------------------- 26 | vcf.chr5.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr5.vcf" 27 | gds.chr5.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr5.agds" 28 | #---------chr6----------------------- 29 | vcf.chr6.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr6.vcf" 30 | gds.chr6.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr6.agds" 31 | #---------chr7----------------------- 32 | vcf.chr7.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr7.vcf" 33 | gds.chr7.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr7.agds" 34 | #---------chr8----------------------- 35 | vcf.chr8.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr8.vcf" 36 | gds.chr8.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr8.agds" 37 | #---------chr9----------------------- 38 | vcf.chr9.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr9.vcf" 39 | gds.chr9.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr9.agds" 40 | #---------chr10----------------------- 41 | vcf.chr10.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr10.vcf" 42 | gds.chr10.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr10.agds" 43 | #---------chr10----------------------- 44 | vcf.chr10.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr10.vcf" 45 | gds.chr10.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr10.agds" 46 | #---------chr11----------------------- 47 | vcf.chr11.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr11.vcf" 48 | gds.chr11.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr11.agds" 49 | #---------chr11----------------------- 50 | vcf.chr11.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr11.vcf" 51 | gds.chr11.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr11.agds" 52 | #---------chr12----------------------- 53 | vcf.chr12.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr12.vcf" 54 | gds.chr12.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr12.agds" 55 | #---------chr13----------------------- 56 | vcf.chr13.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr13.vcf" 57 | gds.chr13.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr13.agds" 58 | #---------chr14----------------------- 59 | vcf.chr14.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr14.vcf" 60 | gds.chr14.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr14.agds" 61 | #---------chr15----------------------- 62 | vcf.chr15.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr15.vcf" 63 | gds.chr15.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr15.agds" 64 | #---------chr16----------------------- 65 | vcf.chr16.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr16.vcf" 66 | gds.chr16.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr16.agds" 67 | #---------chr17----------------------- 68 | vcf.chr17.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr17.vcf" 69 | gds.chr17.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr17.agds" 70 | #---------chr18----------------------- 71 | vcf.chr18.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr18.vcf" 72 | gds.chr18.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr18.agds" 73 | 
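## The chr1-chr22 entries in this file all follow the same naming pattern; a
## compact, optional sketch (commented out) of how the same variables could be
## generated in a loop, assuming the gnomAD v3.1.2 file naming used above holds
## for every chromosome:
# for (N in 1:22)
# {
#   assign(paste0("vcf.chr", N, ".fn"), paste0("../../../Data/gnomad.genomes.v3.1.2.sites.chr", N, ".vcf"))
#   assign(paste0("gds.chr", N, ".fn"), paste0("../../../Data/gnomad.genomes.v3.1.2.sites.chr", N, ".agds"))
# }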
#---------chr19----------------------- 74 | vcf.chr19.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr19.vcf" 75 | gds.chr19.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr19.agds" 76 | #---------chr20----------------------- 77 | vcf.chr20.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr20.vcf" 78 | gds.chr20.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr20.agds" 79 | #---------chr21----------------------- 80 | vcf.chr21.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr21.vcf" 81 | gds.chr21.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr21.agds" 82 | #---------chr22----------------------- 83 | vcf.chr22.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr22.vcf" 84 | gds.chr22.fn<-"../../../Data/gnomad.genomes.v3.1.2.sites.chr22.agds" 85 | 86 | -------------------------------------------------------------------------------- /Scripts/CSV/convertVCFtoGDS.r: -------------------------------------------------------------------------------- 1 | ############################################################################# 2 | #Title: convertVCFtoGDS 3 | #Function: 4 | # * Build the GDS file from VCF files 5 | #Author: Hufeng Zhou 6 | #Time: Nov 27th 2021 7 | ############################################################################# 8 | library(gdsfmt) 9 | library(SeqArray) 10 | 11 | #import configuration file 12 | source('config.R') 13 | 14 | #vcf.chr10.fn=as.character(commandArgs(TRUE)[1]) 15 | #gds.chr10.fn=as.character(commandArgs(TRUE)[2]) 16 | CHRN=as.character(commandArgs(TRUE)[1]) 17 | seqVCF2GDS(eval(parse(text = paste0("vcf.chr",CHRN,".fn"))), eval(parse(text = paste0("gds.chr",CHRN,".fn"))), header = NULL, genotype.var.name = "GT", info.import=NULL, fmt.import=NULL, ignore.chr.prefix="chr", raise.error=TRUE, verbose=TRUE) 18 | genofile<-seqOpen(eval(parse(text = paste0("gds.chr",CHRN,".fn"))), readonly = FALSE) 19 | print("GDS built") 20 | 21 | ###Closing Up### 22 | genofile 23 | seqClose(genofile) 24 | -------------------------------------------------------------------------------- /Scripts/CSV/subBatchJobs.sh: -------------------------------------------------------------------------------- 1 | ## define your array 2 | MYFILES=(1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22) 3 | echo ${MYFILES[*]} 4 | ## count how many elements I have 5 | NUM=${#MYFILES[@]} 6 | ZBNUM=$(($NUM -1 )) 7 | 8 | if [ $ZBNUM -ge 0 ]; then 9 | ## not very elegant workaround to the array export issue: 10 | ## package the array into a string with a specific FS 11 | STRINGFILES=$( IFS=$','; echo "${MYFILES[*]}" ) 12 | 13 | printf "STRINGFILES = " 14 | echo $STRINGFILES 15 | 16 | export STRINGFILES 17 | 18 | ## example of how to reconvert into an array inside the slurm file 19 | #IFS=',' read -r -a MYNEWFILES <<< "$STRINGFILES" 20 | #myiid=2 21 | #CURRENTFILE=${MYNEWFILES[$myiid]} 22 | #echo "currentfile is $CURRENTFILE " 23 | 24 | ## submit job 25 | sbatch --array=0-$ZBNUM subBatchJobs.txt 26 | fi 27 | 28 | -------------------------------------------------------------------------------- /Scripts/CSV/subBatchJobs.txt: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -c 1 3 | #SBATCH -N 1 4 | #SBATCH -t 10000 5 | #SBATCH -p shared,xlin,xlin-lab 6 | #SBATCH --mem=128000 7 | #SBATCH -o array_%A_%a.out 8 | #SBATCH -e array_%A_%a.err 9 | #SBATCH --mail-type=ALL 10 | 11 | ## expand back variable into array 12 | IFS=',' read -r -a MYFILES <<< "$STRINGFILES" 13 | cur=${MYFILES[$SLURM_ARRAY_TASK_ID]} 14 | 15 | . 
~/.bash_profile 16 | module load R/4.0.2-fasrc01 17 | 18 | cd /directory/aGDSFolder # put all the aGDS files to be annotated in the aGDSFolder 19 | 20 | Rscript FAVORannotatorCSVFullDB.R InputData.chr$cur.agds $cur 21 | 22 | Rscript FAVORannotatorCSVEssentialDB.R InputData.chr$cur.agds $cur 23 | 24 | -------------------------------------------------------------------------------- /Scripts/CSV/submitJobs.sh: -------------------------------------------------------------------------------- 1 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=65000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 1; Rscript ./FAVORannotatorv2aGDS.r 1' 2 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=65000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 2; Rscript ./FAVORannotatorv2aGDS.r 2' 3 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=60000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 3; Rscript ./FAVORannotatorv2aGDS.r 3' 4 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=60000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 4; Rscript ./FAVORannotatorv2aGDS.r 4' 5 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=55000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 5; Rscript ./FAVORannotatorv2aGDS.r 5' 6 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=50000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 6; Rscript ./FAVORannotatorv2aGDS.r 6' 7 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=50000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 7; Rscript ./FAVORannotatorv2aGDS.r 7' 8 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=50000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 8; Rscript ./FAVORannotatorv2aGDS.r 8' 9 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=40000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 9; Rscript ./FAVORannotatorv2aGDS.r 9' 10 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=40000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 10; Rscript ./FAVORannotatorv2aGDS.r 10' 11 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=40000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 11; Rscript ./FAVORannotatorv2aGDS.r 11' 12 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=40000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 12; Rscript ./FAVORannotatorv2aGDS.r 12' 13 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=40000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 13; Rscript ./FAVORannotatorv2aGDS.r 13' 14 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=35000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 14; Rscript ./FAVORannotatorv2aGDS.r 14' 15 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=35000 --wrap='module load 
postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 15; Rscript ./FAVORannotatorv2aGDS.r 15' 16 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=30000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 16; Rscript ./FAVORannotatorv2aGDS.r 16' 17 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=30000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 17; Rscript ./FAVORannotatorv2aGDS.r 17' 18 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=30000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 18; Rscript ./FAVORannotatorv2aGDS.r 18' 19 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=30000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 19; Rscript ./FAVORannotatorv2aGDS.r 19' 20 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=20000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 20; Rscript ./FAVORannotatorv2aGDS.r 20' 21 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=20000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 21; Rscript ./FAVORannotatorv2aGDS.r 21' 22 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=20000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 22; Rscript ./FAVORannotatorv2aGDS.r 22' 23 | -------------------------------------------------------------------------------- /Scripts/Cloud/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Scripts/Cloud/.DS_Store -------------------------------------------------------------------------------- /Scripts/Cloud/._.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Scripts/Cloud/._.DS_Store -------------------------------------------------------------------------------- /Scripts/Cloud/DNAnexus/._FAVORannotatorDev.R: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Scripts/Cloud/DNAnexus/._FAVORannotatorDev.R -------------------------------------------------------------------------------- /Scripts/Cloud/DNAnexus/._code.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Scripts/Cloud/DNAnexus/._code.sh -------------------------------------------------------------------------------- /Scripts/Cloud/DNAnexus/._favorannotator.R: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Scripts/Cloud/DNAnexus/._favorannotator.R -------------------------------------------------------------------------------- /Scripts/Cloud/DNAnexus/FAVORannotatorDev.R: -------------------------------------------------------------------------------- 1 | args <- commandArgs(TRUE) 2 | ### mandatory 3 | outfile <- 
args[1] 4 | gds.file <- args[2] 5 | chr <- as.numeric(args[3]) 6 | #chr<-19 7 | use_compression <- args[4] 8 | 9 | 10 | 11 | ########################################################################## 12 | ### Step 0 (Download FAVOR Database) 13 | ########################################################################## 14 | URLs <- data.frame(chr = c(1:22), 15 | URL = c("https://dataverse.harvard.edu/api/access/datafile/6170506", 16 | "https://dataverse.harvard.edu/api/access/datafile/6170501", 17 | "https://dataverse.harvard.edu/api/access/datafile/6170502", 18 | "https://dataverse.harvard.edu/api/access/datafile/6170521", 19 | "https://dataverse.harvard.edu/api/access/datafile/6170511", 20 | "https://dataverse.harvard.edu/api/access/datafile/6170516", 21 | "https://dataverse.harvard.edu/api/access/datafile/6170505", 22 | "https://dataverse.harvard.edu/api/access/datafile/6170513", 23 | "https://dataverse.harvard.edu/api/access/datafile/6165867", 24 | "https://dataverse.harvard.edu/api/access/datafile/6170507", 25 | "https://dataverse.harvard.edu/api/access/datafile/6170517", 26 | "https://dataverse.harvard.edu/api/access/datafile/6170520", 27 | "https://dataverse.harvard.edu/api/access/datafile/6170503", 28 | "https://dataverse.harvard.edu/api/access/datafile/6170509", 29 | "https://dataverse.harvard.edu/api/access/datafile/6170515", 30 | "https://dataverse.harvard.edu/api/access/datafile/6170518", 31 | "https://dataverse.harvard.edu/api/access/datafile/6170510", 32 | "https://dataverse.harvard.edu/api/access/datafile/6170508", 33 | "https://dataverse.harvard.edu/api/access/datafile/6170514", 34 | "https://dataverse.harvard.edu/api/access/datafile/6170512", 35 | "https://dataverse.harvard.edu/api/access/datafile/6170519", 36 | "https://dataverse.harvard.edu/api/access/datafile/6170504")) 37 | 38 | URL <- URLs[chr, "URL"] 39 | system(paste0("wget --progress=bar:force:noscroll ", URLs[chr, "URL"])) 40 | system(paste0("tar -xvf ", gsub(".*?([0-9]+).*", "\\1", URL))) 41 | 42 | ########################################################################## 43 | ### Step 1 (Varinfo_gds) 44 | ########################################################################## 45 | 46 | ### output 47 | output_path <- "/root/./" 48 | 49 | ### make directory 50 | system(paste0("mkdir ",output_path,"chr",chr)) 51 | 52 | ### R package 53 | library(gdsfmt) 54 | library(SeqArray) 55 | library(SeqVarTools) 56 | 57 | ### chromosome number 58 | ## read info 59 | DB_info <- read.csv(url("https://raw.githubusercontent.com/xihaoli/STAARpipeline-Tutorial/main/FAVORannotator_csv/FAVORdatabase_chrsplit.csv"),header=TRUE) 60 | DB_info <- DB_info[DB_info$Chr==chr,] 61 | 62 | ## open GDS 63 | genofile <- seqOpen(gds.file) 64 | 65 | genofile 66 | 67 | CHR <- as.numeric(seqGetData(genofile, "chromosome")) 68 | position <- as.integer(seqGetData(genofile, "position")) 69 | REF <- as.character(seqGetData(genofile, "$ref")) 70 | ALT <- as.character(seqGetData(genofile, "$alt")) 71 | 72 | VarInfo_genome <- paste0(CHR,"-",position,"-",REF,"-",ALT) 73 | 74 | seqClose(genofile) 75 | 76 | ## Generate VarInfo 77 | for(kk in 1:dim(DB_info)[1]) 78 | { 79 | print(kk) 80 | 81 | VarInfo <- VarInfo_genome[(position>=DB_info$Start_Pos[kk])&(position<=DB_info$End_Pos[kk])] 82 | VarInfo <- data.frame(VarInfo) 83 | 84 | write.csv(VarInfo,paste0(output_path,"chr",chr,"/VarInfo_chr",chr,"_",kk,".csv"),quote=FALSE,row.names = FALSE) 85 | } 86 | 87 | ########################################################################## 88 | ### Step 2 (Annotate) 
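## Step 2 shells out to xsv to left-join the per-chunk variant list (key column
## "VarInfo") against the matching FAVOR database chunk (key column "variant_vcf").
## For orientation, a sketch of the command string the loop below builds; chr = 19
## and kk = 1 are example values only, and xsv, output_path and DB_path are the
## variables defined elsewhere in this script:
# system(paste0(xsv, " join --left VarInfo ",
#               output_path, "chr19/VarInfo_chr19_1.csv variant_vcf ",
#               DB_path, "/chr19_1.csv > ",
#               output_path, "chr19/Anno_chr19_1.csv"))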
89 | ########################################################################## 90 | 91 | ### xsv directory 92 | xsv <- "/root/.cargo/bin/xsv" 93 | 94 | ### DB file 95 | DB_path <- "n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/" 96 | 97 | ### anno channel (subset) 98 | anno_colnum <- c(1,8:12,15,16,19,23,25:36) 99 | 100 | chr_splitnum <- sum(DB_info$Chr==chr) 101 | 102 | for(kk in 1:chr_splitnum) 103 | { 104 | print(kk) 105 | 106 | system(paste0(xsv," join --left VarInfo ",output_path,"chr",chr,"/VarInfo_chr",chr,"_",kk,".csv variant_vcf ",DB_path,"/chr",chr,"_",kk,".csv > ",output_path,"chr",chr,"/Anno_chr",chr,"_",kk,".csv")) 107 | } 108 | 109 | ## merge info 110 | Anno <- paste0(output_path,"chr",chr,"/Anno_chr",chr,"_",seq(1:chr_splitnum),".csv ") 111 | merge_command <- paste0(xsv," cat rows ",Anno[1]) 112 | 113 | for(kk in 2:chr_splitnum) 114 | { 115 | merge_command <- paste0(merge_command,Anno[kk]) 116 | } 117 | 118 | merge_command <- paste0(merge_command,"> ",output_path,"chr",chr,"/Anno_chr",chr,".csv") 119 | 120 | system(merge_command) 121 | 122 | ## subset 123 | anno_colnum_xsv <- c() 124 | for(kk in 1:(length(anno_colnum)-1)) 125 | { 126 | anno_colnum_xsv <- paste0(anno_colnum_xsv,anno_colnum[kk],",") 127 | } 128 | anno_colnum_xsv <- paste0(anno_colnum_xsv,anno_colnum[length(anno_colnum)]) 129 | 130 | system(paste0(xsv," select ",anno_colnum_xsv," ",output_path,"chr",chr,"/Anno_chr",chr,".csv > ",output_path,"chr",chr,"/Anno_chr",chr,"_STAARpipeline.csv")) 131 | 132 | ########################################################################## 133 | ### Step 3 (gds2agds) 134 | ########################################################################## 135 | 136 | ### annotation file 137 | dir_anno <- "/root/" 138 | anno_file_name_1 <- "Anno_chr" 139 | anno_file_name_2 <- "_STAARpipeline.csv" 140 | 141 | ### load required package 142 | library(gdsfmt) 143 | library(SeqArray) 144 | library(SeqVarTools) 145 | library(readr) 146 | 147 | ### read annotation data 148 | FunctionalAnnotation <- read_csv(paste0(dir_anno,"chr",chr,"/",anno_file_name_1,chr,anno_file_name_2), 149 | col_types=list(col_character(),col_double(),col_double(),col_double(),col_double(), 150 | col_double(),col_double(),col_double(),col_double(),col_double(), 151 | col_character(),col_character(),col_character(),col_double(),col_character(), 152 | col_character(),col_character(),col_character(),col_character(),col_double(), 153 | col_double(),col_character())) 154 | 155 | dim(FunctionalAnnotation) 156 | 157 | ## rename colnames 158 | colnames(FunctionalAnnotation)[2] <- "apc_conservation" 159 | colnames(FunctionalAnnotation)[7] <- "apc_local_nucleotide_diversity" 160 | colnames(FunctionalAnnotation)[9] <- "apc_protein_function" 161 | 162 | ## open GDS 163 | genofile <- seqOpen(gds.file, readonly = FALSE) 164 | 165 | #Anno.folder <- addfolder.gdsn(index.gdsn(genofile, "annotation/info"), "FunctionalAnnotation") 166 | Anno.folder <- index.gdsn(genofile, "annotation/info") 167 | if(use_compression == "YES") 168 | { 169 | add.gdsn(Anno.folder, "FunctionalAnnotation", val=FunctionalAnnotation, compress="LZMA_ra", closezip=TRUE) 170 | }else 171 | { 172 | add.gdsn(Anno.folder, "FunctionalAnnotation", val=FunctionalAnnotation) 173 | } 174 | 175 | seqClose(genofile) 176 | 177 | system(paste0("mv ", gds.file, " ", outfile, ".gds")) 178 | 179 | 180 | 181 | print(args) -------------------------------------------------------------------------------- /Scripts/Cloud/DNAnexus/code.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # favorannotator_v1.0.0 3 | # Generated by dx-app-wizard. 4 | # 5 | # Basic execution pattern: Your app will run on a single machine from 6 | # beginning to end. 7 | # 8 | # Your job's input variables (if any) will be loaded as environment 9 | # variables before this script runs. Any array inputs will be loaded 10 | # as bash arrays. 11 | # 12 | # Any code outside of main() (or any entry point you may add) is 13 | # ALWAYS executed, followed by running the entry point itself. 14 | # 15 | # See https://documentation.dnanexus.com/developer for tutorials on how 16 | # to modify this file. 17 | 18 | main() { 19 | 20 | echo "Value of outfile: '$outfile'" 21 | echo "Value of gds_file: '$gds_file'" 22 | echo "Value of chromosome: '$chromosome'" 23 | echo "Value of use_compression: '$use_compression'" 24 | 25 | # The following line(s) use the dx command-line tool to download your file 26 | # inputs to the local file system using variable names for the filenames. To 27 | # recover the original filenames, you can use the output of "dx describe 28 | # "$variable" --name". 29 | 30 | if [ -n "$gds_file" ] 31 | then 32 | dx download "$gds_file" -o gds_file.gds & 33 | gds_file2="gds_file.gds" 34 | else 35 | gds_file2="NO_GDS_FILE" 36 | fi 37 | 38 | echo "Installing xsv" 39 | curl https://sh.rustup.rs -sSf | sh -s -- -y 40 | source $HOME/.cargo/env 41 | cargo install xsv 42 | 43 | echo "Rscript --vanilla favorannotator.R $outfile $gds_file2 $chromosome $use_compression" 44 | dx-docker run -v /home/dnanexus/:/home/dnanexus/ -w /home/dnanexus/ zilinli/staarpipeline:0.9.6 Rscript --vanilla favorannotator.R $outfile $gds_file2 $chromosome $use_compression 45 | mkdir -p out/results 46 | mv ${outfile}.gds out/results 47 | dx-upload-all-outputs 48 | } 49 | 50 | -------------------------------------------------------------------------------- /Scripts/Cloud/DNAnexus/favorannotator.R: -------------------------------------------------------------------------------- 1 | args <- commandArgs(TRUE) 2 | ### mandatory 3 | outfile <- args[1] 4 | gds.file <- args[2] 5 | #chr <- as.numeric(args[3]) 6 | chr<-19 7 | use_compression <- args[4] 8 | 9 | ########################################################################## 10 | ### Step 0 (Download FAVOR Database) 11 | ########################################################################## 12 | URLs <- data.frame(chr = c(1:22), 13 | URL = c("https://dataverse.harvard.edu/api/access/datafile/6170506", 14 | "https://dataverse.harvard.edu/api/access/datafile/6170501", 15 | "https://dataverse.harvard.edu/api/access/datafile/6170502", 16 | "https://dataverse.harvard.edu/api/access/datafile/6170521", 17 | "https://dataverse.harvard.edu/api/access/datafile/6170511", 18 | "https://dataverse.harvard.edu/api/access/datafile/6170516", 19 | "https://dataverse.harvard.edu/api/access/datafile/6170505", 20 | "https://dataverse.harvard.edu/api/access/datafile/6170513", 21 | "https://dataverse.harvard.edu/api/access/datafile/6165867", 22 | "https://dataverse.harvard.edu/api/access/datafile/6170507", 23 | "https://dataverse.harvard.edu/api/access/datafile/6170517", 24 | "https://dataverse.harvard.edu/api/access/datafile/6170520", 25 | "https://dataverse.harvard.edu/api/access/datafile/6170503", 26 | "https://dataverse.harvard.edu/api/access/datafile/6170509", 27 | "https://dataverse.harvard.edu/api/access/datafile/6170515", 28 | "https://dataverse.harvard.edu/api/access/datafile/6170518", 29 
| "https://dataverse.harvard.edu/api/access/datafile/6170510", 30 | "https://dataverse.harvard.edu/api/access/datafile/6170508", 31 | "https://dataverse.harvard.edu/api/access/datafile/6170514", 32 | "https://dataverse.harvard.edu/api/access/datafile/6170512", 33 | "https://dataverse.harvard.edu/api/access/datafile/6170519", 34 | "https://dataverse.harvard.edu/api/access/datafile/6170504")) 35 | 36 | URL <- URLs[chr, "URL"] 37 | system(paste0("wget --progress=bar:force:noscroll ", URLs[chr, "URL"])) 38 | system(paste0("tar -xvf ", gsub(".*?([0-9]+).*", "\\1", URL))) 39 | 40 | ########################################################################## 41 | ### Step 1 (Varinfo_gds) 42 | ########################################################################## 43 | 44 | ### output 45 | output_path <- "./" 46 | 47 | ### make directory 48 | system(paste0("mkdir ",output_path,"chr",chr)) 49 | 50 | ### R package 51 | library(gdsfmt) 52 | library(SeqArray) 53 | library(SeqVarTools) 54 | 55 | ### chromosome number 56 | ## read info 57 | DB_info <- read.csv(url("https://raw.githubusercontent.com/xihaoli/STAARpipeline-Tutorial/main/FAVORannotator_csv/FAVORdatabase_chrsplit.csv"),header=TRUE) 58 | DB_info <- DB_info[DB_info$Chr==chr,] 59 | 60 | ## open GDS 61 | genofile <- seqOpen(gds.file) 62 | 63 | CHR <- as.numeric(seqGetData(genofile, "chromosome")) 64 | position <- as.integer(seqGetData(genofile, "position")) 65 | REF <- as.character(seqGetData(genofile, "$ref")) 66 | ALT <- as.character(seqGetData(genofile, "$alt")) 67 | 68 | VarInfo_genome <- paste0(CHR,"-",position,"-",REF,"-",ALT) 69 | 70 | seqClose(genofile) 71 | 72 | ## Generate VarInfo 73 | for(kk in 1:dim(DB_info)[1]) 74 | { 75 | print(kk) 76 | 77 | VarInfo <- VarInfo_genome[(position>=DB_info$Start_Pos[kk])&(position<=DB_info$End_Pos[kk])] 78 | VarInfo <- data.frame(VarInfo) 79 | 80 | write.csv(VarInfo,paste0(output_path,"chr",chr,"/VarInfo_chr",chr,"_",kk,".csv"),quote=FALSE,row.names = FALSE) 81 | } 82 | 83 | ########################################################################## 84 | ### Step 2 (Annotate) 85 | ########################################################################## 86 | 87 | ### xsv directory 88 | xsv <- ".cargo/bin/xsv" 89 | 90 | ### DB file 91 | DB_path <- "n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/" 92 | 93 | ### anno channel (subset) 94 | anno_colnum <- c(1,8:12,15,16,19,23,25:36) 95 | 96 | chr_splitnum <- sum(DB_info$Chr==chr) 97 | 98 | for(kk in 1:chr_splitnum) 99 | { 100 | print(kk) 101 | 102 | system(paste0(xsv," join --left VarInfo ",output_path,"chr",chr,"/VarInfo_chr",chr,"_",kk,".csv variant_vcf ",DB_path,"/chr",chr,"_",kk,".csv > ",output_path,"chr",chr,"/Anno_chr",chr,"_",kk,".csv")) 103 | } 104 | 105 | ## merge info 106 | Anno <- paste0(output_path,"chr",chr,"/Anno_chr",chr,"_",seq(1:chr_splitnum),".csv ") 107 | merge_command <- paste0(xsv," cat rows ",Anno[1]) 108 | 109 | for(kk in 2:chr_splitnum) 110 | { 111 | merge_command <- paste0(merge_command,Anno[kk]) 112 | } 113 | 114 | merge_command <- paste0(merge_command,"> ",output_path,"chr",chr,"/Anno_chr",chr,".csv") 115 | 116 | system(merge_command) 117 | 118 | ## subset 119 | anno_colnum_xsv <- c() 120 | for(kk in 1:(length(anno_colnum)-1)) 121 | { 122 | anno_colnum_xsv <- paste0(anno_colnum_xsv,anno_colnum[kk],",") 123 | } 124 | anno_colnum_xsv <- paste0(anno_colnum_xsv,anno_colnum[length(anno_colnum)]) 125 | 126 | system(paste0(xsv," select ",anno_colnum_xsv," ",output_path,"chr",chr,"/Anno_chr",chr,".csv > 
",output_path,"chr",chr,"/Anno_chr",chr,"_STAARpipeline.csv")) 127 | 128 | ########################################################################## 129 | ### Step 3 (gds2agds) 130 | ########################################################################## 131 | 132 | ### annotation file 133 | dir_anno <- "" 134 | anno_file_name_1 <- "Anno_chr" 135 | anno_file_name_2 <- "_STAARpipeline.csv" 136 | 137 | ### load required package 138 | library(gdsfmt) 139 | library(SeqArray) 140 | library(SeqVarTools) 141 | library(readr) 142 | 143 | ### read annotation data 144 | FunctionalAnnotation <- read_csv(paste0(dir_anno,"chr",chr,"/",anno_file_name_1,chr,anno_file_name_2), 145 | col_types=list(col_character(),col_double(),col_double(),col_double(),col_double(), 146 | col_double(),col_double(),col_double(),col_double(),col_double(), 147 | col_character(),col_character(),col_character(),col_double(),col_character(), 148 | col_character(),col_character(),col_character(),col_character(),col_double(), 149 | col_double(),col_character())) 150 | 151 | dim(FunctionalAnnotation) 152 | 153 | ## rename colnames 154 | colnames(FunctionalAnnotation)[2] <- "apc_conservation" 155 | colnames(FunctionalAnnotation)[7] <- "apc_local_nucleotide_diversity" 156 | colnames(FunctionalAnnotation)[9] <- "apc_protein_function" 157 | 158 | ## open GDS 159 | genofile <- seqOpen(gds.file, readonly = FALSE) 160 | 161 | #Anno.folder <- addfolder.gdsn(index.gdsn(genofile, "annotation/info"), "FunctionalAnnotation") 162 | Anno.folder <- index.gdsn(genofile, "annotation/info") 163 | if(use_compression == "YES") 164 | { 165 | add.gdsn(Anno.folder, "FunctionalAnnotation", val=FunctionalAnnotation, compress="LZMA_ra", closezip=TRUE) 166 | }else 167 | { 168 | add.gdsn(Anno.folder, "FunctionalAnnotation", val=FunctionalAnnotation) 169 | } 170 | 171 | seqClose(genofile) 172 | 173 | system(paste0("mv ", gds.file, " ", outfile, ".gds")) 174 | 175 | -------------------------------------------------------------------------------- /Scripts/Cloud/Terra/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Scripts/Cloud/Terra/.DS_Store -------------------------------------------------------------------------------- /Scripts/Cloud/Terra/.Rhistory: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/ea7c9aea887b8f8b9c8e9290c062ba29c572b15f/Scripts/Cloud/Terra/.Rhistory -------------------------------------------------------------------------------- /Scripts/Cloud/Terra/FAVORannotatorEssentialDB.wdl: -------------------------------------------------------------------------------- 1 | workflow FAVORannotator{ 2 | 3 | File InputaGDS 4 | Int CHRN 5 | 6 | call FunctionalAnnotation { 7 | input: 8 | InputaGDS=InputaGDS, CHRN=CHRN 9 | 10 | } 11 | 12 | } 13 | 14 | task FunctionalAnnotation{ 15 | 16 | Int CHRN 17 | File InputaGDS 18 | File? 
FAVORannotator = "gs://fc-secure-38f900cb-e5ed-481d-b866-6c98b7e5e7ea/FAVORannotatorTerra.R" 19 | 20 | runtime{ 21 | docker: "zilinli/staarpipeline:0.9.6" 22 | memory: "36G" 23 | cpu: "1" 24 | zones: "us-central1-c us-central1-b" 25 | disks: "local-disk " + 500 + " HDD" 26 | preemptible: 1 27 | } 28 | 29 | command { 30 | curl https://sh.rustup.rs -sSf | sh -s -- -y 31 | source $HOME/.cargo/env 32 | cargo install xsv 33 | echo ${InputaGDS} 34 | echo ${CHRN} 35 | df -a -h 36 | Rscript ${FAVORannotator} ${InputaGDS} ${CHRN} 37 | echo "Finished: in wdl r scripts" 38 | df -a -h 39 | mv ${InputaGDS} AnnotatedOutput.${CHRN}.agds 40 | } 41 | 42 | output { 43 | File OutputResults = "AnnotatedOutput.${CHRN}.agds" 44 | } 45 | } 46 | 47 | -------------------------------------------------------------------------------- /Scripts/Cloud/Terra/FAVORannotatorFullDB.wdl: -------------------------------------------------------------------------------- 1 | workflow FAVORannotator{ 2 | 3 | File InputaGDS 4 | Int CHRN 5 | 6 | call FunctionalAnnotation { 7 | input: 8 | InputaGDS=InputaGDS, CHRN=CHRN 9 | } 10 | 11 | } 12 | 13 | task FunctionalAnnotation{ 14 | 15 | Int CHRN 16 | File InputaGDS 17 | File? FAVORannotator = "gs://fc-secure-38f900cb-e5ed-481d-b866-6c98b7e5e7ea/FAVORannotatorTerraFullDB.R" 18 | 19 | runtime{ 20 | docker: "zilinli/staarpipeline:0.9.6" 21 | memory: "56G" 22 | cpu: "1" 23 | zones: "us-central1-c us-central1-b" 24 | disks: "local-disk " + 500 + " HDD" 25 | preemptible: 1 26 | } 27 | 28 | command { 29 | curl https://sh.rustup.rs -sSf | sh -s -- -y 30 | source $HOME/.cargo/env 31 | cargo install xsv 32 | echo ${InputaGDS} 33 | echo ${CHRN} 34 | df -a -h 35 | Rscript ${FAVORannotator} ${InputaGDS} ${CHRN} 36 | echo "Finished: in wdl r scripts" 37 | df -a -h 38 | mv ${InputaGDS} AnnotatedOutput.${CHRN}.agds 39 | } 40 | 41 | output { 42 | File OutputResults = "AnnotatedOutput.${CHRN}.agds" 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /Scripts/Cloud/Terra/FAVORannotatorTerra.r: -------------------------------------------------------------------------------- 1 | args <- commandArgs(TRUE) 2 | ### mandatory 3 | 4 | gds.file <- args[1] 5 | print(paste0("gds.file: ",gds.file)) 6 | 7 | #outfile <- args[2] 8 | #print(paste0("outfile: ",outfile)) 9 | 10 | chr <- as.numeric(args[2]) 11 | print(paste0("chr: ",chr)) 12 | #chr<-19 13 | 14 | #use_compression <- args[4] 15 | use_compression <- "Yes" 16 | print(paste0("use_compression: ",use_compression)) 17 | 18 | ########################################################################## 19 | ### Step 0 (Download FAVOR Database) 20 | ########################################################################## 21 | URLs <- data.frame(chr = c(1:22), 22 | URL = c("https://dataverse.harvard.edu/api/access/datafile/6170506", 23 | "https://dataverse.harvard.edu/api/access/datafile/6170501", 24 | "https://dataverse.harvard.edu/api/access/datafile/6170502", 25 | "https://dataverse.harvard.edu/api/access/datafile/6170521", 26 | "https://dataverse.harvard.edu/api/access/datafile/6170511", 27 | "https://dataverse.harvard.edu/api/access/datafile/6170516", 28 | "https://dataverse.harvard.edu/api/access/datafile/6170505", 29 | "https://dataverse.harvard.edu/api/access/datafile/6170513", 30 | "https://dataverse.harvard.edu/api/access/datafile/6165867", 31 | "https://dataverse.harvard.edu/api/access/datafile/6170507", 32 | "https://dataverse.harvard.edu/api/access/datafile/6170517", 33 | 
"https://dataverse.harvard.edu/api/access/datafile/6170520", 34 | "https://dataverse.harvard.edu/api/access/datafile/6170503", 35 | "https://dataverse.harvard.edu/api/access/datafile/6170509", 36 | "https://dataverse.harvard.edu/api/access/datafile/6170515", 37 | "https://dataverse.harvard.edu/api/access/datafile/6170518", 38 | "https://dataverse.harvard.edu/api/access/datafile/6170510", 39 | "https://dataverse.harvard.edu/api/access/datafile/6170508", 40 | "https://dataverse.harvard.edu/api/access/datafile/6170514", 41 | "https://dataverse.harvard.edu/api/access/datafile/6170512", 42 | "https://dataverse.harvard.edu/api/access/datafile/6170519", 43 | "https://dataverse.harvard.edu/api/access/datafile/6170504")) 44 | 45 | URL <- URLs[chr, "URL"] 46 | system(paste0("wget --progress=bar:force:noscroll ", URLs[chr, "URL"])) 47 | system(paste0("tar -xvf ", gsub(".*?([0-9]+).*", "\\1", URL))) 48 | 49 | ########################################################################## 50 | ### Step 1 (Varinfo_gds) 51 | ########################################################################## 52 | 53 | ### output 54 | output_path <- "/cromwell_root/./" 55 | #output_path <- "/root/./" 56 | 57 | ### make directory 58 | system(paste0("mkdir ",output_path,"chr",chr)) 59 | 60 | ### R package 61 | library(gdsfmt) 62 | library(SeqArray) 63 | library(SeqVarTools) 64 | 65 | ### chromosome number 66 | ## read info 67 | DB_info <- read.csv(url("https://raw.githubusercontent.com/xihaoli/STAARpipeline-Tutorial/main/FAVORannotator_csv/FAVORdatabase_chrsplit.csv"),header=TRUE) 68 | DB_info <- DB_info[DB_info$Chr==chr,] 69 | 70 | ## open GDS 71 | genofile <- seqOpen(gds.file) 72 | 73 | genofile 74 | 75 | CHR <- as.numeric(seqGetData(genofile, "chromosome")) 76 | position <- as.integer(seqGetData(genofile, "position")) 77 | REF <- as.character(seqGetData(genofile, "$ref")) 78 | ALT <- as.character(seqGetData(genofile, "$alt")) 79 | 80 | VarInfo_genome <- paste0(CHR,"-",position,"-",REF,"-",ALT) 81 | 82 | seqClose(genofile) 83 | 84 | ## Generate VarInfo 85 | for(kk in 1:dim(DB_info)[1]) 86 | { 87 | print(kk) 88 | 89 | VarInfo <- VarInfo_genome[(position>=DB_info$Start_Pos[kk])&(position<=DB_info$End_Pos[kk])] 90 | VarInfo <- data.frame(VarInfo) 91 | 92 | write.csv(VarInfo,paste0(output_path,"chr",chr,"/VarInfo_chr",chr,"_",kk,".csv"),quote=FALSE,row.names = FALSE) 93 | } 94 | 95 | ########################################################################## 96 | ### Step 2 (Annotate) 97 | ########################################################################## 98 | 99 | ### xsv directory 100 | #xsv <- "/cromwell_root/.cargo/bin/xsv" 101 | xsv <- "/root/.cargo/bin/xsv" 102 | 103 | ### DB file 104 | DB_path <- "n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/" 105 | 106 | ### anno channel (subset) 107 | anno_colnum <- c(1,8:12,15,16,19,23,25:36) 108 | 109 | chr_splitnum <- sum(DB_info$Chr==chr) 110 | 111 | for(kk in 1:chr_splitnum) 112 | { 113 | print(kk) 114 | 115 | system(paste0(xsv," join --left VarInfo ",output_path,"chr",chr,"/VarInfo_chr",chr,"_",kk,".csv variant_vcf ",DB_path,"/chr",chr,"_",kk,".csv > ",output_path,"chr",chr,"/Anno_chr",chr,"_",kk,".csv")) 116 | } 117 | 118 | ## merge info 119 | Anno <- paste0(output_path,"chr",chr,"/Anno_chr",chr,"_",seq(1:chr_splitnum),".csv ") 120 | merge_command <- paste0(xsv," cat rows ",Anno[1]) 121 | 122 | for(kk in 2:chr_splitnum) 123 | { 124 | merge_command <- paste0(merge_command,Anno[kk]) 125 | } 126 | 127 | merge_command <- paste0(merge_command,"> 
",output_path,"chr",chr,"/Anno_chr",chr,".csv") 128 | 129 | system(merge_command) 130 | 131 | ## subset 132 | anno_colnum_xsv <- c() 133 | for(kk in 1:(length(anno_colnum)-1)) 134 | { 135 | anno_colnum_xsv <- paste0(anno_colnum_xsv,anno_colnum[kk],",") 136 | } 137 | anno_colnum_xsv <- paste0(anno_colnum_xsv,anno_colnum[length(anno_colnum)]) 138 | 139 | system(paste0(xsv," select ",anno_colnum_xsv," ",output_path,"chr",chr,"/Anno_chr",chr,".csv > ",output_path,"chr",chr,"/Anno_chr",chr,"_STAARpipeline.csv")) 140 | 141 | ########################################################################## 142 | ### Step 3 (gds2agds) 143 | ########################################################################## 144 | 145 | ### annotation file 146 | dir_anno <- "/cromwell_root/" 147 | #dir_anno <- "/root/" 148 | anno_file_name_1 <- "Anno_chr" 149 | anno_file_name_2 <- "_STAARpipeline.csv" 150 | 151 | ### load required package 152 | library(gdsfmt) 153 | library(SeqArray) 154 | library(SeqVarTools) 155 | library(readr) 156 | 157 | ### read annotation data 158 | FunctionalAnnotation <- read_csv(paste0(dir_anno,"chr",chr,"/",anno_file_name_1,chr,anno_file_name_2), 159 | col_types=list(col_character(),col_double(),col_double(),col_double(),col_double(), 160 | col_double(),col_double(),col_double(),col_double(),col_double(), 161 | col_character(),col_character(),col_character(),col_double(),col_character(), 162 | col_character(),col_character(),col_character(),col_character(),col_double(), 163 | col_double(),col_character())) 164 | 165 | dim(FunctionalAnnotation) 166 | 167 | ## rename colnames 168 | colnames(FunctionalAnnotation)[2] <- "apc_conservation" 169 | colnames(FunctionalAnnotation)[7] <- "apc_local_nucleotide_diversity" 170 | colnames(FunctionalAnnotation)[9] <- "apc_protein_function" 171 | 172 | ## open GDS 173 | genofile <- seqOpen(gds.file, readonly = FALSE) 174 | 175 | #Anno.folder <- addfolder.gdsn(index.gdsn(genofile, "annotation/info"), "FunctionalAnnotationTest1") 176 | Anno.folder <- index.gdsn(genofile, "annotation/info") 177 | if(use_compression == "YES") 178 | { 179 | add.gdsn(Anno.folder, "FunctionalAnnotationJun1st2022", val=FunctionalAnnotation, compress="LZMA_ra", closezip=TRUE) 180 | }else 181 | { 182 | add.gdsn(Anno.folder, "FunctionalAnnotationJun1st2022", val=FunctionalAnnotation) 183 | } 184 | genofile 185 | 186 | seqClose(genofile) 187 | 188 | #system(paste0("mv ", gds.file, " ", outfile, ".gds")) 189 | 190 | -------------------------------------------------------------------------------- /Scripts/Cloud/Terra/FAVORannotatorTerraEssentialDB.R: -------------------------------------------------------------------------------- 1 | args <- commandArgs(TRUE) 2 | ### mandatory 3 | 4 | gds.file <- args[1] 5 | print(paste0("gds.file: ",gds.file)) 6 | 7 | #outfile <- args[2] 8 | #print(paste0("outfile: ",outfile)) 9 | 10 | chr <- as.numeric(args[2]) 11 | print(paste0("chr: ",chr)) 12 | #chr<-19 13 | 14 | #use_compression <- args[4] 15 | use_compression <- "Yes" 16 | print(paste0("use_compression: ",use_compression)) 17 | 18 | ########################################################################## 19 | ### Step 0 (Download FAVOR Database) 20 | ########################################################################## 21 | URLs <- data.frame(chr = c(1:22), 22 | URL = c("https://dataverse.harvard.edu/api/access/datafile/6170506", 23 | "https://dataverse.harvard.edu/api/access/datafile/6170501", 24 | "https://dataverse.harvard.edu/api/access/datafile/6170502", 25 | 
"https://dataverse.harvard.edu/api/access/datafile/6170521", 26 | "https://dataverse.harvard.edu/api/access/datafile/6170511", 27 | "https://dataverse.harvard.edu/api/access/datafile/6170516", 28 | "https://dataverse.harvard.edu/api/access/datafile/6170505", 29 | "https://dataverse.harvard.edu/api/access/datafile/6170513", 30 | "https://dataverse.harvard.edu/api/access/datafile/6165867", 31 | "https://dataverse.harvard.edu/api/access/datafile/6170507", 32 | "https://dataverse.harvard.edu/api/access/datafile/6170517", 33 | "https://dataverse.harvard.edu/api/access/datafile/6170520", 34 | "https://dataverse.harvard.edu/api/access/datafile/6170503", 35 | "https://dataverse.harvard.edu/api/access/datafile/6170509", 36 | "https://dataverse.harvard.edu/api/access/datafile/6170515", 37 | "https://dataverse.harvard.edu/api/access/datafile/6170518", 38 | "https://dataverse.harvard.edu/api/access/datafile/6170510", 39 | "https://dataverse.harvard.edu/api/access/datafile/6170508", 40 | "https://dataverse.harvard.edu/api/access/datafile/6170514", 41 | "https://dataverse.harvard.edu/api/access/datafile/6170512", 42 | "https://dataverse.harvard.edu/api/access/datafile/6170519", 43 | "https://dataverse.harvard.edu/api/access/datafile/6170504")) 44 | 45 | URL <- URLs[chr, "URL"] 46 | system(paste0("wget --progress=bar:force:noscroll ", URLs[chr, "URL"])) 47 | system(paste0("tar -xvf ", gsub(".*?([0-9]+).*", "\\1", URL))) 48 | 49 | ########################################################################## 50 | ### Step 1 (Varinfo_gds) 51 | ########################################################################## 52 | 53 | ### output 54 | output_path <- "/root/./" 55 | 56 | ### make directory 57 | system(paste0("mkdir ",output_path,"chr",chr)) 58 | 59 | ### R package 60 | library(gdsfmt) 61 | library(SeqArray) 62 | library(SeqVarTools) 63 | 64 | ### chromosome number 65 | ## read info 66 | DB_info <- read.csv(url("https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/main/Scripts/SQL/FAVORdatabase_chrsplit.csv"),header=TRUE) 67 | DB_info <- DB_info[DB_info$Chr==chr,] 68 | 69 | ## open GDS 70 | genofile <- seqOpen(gds.file) 71 | 72 | genofile 73 | 74 | CHR <- as.numeric(seqGetData(genofile, "chromosome")) 75 | position <- as.integer(seqGetData(genofile, "position")) 76 | REF <- as.character(seqGetData(genofile, "$ref")) 77 | ALT <- as.character(seqGetData(genofile, "$alt")) 78 | 79 | VarInfo_genome <- paste0(CHR,"-",position,"-",REF,"-",ALT) 80 | 81 | seqClose(genofile) 82 | 83 | ## Generate VarInfo 84 | for(kk in 1:dim(DB_info)[1]) 85 | { 86 | print(kk) 87 | 88 | VarInfo <- VarInfo_genome[(position>=DB_info$Start_Pos[kk])&(position<=DB_info$End_Pos[kk])] 89 | VarInfo <- data.frame(VarInfo) 90 | 91 | write.csv(VarInfo,paste0(output_path,"chr",chr,"/VarInfo_chr",chr,"_",kk,".csv"),quote=FALSE,row.names = FALSE) 92 | } 93 | 94 | ########################################################################## 95 | ### Step 2 (Annotate) 96 | ########################################################################## 97 | 98 | ### xsv directory 99 | xsv <- "/root/.cargo/bin/xsv" 100 | 101 | ### DB file 102 | DB_path <- "n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/" 103 | 104 | ### anno channel (subset) 105 | anno_colnum <- c(1,8:12,15,16,19,23,25:36) 106 | 107 | chr_splitnum <- sum(DB_info$Chr==chr) 108 | 109 | for(kk in 1:chr_splitnum) 110 | { 111 | print(kk) 112 | 113 | system(paste0(xsv," join --left VarInfo ",output_path,"chr",chr,"/VarInfo_chr",chr,"_",kk,".csv variant_vcf 
",DB_path,"/chr",chr,"_",kk,".csv > ",output_path,"chr",chr,"/Anno_chr",chr,"_",kk,".csv")) 114 | } 115 | 116 | ## merge info 117 | Anno <- paste0(output_path,"chr",chr,"/Anno_chr",chr,"_",seq(1:chr_splitnum),".csv ") 118 | merge_command <- paste0(xsv," cat rows ",Anno[1]) 119 | 120 | for(kk in 2:chr_splitnum) 121 | { 122 | merge_command <- paste0(merge_command,Anno[kk]) 123 | } 124 | 125 | merge_command <- paste0(merge_command,"> ",output_path,"chr",chr,"/Anno_chr",chr,".csv") 126 | 127 | system(merge_command) 128 | 129 | ## subset 130 | anno_colnum_xsv <- c() 131 | for(kk in 1:(length(anno_colnum)-1)) 132 | { 133 | anno_colnum_xsv <- paste0(anno_colnum_xsv,anno_colnum[kk],",") 134 | } 135 | anno_colnum_xsv <- paste0(anno_colnum_xsv,anno_colnum[length(anno_colnum)]) 136 | 137 | system(paste0(xsv," select ",anno_colnum_xsv," ",output_path,"chr",chr,"/Anno_chr",chr,".csv > ",output_path,"chr",chr,"/Anno_chr",chr,"_STAARpipeline.csv")) 138 | 139 | ########################################################################## 140 | ### Step 3 (gds2agds) 141 | ########################################################################## 142 | 143 | ### annotation file 144 | dir_anno <- "/root/" 145 | anno_file_name_1 <- "Anno_chr" 146 | anno_file_name_2 <- "_STAARpipeline.csv" 147 | 148 | ### load required package 149 | library(gdsfmt) 150 | library(SeqArray) 151 | library(SeqVarTools) 152 | library(readr) 153 | 154 | ### read annotation data 155 | FunctionalAnnotation <- read_csv(paste0(dir_anno,"chr",chr,"/",anno_file_name_1,chr,anno_file_name_2), 156 | col_types=list(col_character(),col_double(),col_double(),col_double(),col_double(), 157 | col_double(),col_double(),col_double(),col_double(),col_double(), 158 | col_character(),col_character(),col_character(),col_double(),col_character(), 159 | col_character(),col_character(),col_character(),col_character(),col_double(), 160 | col_double(),col_character())) 161 | 162 | dim(FunctionalAnnotation) 163 | 164 | ## rename colnames 165 | colnames(FunctionalAnnotation)[2] <- "apc_conservation" 166 | colnames(FunctionalAnnotation)[7] <- "apc_local_nucleotide_diversity" 167 | colnames(FunctionalAnnotation)[9] <- "apc_protein_function" 168 | 169 | ## open GDS 170 | genofile <- seqOpen(gds.file, readonly = FALSE) 171 | 172 | #Anno.folder <- addfolder.gdsn(index.gdsn(genofile, "annotation/info"), "FunctionalAnnotationTest1") 173 | Anno.folder <- index.gdsn(genofile, "annotation/info") 174 | if(use_compression == "YES") 175 | { 176 | add.gdsn(Anno.folder, "FunctionalAnnotationJun1st2022", val=FunctionalAnnotation, compress="LZMA_ra", closezip=TRUE) 177 | }else 178 | { 179 | add.gdsn(Anno.folder, "FunctionalAnnotationJun1st2022", val=FunctionalAnnotation) 180 | } 181 | genofile 182 | 183 | seqClose(genofile) 184 | 185 | #system(paste0("mv ", gds.file, " ", outfile, ".gds")) 186 | 187 | -------------------------------------------------------------------------------- /Scripts/Cloud/Terra/FAVORannotatorTerraFullDB.R: -------------------------------------------------------------------------------- 1 | args <- commandArgs(TRUE) 2 | ### mandatory 3 | 4 | gds.file <- args[1] 5 | print(paste0("gds.file: ",gds.file)) 6 | 7 | #outfile <- args[2] 8 | #print(paste0("outfile: ",outfile)) 9 | 10 | chr <- as.numeric(args[2]) 11 | print(paste0("chr: ",chr)) 12 | #chr<-19 13 | 14 | #use_compression <- args[4] 15 | use_compression <- "Yes" 16 | print(paste0("use_compression: ",use_compression)) 17 | 18 | ########################################################################## 19 
| ### Step 0 (Download FAVOR Database) 20 | ########################################################################## 21 | URLs <- data.frame(chr = c(1:22), 22 | URL = c("https://dataverse.harvard.edu/api/access/datafile/6380374", #1 23 | "https://dataverse.harvard.edu/api/access/datafile/6380471", #2 24 | "https://dataverse.harvard.edu/api/access/datafile/6380732", #3 25 | "https://dataverse.harvard.edu/api/access/datafile/6381512", #4 26 | "https://dataverse.harvard.edu/api/access/datafile/6381457", #5 27 | "https://dataverse.harvard.edu/api/access/datafile/6381327", #6 28 | "https://dataverse.harvard.edu/api/access/datafile/6384125", #7 29 | "https://dataverse.harvard.edu/api/access/datafile/6382573", #8 30 | "https://dataverse.harvard.edu/api/access/datafile/6384268", #9 31 | "https://dataverse.harvard.edu/api/access/datafile/6380273", #10 32 | "https://dataverse.harvard.edu/api/access/datafile/6384154", #11 33 | "https://dataverse.harvard.edu/api/access/datafile/6384198", #12 34 | "https://dataverse.harvard.edu/api/access/datafile/6388366", #13 35 | "https://dataverse.harvard.edu/api/access/datafile/6388406", #14 36 | "https://dataverse.harvard.edu/api/access/datafile/6388427", #15 37 | "https://dataverse.harvard.edu/api/access/datafile/6388551", #16 38 | "https://dataverse.harvard.edu/api/access/datafile/6388894", #17 39 | "https://dataverse.harvard.edu/api/access/datafile/6376523", #18 40 | "https://dataverse.harvard.edu/api/access/datafile/6376522", #19 41 | "https://dataverse.harvard.edu/api/access/datafile/6376521", #20 42 | "https://dataverse.harvard.edu/api/access/datafile/6358305", #21 43 | "https://dataverse.harvard.edu/api/access/datafile/6358299")) #22 44 | 45 | URL <- URLs[chr, "URL"] 46 | system(paste0("wget --progress=bar:force:noscroll ", URLs[chr, "URL"])) 47 | system(paste0("tar -xvf ", gsub(".*?([0-9]+).*", "\\1", URL))) 48 | 49 | ########################################################################## 50 | ### Step 1 (Varinfo_gds) 51 | ########################################################################## 52 | 53 | ### output 54 | output_path <- "/cromwell_root/./" 55 | #output_path <- "/root/./" 56 | 57 | ### make directory 58 | system(paste0("mkdir ",output_path,"chr",chr)) 59 | 60 | ### R package 61 | library(gdsfmt) 62 | library(SeqArray) 63 | library(SeqVarTools) 64 | 65 | ### chromosome number 66 | ## read info 67 | DB_info <- read.csv(url("https://raw.githubusercontent.com/zhouhufeng/FAVORannotator/main/Scripts/SQL/FAVORdatabase_chrsplit.csv"),header=TRUE) 68 | DB_info <- DB_info[DB_info$Chr==chr,] 69 | 70 | ## open GDS 71 | genofile <- seqOpen(gds.file) 72 | 73 | genofile 74 | 75 | CHR <- as.numeric(seqGetData(genofile, "chromosome")) 76 | position <- as.integer(seqGetData(genofile, "position")) 77 | REF <- as.character(seqGetData(genofile, "$ref")) 78 | ALT <- as.character(seqGetData(genofile, "$alt")) 79 | 80 | VarInfo_genome <- paste0(CHR,"-",position,"-",REF,"-",ALT) 81 | 82 | seqClose(genofile) 83 | 84 | ## Generate VarInfo 85 | for(kk in 1:dim(DB_info)[1]) 86 | { 87 | print(kk) 88 | 89 | VarInfo <- VarInfo_genome[(position>=DB_info$Start_Pos[kk])&(position<=DB_info$End_Pos[kk])] 90 | VarInfo <- data.frame(VarInfo) 91 | 92 | write.csv(VarInfo,paste0(output_path,"chr",chr,"/VarInfo_chr",chr,"_",kk,".csv"),quote=FALSE,row.names = FALSE) 93 | } 94 | 95 | ########################################################################## 96 | ### Step 2 (Annotate) 97 | ########################################################################## 98 | 99 | ### 
xsv directory 100 | #xsv <- "/cromwell_root/.cargo/bin/xsv" 101 | xsv <- "/root/.cargo/bin/xsv" 102 | 103 | ### DB file 104 | DB_path <- "/cromwell_root/./" 105 | 106 | ### anno channel (subset) 107 | #anno_colnum <- c(1,8:12,15,16,19,23,25:36) 108 | anno_colnum <- c(2:160) 109 | 110 | chr_splitnum <- sum(DB_info$Chr==chr) 111 | 112 | for(kk in 1:chr_splitnum) 113 | { 114 | print(kk) 115 | 116 | #system(paste0(xsv," index ",DB_path,"/chr",chr,"_",kk,".csv)) 117 | system(paste0(xsv," join --left VarInfo ",output_path,"chr",chr,"/VarInfo_chr",chr,"_",kk,".csv variant_vcf ",DB_path,"/chr",chr,"_",kk,".csv > ",output_path,"chr",chr,"/Anno_chr",chr,"_",kk,".csv")) 118 | } 119 | 120 | ## merge info 121 | Anno <- paste0(output_path,"chr",chr,"/Anno_chr",chr,"_",seq(1:chr_splitnum),".csv ") 122 | merge_command <- paste0(xsv," cat rows ",Anno[1]) 123 | 124 | for(kk in 2:chr_splitnum) 125 | { 126 | merge_command <- paste0(merge_command,Anno[kk]) 127 | } 128 | 129 | merge_command <- paste0(merge_command,"> ",output_path,"chr",chr,"/Anno_chr",chr,".csv") 130 | 131 | system(merge_command) 132 | 133 | ## subset 134 | anno_colnum_xsv <- c() 135 | for(kk in 1:(length(anno_colnum)-1)) 136 | { 137 | anno_colnum_xsv <- paste0(anno_colnum_xsv,anno_colnum[kk],",") 138 | } 139 | anno_colnum_xsv <- paste0(anno_colnum_xsv,anno_colnum[length(anno_colnum)]) 140 | 141 | system(paste0(xsv," select ",anno_colnum_xsv," ",output_path,"chr",chr,"/Anno_chr",chr,".csv > ",output_path,"chr",chr,"/Anno_chr",chr,"_STAARpipeline.csv")) 142 | 143 | ########################################################################## 144 | ### Step 3 (gds2agds) 145 | ########################################################################## 146 | 147 | ### annotation file 148 | dir_anno <- "/cromwell_root/" 149 | #dir_anno <- "/root/" 150 | anno_file_name_1 <- "Anno_chr" 151 | anno_file_name_2 <- "_STAARpipeline.csv" 152 | 153 | ### load required package 154 | library(gdsfmt) 155 | library(SeqArray) 156 | library(SeqVarTools) 157 | library(readr) 158 | 159 | ### read annotation data 160 | FunctionalAnnotation <- read_csv(paste0(dir_anno,"chr",chr,"/",anno_file_name_1,chr,anno_file_name_2)) 161 | 162 | #FunctionalAnnotation <- read_csv(paste0(dir_anno,"chr",chr,"/",anno_file_name_1,chr,anno_file_name_2), 163 | # col_types=list(col_character(),col_double(),col_double(),col_double(),col_double(), 164 | # col_double(),col_double(),col_double(),col_double(),col_double(), 165 | # col_character(),col_character(),col_character(),col_double(),col_character(), 166 | # col_character(),col_character(),col_character(),col_character(),col_double(), 167 | # col_double(),col_character())) 168 | 169 | dim(FunctionalAnnotation) 170 | 171 | ## rename colnames 172 | #colnames(FunctionalAnnotation)[2] <- "apc_conservation" 173 | #colnames(FunctionalAnnotation)[7] <- "apc_local_nucleotide_diversity" 174 | #colnames(FunctionalAnnotation)[9] <- "apc_protein_function" 175 | 176 | ## open GDS 177 | genofile <- seqOpen(gds.file, readonly = FALSE) 178 | 179 | #Anno.folder <- addfolder.gdsn(index.gdsn(genofile, "annotation/info"), "FunctionalAnnotationTest1") 180 | Anno.folder <- index.gdsn(genofile, "annotation/info") 181 | if(use_compression == "YES") 182 | { 183 | add.gdsn(Anno.folder, "FAVORFullDBAug1st2022", val=FunctionalAnnotation, compress="LZMA_ra", closezip=TRUE) 184 | }else 185 | { 186 | add.gdsn(Anno.folder, "FAVORFullDBAug1st2022", val=FunctionalAnnotation) 187 | } 188 | genofile 189 | 190 | seqClose(genofile) 191 | 192 | #system(paste0("mv ", gds.file, " 
", outfile, ".gds")) 193 | 194 | -------------------------------------------------------------------------------- /Scripts/Cloud/Terra/convertVCFtoGDS.R: -------------------------------------------------------------------------------- 1 | ############################################################################# 2 | #Title: convertVCFtoGDS 3 | #Function: 4 | # * Build the GDS file from VCF files 5 | #Author: Hufeng Zhou 6 | #Time: Nov 27th 2021 7 | ############################################################################# 8 | library(gdsfmt) 9 | library(SeqArray) 10 | 11 | args <- commandArgs(TRUE) 12 | ### mandatory 13 | 14 | vcf.file <- args[1] 15 | print(paste0("gds.file: ",gds.file)) 16 | 17 | gds.file <- args[2] 18 | print(paste0("gds.file: ",gds.file)) 19 | 20 | seqVCF2GDS(vcf.file, gds.file, header = NULL, genotype.var.name = "GT", info.import=NULL, fmt.import=NULL, ignore.chr.prefix="chr", raise.error=TRUE, verbose=TRUE) 21 | genofile<-seqOpen(gds.file, readonly = FALSE) 22 | print("GDS built") 23 | 24 | ###Closing Up### 25 | genofile 26 | seqClose(genofile) 27 | -------------------------------------------------------------------------------- /Scripts/Cloud/Terra/headercolumn.txt: -------------------------------------------------------------------------------- 1 | col_double(),variant_vcf,variant_annovar,col_double(),col_double(),col_double(),ref_annovar,alt_annovar,col_double(),ref_vcf,alt_vcf,aloft_value,aloft_description,apc_conservation,apc_conservation_v2,apc_epigenetics_active,apc_epigenetics,apc_epigenetics_repressed,apc_epigenetics_transcription,apc_local_nucleotide_diversity,apc_local_nucleotide_diversity_v2,apc_local_nucleotide_diversity_v3,apc_mappability,apc_micro_rna,apc_mutation_density,apc_protein_function,apc_protein_function_v2,apc_protein_function_v3,apc_proximity_to_coding,apc_proximity_to_coding_v2,apc_proximity_to_tsstes,apc_transcription_factor,bravo_an,bravo_af,filter_status,cage_enhancer,cage_promoter,cage_tc,clnsig,clnsigincl,clndn,clndnincl,clnrevstat,origin,clndisdb,clndisdbincl,geneinfo,polyphen2_hdiv_score,polyphen2_hvar_score,mutation_taster_score,mutation_assessor_score,metasvm_pred,fathmm_xf,funseq_value,funseq_description,genecode_comprehensive_category,genecode_comprehensive_info,genecode_comprehensive_exonic_category,genecode_comprehensive_exonic_info,genehancer,af_total,af_asj_female,af_eas_female,af_afr_male,af_female,af_fin_male,af_oth_female,af_ami,af_oth,af_male,af_ami_female,af_afr,af_eas_male,af_sas,af_nfe_female,af_asj_male,af_raw,af_oth_male,af_nfe_male,af_asj,af_amr_male,af_amr_female,af_sas_female,af_fin,af_afr_female,af_sas_male,af_amr,af_nfe,af_eas,af_ami_male,af_fin_female,linsight,gc,cpg,min_dist_tss,min_dist_tse,sift_cat,sift_val,polyphen_cat,polyphen_val,priphcons,mamphcons,verphcons,priphylop,mamphylop,verphylop,bstatistic,chmm_e1,chmm_e2,chmm_e3,chmm_e4,chmm_e5,chmm_e6,chmm_e7,chmm_e8,chmm_e9,chmm_e10,chmm_e11,chmm_e12,chmm_e13,chmm_e14,chmm_e15,chmm_e16,chmm_e17,chmm_e18,chmm_e19,chmm_e20,chmm_e21,chmm_e22,chmm_e23,chmm_e24,chmm_e25,gerp_rs,gerp_rs_pval,gerp_n,gerp_s,encodeh3k4me1_sum,encodeh3k4me2_sum,encodeh3k4me3_sum,encodeh3k9ac_sum,encodeh3k9me3_sum,encodeh3k27ac_sum,encodeh3k27me3_sum,encodeh3k36me3_sum,encodeh3k79me2_sum,encodeh4k20me1_sum,encodeh2afz_sum,encode_dnase_sum,encodetotal_rna_sum,grantham,freq100bp,rare100bp,sngl100bp,freq1000bp,rare1000bp,sngl1000bp,freq10000bp,rare10000bp,sngl10000bp,remap_overlap_tf,remap_overlap_cl,cadd_rawscore,cadd_phred,k24_bismap,k24_umap,k36_bismap,k36_umap,k50_bismap,k50_umap,k
100_bismap,k100_umap,nucdiv,rdhs,recombination_rate,refseq_category,refseq_info,refseq_exonic_category,refseq_exonic_info,super_enhancer,tg_afr,tg_all,tg_amr,tg_eas,tg_eur,tg_sas,ucsc_category,ucsc_info,ucsc_exonic_category,ucsc_exonic_info 2 | 3 | 4 | col_types=list(col_character(),col_double(),col_double(),col_double(),col_double(),col_double(),col_double(),col_double(),col_double(),col_double(),col_character(),col_character(),col_character(),col_double(),col_character(),col_character(),col_character(),col_character(),col_character(),col_double(),col_double(),col_character())) 5 | 6 | -------------------------------------------------------------------------------- /Scripts/Cloud/Terra/test.R: -------------------------------------------------------------------------------- 1 | args <- commandArgs(TRUE) 2 | ### mandatory 3 | outfile <- args[1] 4 | gds.file <- args[2] 5 | chr <- as.numeric(args[3]) 6 | use_compression <- args[4] 7 | 8 | ########################################################################## 9 | ### Step 0 (Download FAVOR Database) 10 | ########################################################################## 11 | URLs <- data.frame(chr = c(1:22), 12 | URL = c("https://dataverse.harvard.edu/api/access/datafile/6170506", 13 | "https://dataverse.harvard.edu/api/access/datafile/6170501", 14 | "https://dataverse.harvard.edu/api/access/datafile/6170502", 15 | "https://dataverse.harvard.edu/api/access/datafile/6170521", 16 | "https://dataverse.harvard.edu/api/access/datafile/6170511", 17 | "https://dataverse.harvard.edu/api/access/datafile/6170516", 18 | "https://dataverse.harvard.edu/api/access/datafile/6170505", 19 | "https://dataverse.harvard.edu/api/access/datafile/6170513", 20 | "https://dataverse.harvard.edu/api/access/datafile/6165867", 21 | "https://dataverse.harvard.edu/api/access/datafile/6170507", 22 | "https://dataverse.harvard.edu/api/access/datafile/6170517", 23 | "https://dataverse.harvard.edu/api/access/datafile/6170520", 24 | "https://dataverse.harvard.edu/api/access/datafile/6170503", 25 | "https://dataverse.harvard.edu/api/access/datafile/6170509", 26 | "https://dataverse.harvard.edu/api/access/datafile/6170515", 27 | "https://dataverse.harvard.edu/api/access/datafile/6170518", 28 | "https://dataverse.harvard.edu/api/access/datafile/6170510", 29 | "https://dataverse.harvard.edu/api/access/datafile/6170508", 30 | "https://dataverse.harvard.edu/api/access/datafile/6170514", 31 | "https://dataverse.harvard.edu/api/access/datafile/6170512", 32 | "https://dataverse.harvard.edu/api/access/datafile/6170519", 33 | "https://dataverse.harvard.edu/api/access/datafile/6170504")) 34 | 35 | URL <- URLs[chr, "URL"] 36 | system(paste0("wget --progress=bar:force:noscroll ", URLs[chr, "URL"])) 37 | system(paste0("tar -xvf ", gsub(".*?([0-9]+).*", "\\1", URL))) 38 | 39 | ########################################################################## 40 | ### Step 1 (Varinfo_gds) 41 | ########################################################################## 42 | 43 | ### output 44 | output_path <- "./" 45 | 46 | ### make directory 47 | system(paste0("mkdir ",output_path,"chr",chr)) 48 | 49 | ### R package 50 | library(gdsfmt) 51 | library(SeqArray) 52 | library(SeqVarTools) 53 | 54 | ### chromosome number 55 | ## read info 56 | DB_info <- read.csv(url("https://raw.githubusercontent.com/xihaoli/STAARpipeline-Tutorial/main/FAVORannotator_csv/FAVORdatabase_chrsplit.csv"),header=TRUE) 57 | DB_info <- DB_info[DB_info$Chr==chr,] 58 | 59 | ## open GDS 60 | genofile <- seqOpen(gds.file) 61 
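# The variant keys assembled below follow the CHR-POS-REF-ALT ("variant_vcf") format that the
# FAVOR database CSV blocks are joined on in Step 2.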
| 62 | CHR <- as.numeric(seqGetData(genofile, "chromosome")) 63 | position <- as.integer(seqGetData(genofile, "position")) 64 | REF <- as.character(seqGetData(genofile, "$ref")) 65 | ALT <- as.character(seqGetData(genofile, "$alt")) 66 | 67 | VarInfo_genome <- paste0(CHR,"-",position,"-",REF,"-",ALT) 68 | 69 | seqClose(genofile) 70 | 71 | ## Generate VarInfo 72 | for(kk in 1:dim(DB_info)[1]) 73 | { 74 | print(kk) 75 | 76 | VarInfo <- VarInfo_genome[(position>=DB_info$Start_Pos[kk])&(position<=DB_info$End_Pos[kk])] 77 | VarInfo <- data.frame(VarInfo) 78 | 79 | write.csv(VarInfo,paste0(output_path,"chr",chr,"/VarInfo_chr",chr,"_",kk,".csv"),quote=FALSE,row.names = FALSE) 80 | } 81 | 82 | ########################################################################## 83 | ### Step 2 (Annotate) 84 | ########################################################################## 85 | 86 | ### xsv directory 87 | xsv <- ".cargo/bin/xsv" 88 | 89 | ### DB file 90 | DB_path <- "n/holystore01/LABS/xlin/Lab/xihao_zilin/FAVORDB/" 91 | 92 | ### anno channel (subset) 93 | anno_colnum <- c(1,8:12,15,16,19,23,25:36) 94 | 95 | chr_splitnum <- sum(DB_info$Chr==chr) 96 | 97 | for(kk in 1:chr_splitnum) 98 | { 99 | print(kk) 100 | 101 | system(paste0(xsv," join --left VarInfo ",output_path,"chr",chr,"/VarInfo_chr",chr,"_",kk,".csv variant_vcf ",DB_path,"/chr",chr,"_",kk,".csv > ",output_path,"chr",chr,"/Anno_chr",chr,"_",kk,".csv")) 102 | } 103 | 104 | ## merge info 105 | Anno <- paste0(output_path,"chr",chr,"/Anno_chr",chr,"_",seq(1:chr_splitnum),".csv ") 106 | merge_command <- paste0(xsv," cat rows ",Anno[1]) 107 | 108 | for(kk in 2:chr_splitnum) 109 | { 110 | merge_command <- paste0(merge_command,Anno[kk]) 111 | } 112 | 113 | merge_command <- paste0(merge_command,"> ",output_path,"chr",chr,"/Anno_chr",chr,".csv") 114 | 115 | system(merge_command) 116 | 117 | ## subset 118 | anno_colnum_xsv <- c() 119 | for(kk in 1:(length(anno_colnum)-1)) 120 | { 121 | anno_colnum_xsv <- paste0(anno_colnum_xsv,anno_colnum[kk],",") 122 | } 123 | anno_colnum_xsv <- paste0(anno_colnum_xsv,anno_colnum[length(anno_colnum)]) 124 | 125 | system(paste0(xsv," select ",anno_colnum_xsv," ",output_path,"chr",chr,"/Anno_chr",chr,".csv > ",output_path,"chr",chr,"/Anno_chr",chr,"_STAARpipeline.csv")) 126 | 127 | ########################################################################## 128 | ### Step 3 (gds2agds) 129 | ########################################################################## 130 | 131 | ### annotation file 132 | dir_anno <- "" 133 | anno_file_name_1 <- "Anno_chr" 134 | anno_file_name_2 <- "_STAARpipeline.csv" 135 | 136 | ### load required package 137 | library(gdsfmt) 138 | library(SeqArray) 139 | library(SeqVarTools) 140 | library(readr) 141 | 142 | ### read annotation data 143 | FunctionalAnnotation <- read_csv(paste0(dir_anno,"chr",chr,"/",anno_file_name_1,chr,anno_file_name_2), 144 | col_types=list(col_character(),col_double(),col_double(),col_double(),col_double(), 145 | col_double(),col_double(),col_double(),col_double(),col_double(), 146 | col_character(),col_character(),col_character(),col_double(),col_character(), 147 | col_character(),col_character(),col_character(),col_character(),col_double(), 148 | col_double(),col_character())) 149 | 150 | dim(FunctionalAnnotation) 151 | 152 | ## rename colnames 153 | colnames(FunctionalAnnotation)[2] <- "apc_conservation" 154 | colnames(FunctionalAnnotation)[7] <- "apc_local_nucleotide_diversity" 155 | colnames(FunctionalAnnotation)[9] <- "apc_protein_function" 156 | 157 | ## open 
GDS 158 | genofile <- seqOpen(gds.file, readonly = FALSE) 159 | 160 | #Anno.folder <- addfolder.gdsn(index.gdsn(genofile, "annotation/info"), "FunctionalAnnotation") 161 | Anno.folder <- index.gdsn(genofile, "annotation/info") 162 | if(use_compression == "YES") 163 | { 164 | add.gdsn(Anno.folder, "FunctionalAnnotation", val=FunctionalAnnotation, compress="LZMA_ra", closezip=TRUE) 165 | }else 166 | { 167 | add.gdsn(Anno.folder, "FunctionalAnnotation", val=FunctionalAnnotation) 168 | } 169 | 170 | seqClose(genofile) 171 | 172 | system(paste0("mv ", gds.file, " ", outfile, ".gds")) 173 | 174 | -------------------------------------------------------------------------------- /Scripts/Dockerize/Dockerfile.txt: -------------------------------------------------------------------------------- 1 | # Base image https://hub.docker.com/u/rocker/ 2 | FROM rocker/r-base:latest 3 | 4 | ## create directories 5 | RUN mkdir -p /FAVORannotatorDocker 6 | 7 | ## copy files 8 | COPY ../CSV/FAVORannotatorv2aGDS.r . 9 | COPY ../CSV/convertVCFtoGDS.r 10 | COPY ../CSV/config.R 11 | 12 | ## Install R-packages 13 | RUN Rscript install_packages.R 14 | 15 | ## Run R-scripts 16 | RUN Rscript FAVORannotatorv2aGDS.r 17 | -------------------------------------------------------------------------------- /Scripts/Dockerize/ExampleDockerFiles.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Base image https://hub.docker.com/u/rocker/ 4 | FROM rocker/shiny:latest 5 | 6 | # system libraries of general use 7 | ## install debian packages 8 | RUN apt-get update -qq && apt-get -y --no-install-recommends install \ 9 | libxml2-dev \ 10 | libcairo2-dev \ 11 | libsqlite3-dev \ 12 | libmariadbd-dev \ 13 | libpq-dev \ 14 | libssh2-1-dev \ 15 | unixodbc-dev \ 16 | libcurl4-openssl-dev \ 17 | libssl-dev \ 18 | coinor-libcbc-dev coinor-libclp-dev libglpk-dev 19 | 20 | 21 | ## update system libraries 22 | RUN apt-get update && \ 23 | apt-get upgrade -y && \ 24 | apt-get clean 25 | 26 | # copy necessary files 27 | ## app folder 28 | COPY ./bms_msisuite ./app 29 | 30 | # Docker inheritance 31 | FROM bioconductor/bioconductor_docker:devel 32 | 33 | RUN apt-get update 34 | RUN R -e 'BiocManager::install(ask = F)' && R -e 'BiocManager::install(c("rtracklayer", \ 35 | "GenomicAlignments", "Biostrings", "SummarizedExperiment", "Rsamtools", ask = F))' 36 | # install renv & restore packages 37 | RUN Rscript -e 'install.packages("renv")' 38 | RUN Rscript -e 'install.packages("devtools")' 39 | RUN Rscript -e 'install.packages("shiny")' 40 | RUN Rscript -e 'install.packages("shinyBS")' 41 | RUN Rscript -e 'install.packages("ggvis")' 42 | RUN Rscript -e 'install.packages("shinydashboardPlus")' 43 | RUN Rscript -e 'install.packages("shinycssloaders")' 44 | RUN Rscript -e 'install.packages("shinyWidgets")' 45 | RUN Rscript -e 'install.packages("plotly")' 46 | RUN Rscript -e 'install.packages("RSQLite")' 47 | RUN Rscript -e 'install.packages("forecast", dependencies = TRUE)' 48 | RUN Rscript -e 'install.packages("tsutils")' 49 | RUN Rscript -e 'install.packages("readxl")' 50 | RUN Rscript -e 'install.packages("tidyverse")' 51 | RUN Rscript -e 'install.packages("knitr")' 52 | RUN Rscript -e 'install.packages("knitcitations")' 53 | RUN Rscript -e 'install.packages("nycflights13")' 54 | RUN Rscript -e 'install.packages("Matrix")' 55 | RUN Rscript -e 'install.packages("plotly")' 56 | RUN Rscript -e 'install.packages("igraph")' 57 | RUN Rscript -e 'install.packages("ggthemes")' 58 | RUN Rscript -e 
'install.packages("evaluate")' 59 | RUN Rscript -e 'install.packages("psych")' 60 | RUN Rscript -e 'install.packages("kableExtra")' 61 | RUN Rscript -e 'install.packages("ggjoy")' 62 | RUN Rscript -e 'install.packages("gtools")' 63 | RUN Rscript -e 'install.packages("gridExtra")' 64 | RUN Rscript -e 'install.packages("cowplot")' 65 | RUN Rscript -e 'install.packages("ggrepel")' 66 | RUN Rscript -e 'install.packages("data.table")' 67 | RUN Rscript -e 'install.packages("stringr")' 68 | RUN Rscript -e 'install.packages("rmarkdown")' 69 | RUN Rscript -e 'install.packages("shinyjqui")' 70 | RUN Rscript -e 'install.packages("V8")' 71 | RUN Rscript -e 'devtools::install_github("ThomasSiegmund/D3TableFilter")' 72 | RUN Rscript -e 'devtools::install_github("leonawicz/apputils")' 73 | RUN Rscript -e 'devtools::install_github("Marlin-Na/trewjb")' 74 | 75 | RUN Rscript -e 'devtools::install_github("dirkschumacher/ompr")' 76 | RUN Rscript -e 'devtools::install_github("dirkschumacher/ompr.roi")' 77 | 78 | RUN Rscript -e 'install.packages("ROI.plugin.glpk")' 79 | 80 | RUN Rscript -e 'install.packages("shinydashboard")' 81 | RUN Rscript -e 'install.packages("dplyr")' 82 | RUN Rscript -e 'install.packages("dashboardthemes")' 83 | RUN Rscript -e 'install.packages("shinyjs")' 84 | RUN Rscript -e 'install.packages("magrittr")' 85 | RUN Rscript -e 'install.packages("DT")' 86 | RUN Rscript -e 'install.packages("rhandsontable")' 87 | RUN Rscript -e 'renv::consent(provided = TRUE)' 88 | RUN Rscript -e 'renv::restore()' 89 | 90 | 91 | 92 | # expose port 93 | EXPOSE 3838 94 | 95 | # run app on container start 96 | CMD ["R", "-e", "shiny::runApp('/app', host = '0.0.0.0', port = 3838)"] 97 | -------------------------------------------------------------------------------- /Scripts/Dockerize/install_packages.R: -------------------------------------------------------------------------------- 1 | FROM bioconductor/bioconductor_docker:devel 2 | 3 | RUN apt-get update 4 | RUN R -e 'BiocManager::install(ask = F)' && R -e 'BiocManager::install(c("gdsfmt", "SeqArray", "SeqVarTools", ask = F))' 5 | 6 | RUN Rscript -e 'install.packages("readr")' 7 | RUN Rscript -e 'install.packages("devtools")' 8 | 9 | RUN Rscript -e 'devtools::install_github("zhengxwen/gdsfmt")' 10 | 11 | -------------------------------------------------------------------------------- /Scripts/SQL/FAVORannotatorv2aGDS.r: -------------------------------------------------------------------------------- 1 | ############################################################################# 2 | #Title: FAVORannotator 3 | #Function: 4 | # * Build the GDS file from VCF files 5 | # * Extract the variant sites from the GDS to obtain functional annotation. 6 | # * Read the offline FAVOR V2 sql database and provide functional annotation. 7 | # * Built in the functional annotation into GDS to build aGDS. 
8 | #Author: Hufeng Zhou 9 | #Time: Dec 16th 2021 10 | ############################################################################# 11 | library(gdsfmt) 12 | library(SeqArray) 13 | library(dplyr) 14 | library(readr) 15 | library(stringi) 16 | library(stringr) 17 | library(RPostgreSQL) 18 | library(pryr) 19 | source('config.R') 20 | mem_used() 21 | #vcf.fn=as.character(commandArgs(TRUE)[1]) 22 | #out.fn=as.character(commandArgs(TRUE)[2]) 23 | 24 | #N=as.character(commandArgs(TRUE)[1]) 25 | #seqVCF2GDS(vcf.fn, out.fn, header = NULL, genotype.var.name = "GT", info.import=NULL, fmt.import=NULL, ignore.chr.prefix="chr", raise.error=TRUE, verbose=TRUE) 26 | start.time <- Sys.time() 27 | CHRN=as.character(commandArgs(TRUE)[1]) 28 | genofile<-seqOpen(eval(parse(text = paste0("gds.chr",CHRN,".fn"))), readonly = FALSE) 29 | print("GDS built") 30 | genofile 31 | CHR<-seqGetData(genofile,"chromosome") 32 | POS<-seqGetData(genofile,"position") 33 | REF<-seqGetData(genofile,"$ref") 34 | ALT<-seqGetData(genofile,"$alt") 35 | ############################################################################# 36 | #Here VariantsAnno is the data frame needs to retrieve functional annotation 37 | #It needs to be feed into the SQL 38 | ############################################################################# 39 | VariantsAnno <- data.frame(CHR, POS, REF, ALT) 40 | VariantsAnno$CHR <- as.character(VariantsAnno$CHR) 41 | VariantsAnno$POS <- as.integer(VariantsAnno$POS) 42 | VariantsAnno$REF <- as.character(VariantsAnno$REF) 43 | VariantsAnno$ALT <- as.character(VariantsAnno$ALT) 44 | 45 | 46 | genDelimitedVariantString <- function(inputs) { 47 | quotedVariants <- dbQuoteString(ANSI(), inputs) 48 | collapsedVariants <- paste(quotedVariants, collapse = "),(") 49 | collapsedVariants <- paste0("(", collapsedVariants, ")") 50 | return(collapsedVariants) 51 | } 52 | 53 | #performs batch annotation using the offline database for the 54 | #specified variants 55 | batchAnnotate <- function(inputData,blknum) { 56 | 57 | #parse input, silently ignoring variants which do not follow format 58 | variants <- paste(paste0(inputData[, 1]), inputData[, 2], inputData[, 3], inputData[, 4], sep='-') 59 | variants <- str_subset(variants, "^[:alnum:]+-\\d+-[:upper:]+-[:upper:]+$") 60 | 61 | #connect to database 62 | driver <- dbDriver("PostgreSQL") 63 | connection <- dbConnect(driver, dbname= eval(parse(text = paste0("DBNAME_chr",CHRN))), host=eval(parse(text = paste0("HOST_chr",CHRN))), port=eval(parse(text = paste0("PORT_chr",CHRN))), user=USER_G, password=PASSWORD_G) 64 | 65 | #drop the variant table if it already exists 66 | variantTable <- "batch_variants" 67 | if(dbExistsTable(connection, variantTable)) { 68 | dbRemoveTable(connection, variantTable) 69 | } 70 | 71 | #store variants in temporary table 72 | collapsedVariants <- genDelimitedVariantString(variants) 73 | query <- paste0("CREATE TEMP TABLE ", variantTable, " AS (VALUES ", collapsedVariants, ")") 74 | results <- data.frame() 75 | tryCatch({ 76 | results <- dbGetQuery(connection, query) 77 | }, 78 | error = function(e) { 79 | stop("Error sending variants to database") 80 | }, 81 | warning = function(w) { 82 | stop("Error sending variants to database") 83 | }) 84 | 85 | #retrieve data 86 | results <- data.frame() 87 | query <- paste0("SELECT offline_view",blknum,".* FROM ", variantTable, " LEFT JOIN offline_view",blknum," ON ", variantTable, ".column1=offline_view",blknum,".variant_vcf") 88 | tryCatch({ 89 | results <- dbGetQuery(connection, query) 90 | }, 91 | error = 
function(e) { 92 | stop("Error retrieving results from database") 93 | }, 94 | warning = function(w) { 95 | stop("Error retrieving results from database") 96 | }) 97 | 98 | #clean up 99 | dbDisconnect(connection) 100 | 101 | return(results) 102 | } 103 | 104 | DB_info <- read.csv("FAVORdatabase_chrsplit.csv",header=TRUE) 105 | DB_info <- DB_info[DB_info$Chr==CHRN,] 106 | VariantsAnnoTMP<-VariantsAnno[!duplicated(VariantsAnno),]; 107 | VariantsBatchAnno <- data.frame(); 108 | outlist<- list(); 109 | for(kk in 1:dim(DB_info)[1]){ 110 | print(kk) 111 | dx<-VariantsAnnoTMP[(POS>=DB_info$Start_Pos[kk])&(POS<=DB_info$End_Pos[kk]),] 112 | outdx<-batchAnnotate(dx,kk) 113 | #VariantsBatchAnno<-bind_rows(VariantsBatchAnno,outdx) 114 | print(paste0(("finish annotate rounds/blocks: "),kk)) 115 | outlist[[kk]]<-outdx 116 | } 117 | VariantsBatchAnno<-bind_rows(outlist); 118 | rm(dx,outdx) 119 | rm(VariantsAnnoTMP) 120 | rm(CHR, POS, REF, ALT) 121 | head(VariantsBatchAnno) 122 | mem_used() 123 | gc() 124 | ############################################ 125 | ####This Variant is a searching key######### 126 | ############################################ 127 | Anno.folder <- addfolder.gdsn(index.gdsn(genofile, "annotation/info"), "FunctionalAnnotation") 128 | #Anno.folder <- index.gdsn(genofile, "annotation/info/FunctionalAnnotation") 129 | #VariantsBatchAnno<-VariantsBatchAnno[!duplicated(VariantsBatchAnno),] 130 | VariantsAnno <- dplyr::left_join(VariantsAnno,VariantsBatchAnno, by = c("CHR" = "chromosome","POS" = "position","REF" = "ref_vcf","ALT" = "alt_vcf")) 131 | add.gdsn(Anno.folder, "FAVORannotator", val=VariantsAnno, compress="LZMA_ra", closezip=TRUE) 132 | ###Closing Up### 133 | genofile 134 | seqClose(genofile) 135 | 136 | ###Time Count### 137 | end.time <- Sys.time() 138 | time.taken <- end.time - start.time 139 | time.taken 140 | -------------------------------------------------------------------------------- /Scripts/SQL/FAVORdatabase_chrsplit.csv: -------------------------------------------------------------------------------- 1 | Chr,File_No,Start_Pos,End_Pos,Site_start,Site_end 2 | 1,1,10001,16747958,0,50000002 3 | 1,2,16747959,33299099,50000002,100000003 4 | 1,3,33299100,49816132,100000003,150000003 5 | 1,4,49816133,66336187,150000003,200000001 6 | 1,5,66336188,82858506,200000001,250000004 7 | 1,6,82858507,99383549,250000004,300000004 8 | 1,7,99383550,115901640,300000004,350000004 9 | 1,8,115901641,150511302,350000004,400000003 10 | 1,9,150511303,167027454,400000003,450000002 11 | 1,10,167027455,183550376,450000002,500000004 12 | 1,11,183550377,200062349,500000004,550000005 13 | 1,12,200062350,216579517,550000005,600000005 14 | 1,13,216579518,233199672,600000005,650000004 15 | 1,14,233199673,248946422,650000004,697723257 16 | 2,1,10001,16521124,0,50000003 17 | 2,2,16521125,33039678,50000003,100000003 18 | 2,3,33039679,49545408,100000003,150000003 19 | 2,4,49545409,66059434,150000003,200000004 20 | 2,5,66059435,82573646,200000004,250000005 21 | 2,6,82573647,100719302,250000005,300000003 22 | 2,7,100719303,117241395,300000003,350000005 23 | 2,8,117241396,133761855,350000005,400000003 24 | 2,9,133761856,150281681,400000003,450000004 25 | 2,10,150281682,166803862,450000004,500000004 26 | 2,11,166803863,183319344,500000004,550000005 27 | 2,12,183319345,199836331,550000005,600000004 28 | 2,13,199836332,216347313,600000004,650000003 29 | 2,14,216347314,232858536,650000003,700000001 30 | 2,15,232858537,242183529,700000001,728249325 31 | 3,1,10001,16524420,0,50000003 32 | 
3,2,16524421,33040541,50000003,100000005 33 | 3,3,33040542,49557337,100000005,150000005 34 | 3,4,49557338,66074840,150000005,200000005 35 | 3,5,66074841,82594743,200000005,250000003 36 | 3,6,82594744,99251340,250000003,300000005 37 | 3,7,99251341,115776819,300000005,350000005 38 | 3,8,115776820,132298678,350000005,400000003 39 | 3,9,132298679,148820162,400000003,450000003 40 | 3,10,148820163,165336299,450000003,500000003 41 | 3,11,165336300,181849212,500000003,550000004 42 | 3,12,181849213,198235559,550000004,599657406 43 | 4,1,10001,16590803,0,50000004 44 | 4,2,16590804,33122697,50000004,100000003 45 | 4,3,33122698,49829213,100000003,150000003 46 | 4,4,49829214,66446786,150000003,200000004 47 | 4,5,66446787,82967242,200000004,250000003 48 | 4,6,82967243,99485400,250000003,300000003 49 | 4,7,99485401,116006538,300000003,350000005 50 | 4,8,116006539,132525686,350000005,400000004 51 | 4,9,132525687,149046341,400000004,450000005 52 | 4,10,149046342,165559516,450000005,500000003 53 | 4,11,165559517,182072370,500000003,550000005 54 | 4,12,182072371,190204555,550000005,574493843 55 | 5,1,10001,16522729,0,50000003 56 | 5,2,16522730,33074583,50000003,100000003 57 | 5,3,33074584,49720402,100000003,150000005 58 | 5,4,49720403,66290955,150000005,200000004 59 | 5,5,66290956,82815341,200000004,250000003 60 | 5,6,82815342,99338210,250000003,300000004 61 | 5,7,99338211,115854447,300000004,350000004 62 | 5,8,115854448,132370902,350000004,400000005 63 | 5,9,132370903,148894412,400000005,450000004 64 | 5,10,148894413,165417298,450000004,500000005 65 | 5,11,165417299,181478259,500000005,548653509 66 | 6,1,60001,16563465,0,50000003 67 | 6,2,16563466,33061324,50000003,100000003 68 | 6,3,33061325,49572052,100000003,150000005 69 | 6,4,49572053,66610016,150000005,200000003 70 | 6,5,66610017,83124127,200000003,250000004 71 | 6,6,83124128,99696645,250000004,300000005 72 | 6,7,99696646,116211798,300000005,350000005 73 | 6,8,116211799,132735327,350000005,400000003 74 | 6,9,132735328,149256752,400000003,450000003 75 | 6,10,149256753,165761505,450000003,500000005 76 | 6,11,165761506,170745979,500000005,514950784 77 | 7,1,10001,16497672,0,50000005 78 | 7,2,16497673,33011778,50000005,100000005 79 | 7,3,33011779,49535306,100000005,150000004 80 | 7,4,49535307,66368293,150000004,200000003 81 | 7,5,66368294,82861258,200000003,250000004 82 | 7,6,82861259,99375599,250000004,300000004 83 | 7,7,99375600,115883173,300000004,350000005 84 | 7,8,115883174,132397651,350000005,400000005 85 | 7,9,132397652,148959856,400000005,450000005 86 | 7,10,148959857,159335973,450000005,481465078 87 | 8,1,60001,16657938,0,50000004 88 | 8,2,16657939,33168508,50000004,100000005 89 | 8,3,33168509,49798213,100000005,150000004 90 | 8,4,49798214,66324168,150000004,200000005 91 | 8,5,66324169,82841802,200000005,250000005 92 | 8,6,82841803,99411327,250000005,300000003 93 | 8,7,99411328,115927799,300000003,350000005 94 | 8,8,115927800,132449771,350000005,400000004 95 | 8,9,132449772,145078636,400000004,438236670 96 | 9,1,10001,16519111,0,50000004 97 | 9,2,16519112,33034522,50000004,100000003 98 | 9,3,33034523,65561731,100000003,150000005 99 | 9,4,65561732,82636109,150000005,200000004 100 | 9,5,82636110,99151058,200000004,250000005 101 | 9,6,99151059,115668007,250000005,300000005 102 | 9,7,115668008,132176643,300000005,350000005 103 | 9,8,132176644,138334717,350000005,368668774 104 | 10,1,10001,16497977,0,50000005 105 | 10,2,16497978,32992389,50000005,100000003 106 | 10,3,32992390,49979963,100000003,150000005 107 | 10,4,49979964,66490426,150000005,200000003 
108 | 10,5,66490427,82999924,200000003,250000003 109 | 10,6,82999925,99518158,250000003,300000003 110 | 10,7,99518159,116025435,300000003,350000005 111 | 10,8,116025436,133787422,350000005,403649527 112 | 11,1,60001,16573403,0,50000005 113 | 11,2,16573404,33089158,50000005,100000004 114 | 11,3,33089159,49604102,100000004,150000003 115 | 11,4,49604103,66492354,150000003,200000005 116 | 11,5,66492355,83106677,200000005,250000004 117 | 11,6,83106678,99650866,250000004,300000005 118 | 11,7,99650867,116169860,300000005,350000004 119 | 11,8,116169861,135076622,350000004,407232600 120 | 12,1,10001,16512884,0,50000005 121 | 12,2,16512885,33020486,50000005,100000003 122 | 12,3,33020487,49663834,100000003,150000003 123 | 12,4,49663835,66172393,150000003,200000004 124 | 12,5,66172394,82688873,200000004,250000005 125 | 12,6,82688874,99202936,250000005,300000003 126 | 12,7,99202937,115715845,300000003,350000004 127 | 12,8,115715846,133265309,350000004,403205116 128 | 13,1,16000001,32685483,0,50000005 129 | 13,2,32685484,49205873,50000005,100000003 130 | 13,3,49205874,65719740,100000003,150000003 131 | 13,4,65719741,82234653,150000003,200000005 132 | 13,5,82234654,98792965,200000005,250000005 133 | 13,6,98792966,114354328,250000005,296684670 134 | 14,1,16000001,32844491,0,50000003 135 | 14,2,32844492,49355389,50000003,100000003 136 | 14,3,49355390,65865851,100000003,150000002 137 | 14,4,65865852,82379468,150000002,200000005 138 | 14,5,82379469,98894359,200000005,250000004 139 | 14,6,98894360,106883718,250000004,274216494 140 | 15,1,17000001,33820808,0,50000004 141 | 15,2,33820809,50332483,50000004,100000003 142 | 15,3,50332484,66837081,100000003,150000005 143 | 15,4,66837082,83350271,150000005,200000004 144 | 15,5,83350272,99908885,200000004,250000003 145 | 15,6,99908886,101981189,250000003,256275016 146 | 16,1,10001,16489529,0,50000004 147 | 16,2,16489530,33041390,50000004,100000005 148 | 16,3,33041391,57939584,100000005,150000004 149 | 16,4,57939585,74442138,150000004,200000005 150 | 16,5,74442139,90228345,200000005,247862947 151 | 17,1,60001,16583426,0,50000004 152 | 17,2,16583427,33296314,50000004,100000005 153 | 17,3,33296315,49805199,100000005,150000004 154 | 17,4,49805200,66311745,150000004,200000003 155 | 17,5,66311746,83247441,200000003,251209815 156 | 18,1,10001,16579729,0,50000005 157 | 18,2,16579730,33187036,50000005,100000003 158 | 18,3,33187037,49761154,100000003,150000004 159 | 18,4,49761155,66276260,150000004,200000003 160 | 18,5,66276261,80263285,200000003,242369265 161 | 19,1,60001,16502931,0,50000003 162 | 19,2,16502932,33121104,50000003,100000005 163 | 19,3,33121105,49598738,100000005,150000004 164 | 19,4,49598739,58607616,150000004,177368363 165 | 20,1,60001,16578325,0,50000004 166 | 20,2,16578326,33448578,50000004,100000003 167 | 20,3,33448579,49959412,100000003,150000003 168 | 20,4,49959413,64334167,150000003,193559674 169 | 21,1,5010001,23102900,0,50000004 170 | 21,2,23102901,39607529,50000004,100000004 171 | 21,3,39607530,46699983,100000004,121352389 172 | 22,1,10510001,28173294,0,50000003 173 | 22,2,28173295,44666670,50000003,100000005 174 | 22,3,44666671,50808468,100000005,118627486 175 | -------------------------------------------------------------------------------- /Scripts/SQL/config.R: -------------------------------------------------------------------------------- 1 | USER_G <- 'user name' 2 | PASSWORD_G <- 'password' 3 | 4 | #---------chr1----------------------- 5 | vcf.chr1.fn<-"/n/location/input.vcf" 6 | gds.chr1.fn<-"/n/location/output.gds" 7 | 8 | DBNAME_chr1 <- 
'postgres' 9 | HOST_chr1 <- 'localhost' 10 | PORT_chr1 <- 5432 11 | 12 | #---------chr2----------------------- 13 | vcf.chr2.fn<-"/n/location/input.vcf" 14 | gds.chr2.fn<-"/n/location/output.gds" 15 | 16 | DBNAME_chr2 <- 'postgres' 17 | HOST_chr2 <- 'localhost' 18 | PORT_chr2 <- 5432 19 | 20 | #---------chr3----------------------- 21 | vcf.chr3.fn<-"/n/location/input.vcf" 22 | gds.chr3.fn<-"/n/location/output.gds" 23 | 24 | DBNAME_chr3 <- 'postgres' 25 | HOST_chr3 <- 'localhost' 26 | PORT_chr3 <- 5432 27 | 28 | #---------chr4----------------------- 29 | vcf.chr4.fn<-"/n/location/input.vcf" 30 | gds.chr4.fn<-"/n/location/output.gds" 31 | 32 | DBNAME_chr4 <- 'postgres' 33 | HOST_chr4 <- 'localhost' 34 | PORT_chr4 <- 5432 35 | 36 | #---------chr5----------------------- 37 | vcf.chr5.fn<-"/n/location/input.vcf" 38 | gds.chr5.fn<-"/n/location/output.gds" 39 | 40 | DBNAME_chr5 <- 'postgres' 41 | HOST_chr5 <- 'localhost' 42 | PORT_chr5 <- 5432 43 | 44 | #---------chr6----------------------- 45 | vcf.chr6.fn<-"/n/location/input.vcf" 46 | gds.chr6.fn<-"/n/location/output.gds" 47 | 48 | DBNAME_chr6 <- 'postgres' 49 | HOST_chr6 <- 'localhost' 50 | PORT_chr6 <- 5432 51 | 52 | #---------chr7----------------------- 53 | vcf.chr7.fn<-"/n/location/input.vcf" 54 | gds.chr7.fn<-"/n/location/output.gds" 55 | 56 | DBNAME_chr7 <- 'postgres' 57 | HOST_chr7 <- 'localhost' 58 | PORT_chr7 <- 5432 59 | 60 | #---------chr8----------------------- 61 | vcf.chr8.fn<-"/n/location/input.vcf" 62 | gds.chr8.fn<-"/n/location/output.gds" 63 | 64 | DBNAME_chr8 <- 'postgres' 65 | HOST_chr8 <- 'localhost' 66 | PORT_chr8 <- 5432 67 | 68 | #---------chr9----------------------- 69 | vcf.chr9.fn<-"/n/location/input.vcf" 70 | gds.chr9.fn<-"/n/location/output.gds" 71 | 72 | DBNAME_chr9 <- 'postgres' 73 | HOST_chr9 <- 'localhost' 74 | PORT_chr9 <- 5432 75 | 76 | #---------chr10----------------------- 77 | vcf.chr10.fn<-"/n/location/input.vcf" 78 | gds.chr10.fn<-"/n/location/output.gds" 79 | 80 | DBNAME_chr10 <- 'postgres' 81 | HOST_chr10 <- 'localhost' 82 | PORT_chr10 <- 5432 83 | 84 | #---------chr11----------------------- 85 | vcf.chr11.fn<-"/n/location/input.vcf" 86 | gds.chr11.fn<-"/n/location/output.gds" 87 | 88 | DBNAME_chr11 <- 'postgres' 89 | HOST_chr11 <- 'localhost' 90 | PORT_chr11 <- 5432 91 | 92 | #---------chr12----------------------- 93 | vcf.chr12.fn<-"/n/location/input.vcf" 94 | gds.chr12.fn<-"/n/location/output.gds" 95 | 96 | DBNAME_chr12 <- 'postgres' 97 | HOST_chr12 <- 'localhost' 98 | PORT_chr12 <- 5432 99 | 100 | #---------chr13----------------------- 101 | vcf.chr13.fn<-"/n/location/input.vcf" 102 | gds.chr13.fn<-"/n/location/output.gds" 103 | 104 | DBNAME_chr13 <- 'postgres' 105 | HOST_chr13 <- 'localhost' 106 | PORT_chr13 <- 5432 107 | 108 | #---------chr14----------------------- 109 | vcf.chr14.fn<-"/n/location/input.vcf" 110 | gds.chr14.fn<-"/n/location/output.gds" 111 | 112 | DBNAME_chr14 <- 'postgres' 113 | HOST_chr14 <- 'localhost' 114 | PORT_chr14 <- 5432 115 | 116 | #---------chr15----------------------- 117 | vcf.chr15.fn<-"/n/location/input.vcf" 118 | gds.chr15.fn<-"/n/location/output.gds" 119 | 120 | DBNAME_chr15 <- 'postgres' 121 | HOST_chr15 <- 'localhost' 122 | PORT_chr15 <- 5432 123 | 124 | #---------chr16----------------------- 125 | vcf.chr16.fn<-"/n/location/input.vcf" 126 | gds.chr16.fn<-"/n/location/output.gds" 127 | 128 | DBNAME_chr16 <- 'postgres' 129 | HOST_chr16 <- 'localhost' 130 | PORT_chr16 <- 5432 131 | 132 | #---------chr17----------------------- 133 | 
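# (Each per-chromosome block defines vcf.chrN.fn, gds.chrN.fn, DBNAME_chrN, HOST_chrN and PORT_chrN;
#  convertVCFtoGDS.r and FAVORannotatorv2aGDS.r look these variables up by the chromosome number
#  supplied on the command line.)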
vcf.chr17.fn<-"/n/location/input.vcf" 134 | gds.chr17.fn<-"/n/location/output.gds" 135 | 136 | DBNAME_chr17 <- 'postgres' 137 | HOST_chr17 <- 'localhost' 138 | PORT_chr17 <- 5432 139 | 140 | #---------chr18----------------------- 141 | vcf.chr18.fn<-"/n/location/input.vcf" 142 | gds.chr18.fn<-"/n/location/output.gds" 143 | 144 | DBNAME_chr18 <- 'postgres' 145 | HOST_chr18 <- 'localhost' 146 | PORT_chr18 <- 5432 147 | 148 | #---------chr19----------------------- 149 | vcf.chr19.fn<-"/n/location/input.vcf" 150 | gds.chr19.fn<-"/n/location/output.gds" 151 | 152 | DBNAME_chr19 <- 'postgres' 153 | HOST_chr19 <- 'localhost' 154 | PORT_chr19 <- 5432 155 | 156 | #---------chr20----------------------- 157 | vcf.chr20.fn<-"/n/location/input.vcf" 158 | gds.chr20.fn<-"/n/location/output.gds" 159 | 160 | DBNAME_chr20 <- 'postgres' 161 | HOST_chr20 <- 'localhost' 162 | PORT_chr20 <- 5432 163 | 164 | #---------chr21----------------------- 165 | vcf.chr21.fn<-"/n/location/input.vcf" 166 | gds.chr21.fn<-"/n/location/output.gds" 167 | 168 | DBNAME_chr21 <- 'postgres' 169 | HOST_chr21 <- 'localhost' 170 | PORT_chr21 <- 5432 171 | 172 | #---------chr22----------------------- 173 | vcf.chr22.fn<-"/n/location/input.vcf" 174 | gds.chr22.fn<-"/n/location/output.gds" 175 | 176 | DBNAME_chr22 <- 'postgres' 177 | HOST_chr22 <- 'localhost' 178 | PORT_chr22 <- 5432 179 | 180 | #---------chrX----------------------- 181 | vcf.chrX.fn<-"/n/location/input.vcf" 182 | gds.chrX.fn<-"/n/location/output.gds" 183 | 184 | DBNAME_chrX <- 'postgres' 185 | HOST_chrX <- 'localhost' 186 | PORT_chrX <- 5432 187 | 188 | #---------chrY----------------------- 189 | vcf.chrY.fn<-"/n/location/input.vcf" 190 | gds.chrY.fn<-"/n/location/output.gds" 191 | 192 | DBNAME_chrY <- 'postgres' 193 | HOST_chrY <- 'localhost' 194 | PORT_chrY <- 5432 195 | -------------------------------------------------------------------------------- /Scripts/SQL/convertVCFtoGDS.r: -------------------------------------------------------------------------------- 1 | ############################################################################# 2 | #Title: convertVCFtoGDS 3 | #Function: 4 | # * Build the GDS file from VCF files 5 | #Author: Hufeng Zhou 6 | #Time: Nov 27th 2021 7 | ############################################################################# 8 | library(gdsfmt) 9 | library(SeqArray) 10 | 11 | #import configuration file 12 | source('config.R') 13 | 14 | #vcf.chr10.fn=as.character(commandArgs(TRUE)[1]) 15 | #gds.chr10.fn=as.character(commandArgs(TRUE)[2]) 16 | CHRN=as.character(commandArgs(TRUE)[1]) 17 | seqVCF2GDS(eval(parse(text = paste0("vcf.chr",CHRN,".fn"))), eval(parse(text = paste0("gds.chr",CHRN,".fn"))), header = NULL, genotype.var.name = "GT", info.import=NULL, fmt.import=NULL, ignore.chr.prefix="chr", raise.error=TRUE, verbose=TRUE) 18 | genofile<-seqOpen(eval(parse(text = paste0("gds.chr",CHRN,".fn"))), readonly = FALSE) 19 | print("GDS built") 20 | 21 | ###Closing Up### 22 | genofile 23 | seqClose(genofile) 24 | -------------------------------------------------------------------------------- /Scripts/SQL/convertVCFtoNullGenotypeGDS.r: -------------------------------------------------------------------------------- 1 | ############################################################################# 2 | #Title: convertVCFtoGDS 3 | #Function: 4 | # * Build the GDS file from VCF files 5 | #Author: Hufeng Zhou 6 | #Time: Nov 27th 2021 7 | ############################################################################# 8 | library(gdsfmt) 9 | 
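# NOTE: the vcf.fn/gds.fn assignments below are commented out and config.R only defines the
# per-chromosome vcf.chrN.fn/gds.chrN.fn names, so vcf.fn and gds.fn must be defined (or read from
# the command line) before the seqVCF2GDS() call below will run.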
library(SeqArray) 10 | 11 | #import configuration file 12 | source('config.R') 13 | 14 | #vcf.fn=as.character(commandArgs(TRUE)[1]) 15 | #gds.fn=as.character(commandArgs(TRUE)[2]) 16 | nogenotype.fn=as.character(commandArgs(TRUE)[2]) 17 | 18 | seqVCF2GDS(vcf.fn, gds.fn, header = NULL, genotype.var.name = "GT", info.import=NULL, fmt.import=NULL, ignore.chr.prefix="chr", raise.error=TRUE, verbose=TRUE) 19 | genofile<-seqOpen(gds.fn, readonly = FALSE) 20 | print("GDS built") 21 | 22 | ############################################################################# 23 | # Remove samples/genotype data from the full GDS file 24 | ############################################################################# 25 | seqSetFilter(genofile,sample.id=character(0)) 26 | seqExport(genofile,nogenotype.fn,fmt.var=character(),samp.var=character(0),optimize=TRUE,digest=TRUE,verbose=TRUE) 27 | seqClose(genofile) 28 | 29 | genofile<-seqOpen(nogenotype.fn, readonly = FALSE) 30 | ###Closing Up### 31 | genofile 32 | seqClose(genofile) 33 | -------------------------------------------------------------------------------- /Scripts/SQL/importCommands.sql: -------------------------------------------------------------------------------- 1 | /* Title: Import the database into postgreSQL 2 | * Time: April 29th 2021 3 | * Author: Ted and Hufeng 4 | */ 5 | 6 | psql -h localhost -p portnumber -d FAVORV2 7 | 8 | CREATE TABLE MAIN( 9 | variant_vcf text, 10 | chromosome text, 11 | position integer, 12 | ref_vcf text, 13 | alt_vcf text, 14 | apc_conservation numeric, 15 | apc_conservation_v2 numeric, 16 | apc_epigenetics numeric, 17 | apc_epigenetics_active numeric, 18 | apc_epigenetics_repressed numeric, 19 | apc_epigenetics_transcription numeric, 20 | apc_local_nucleotide_diversity numeric, 21 | apc_local_nucleotide_diversity_v2 numeric, 22 | apc_local_nucleotide_diversity_v3 numeric, 23 | apc_mappability numeric, 24 | apc_micro_rna numeric, 25 | apc_mutation_density numeric, 26 | apc_protein_function numeric, 27 | apc_proximity_to_coding numeric, 28 | apc_proximity_to_coding_v2 numeric, 29 | apc_proximity_to_tsstes numeric, 30 | apc_transcription_factor numeric, 31 | cage_promoter text, 32 | cage_tc text, 33 | metasvm_pred text, 34 | rsid text, 35 | fathmm_xf numeric, 36 | genecode_comprehensive_category text, 37 | genecode_comprehensive_info text, 38 | genecode_comprehensive_exonic_info text, 39 | genecode_comprehensive_exonic_category text, 40 | genehancer text, 41 | linsight numeric, 42 | cadd_phred numeric, 43 | rdhs text); 44 | 45 | COPY main FROM '/n/holystore01/LABS/xlin/Lab/zhouhufeng/DB/FAVORannotator/NewDB/FAVORAnnotatorDB.22.txt' CSV HEADER; 46 | 47 | CREATE VIEW offline_view AS SELECT * FROM main; 48 | 49 | CREATE INDEX ON main USING HASH(variant_vcf); 50 | 51 | CREATE USER annotator WITH SUPERUSER PASSWORD 'DoMeAFAVOR'; 52 | -------------------------------------------------------------------------------- /Scripts/SQL/submitJobs.sh: -------------------------------------------------------------------------------- 1 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=65000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 1; Rscript ./FAVORannotatorv2aGDS.r 1' 2 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=65000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 2; Rscript ./FAVORannotatorv2aGDS.r 2' 3 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=60000 --wrap='module load 
postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 3; Rscript ./FAVORannotatorv2aGDS.r 3' 4 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=60000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 4; Rscript ./FAVORannotatorv2aGDS.r 4' 5 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=55000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 5; Rscript ./FAVORannotatorv2aGDS.r 6' 6 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=50000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 6; Rscript ./FAVORannotatorv2aGDS.r 5' 7 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=50000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 7; Rscript ./FAVORannotatorv2aGDS.r 7' 8 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=50000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 8; Rscript ./FAVORannotatorv2aGDS.r 8' 9 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=40000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 9; Rscript ./FAVORannotatorv2aGDS.r 9' 10 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=40000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 10; Rscript ./FAVORannotatorv2aGDS.r 10' 11 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=40000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 11; Rscript ./FAVORannotatorv2aGDS.r 11' 12 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=40000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 12; Rscript ./FAVORannotatorv2aGDS.r 12' 13 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=40000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 13; Rscript ./FAVORannotatorv2aGDS.r 13' 14 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=35000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 14; Rscript ./FAVORannotatorv2aGDS.r 14' 15 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=35000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 15; Rscript ./FAVORannotatorv2aGDS.r 15' 16 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=30000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 16; Rscript ./FAVORannotatorv2aGDS.r 16' 17 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=30000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 17; Rscript ./FAVORannotatorv2aGDS.r 17' 18 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=30000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 18; Rscript ./FAVORannotatorv2aGDS.r 18' 19 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=30000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 19; Rscript ./FAVORannotatorv2aGDS.r 19' 20 | sbatch -n 1 -N 1 -t 10000 -p sharedn 
--mail-type=ALL --mem=20000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 20; Rscript ./FAVORannotatorv2aGDS.r 20' 21 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=20000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 21; Rscript ./FAVORannotatorv2aGDS.r 21' 22 | sbatch -n 1 -N 1 -t 10000 -p sharedn --mail-type=ALL --mem=20000 --wrap='module load postgresql/12.2-fasrc01; module load R/4.0.2-fasrc01; Rscript ./convertVCFtoGDS.r 22; Rscript ./FAVORannotatorv2aGDS.r 22' 23 | -------------------------------------------------------------------------------- /Scripts/UTL/FAVORannotatorAddIn.R: -------------------------------------------------------------------------------- 1 | ############################################################################# 2 | #Title: FAVORannotatorAddIn 3 | #Function: 4 | # * Add in additional functional annotation into the aGDS file 5 | #Author: Hufeng Zhou 6 | #Time: Aug 27th 2022 7 | ############################################################################# 8 | 9 | args <- commandArgs(TRUE) 10 | ### mandatory 11 | 12 | gds.file <- args[1] 13 | print(paste0("gds.file: ",gds.file)) 14 | 15 | anno.file <- args[2] 16 | print(paste0("anno.file: ",anno.file)) 17 | 18 | 19 | start_time <- Sys.time() 20 | use_compression <- "Yes" 21 | print(paste0("use_compression: ",use_compression)) 22 | 23 | ### annotation file 24 | dir_anno <- "./" 25 | 26 | ### load required package 27 | library(gdsfmt) 28 | library(SeqArray) 29 | library(readr) 30 | 31 | ### read annotation data 32 | #FunctionalAnnotation <- read_csv(paste0(dir_anno,"chr",chr,"/Anno_chr",chr,".csv")) 33 | FunctionalAnnotation <- read_delim(anno.file,delim = NULL) 34 | 35 | dim(FunctionalAnnotation) 36 | 37 | ## open GDS 38 | print("Before Adding Functional Annotation") 39 | genofile <- seqOpen(gds.file, readonly = FALSE) 40 | print("Working on Adding") 41 | genofile 42 | 43 | Anno.folder <- index.gdsn(genofile, "annotation/info") 44 | add.gdsn(Anno.folder, "NewAnnotation", val=FunctionalAnnotation, compress="LZMA_ra", closezip=TRUE) 45 | 46 | genofile 47 | 48 | print("Add in Functional Annotation") 49 | 50 | seqClose(genofile) 51 | end_time <- Sys.time() 52 | 53 | print("time") 54 | end_time - start_time 55 | 56 | -------------------------------------------------------------------------------- /Scripts/UTL/convBCF2GDS.r: -------------------------------------------------------------------------------- 1 | ############################################################################# 2 | #Title: convertVCFtoGDS 3 | #Function: 4 | # * Build the GDS file from BCF files 5 | #Author: Hufeng Zhou 6 | #Time: Aug 27th 2022 7 | # This only runs on single core, therefore very slow. 
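# Example invocation (hypothetical paths; bcftools must be on PATH for seqBCF2GDS()):
#   Rscript convBCF2GDS.r input.bcf output.gds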
8 | ############################################################################# 9 | library(gdsfmt) 10 | library(SeqArray) 11 | 12 | vcf.fn=as.character(commandArgs(TRUE)[1]) 13 | gds.fn=as.character(commandArgs(TRUE)[2]) 14 | seqBCF2GDS(vcf.fn, gds.fn, storage.option="LZMA_RA", bcftools="bcftools") 15 | genofile<-seqOpen(gds.fn, readonly = FALSE) 16 | print("GDS built") 17 | 18 | ###Closing Up### 19 | genofile 20 | seqClose(genofile) 21 | -------------------------------------------------------------------------------- /Scripts/UTL/convertVCFtoGDS.r: -------------------------------------------------------------------------------- 1 | ############################################################################# 2 | #Title: convertVCFtoGDS 3 | #Function: 4 | # * Build the GDS file from VCF files 5 | #Author: Hufeng Zhou 6 | #Time: Aug 27th 2022 7 | ############################################################################# 8 | library(gdsfmt) 9 | library(SeqArray) 10 | 11 | vcf.fn=as.character(commandArgs(TRUE)[1]) 12 | gds.fn=as.character(commandArgs(TRUE)[2]) 13 | seqVCF2GDS(vcf.fn, gds.fn, parallel=10) 14 | genofile<-seqOpen(gds.fn, readonly = FALSE) 15 | print("GDS built") 16 | 17 | ###Closing Up### 18 | genofile 19 | seqClose(genofile) 20 | -------------------------------------------------------------------------------- /Scripts/UTL/convertVCFtoNullGenotypeGDS.r: -------------------------------------------------------------------------------- 1 | ############################################################################# 2 | #Title: convertVCFtoGDS 3 | #Function: 4 | # * Build the GDS file from VCF files 5 | #Author: Hufeng Zhou 6 | #Time: Aug 27th 2021 7 | ############################################################################# 8 | library(gdsfmt) 9 | library(SeqArray) 10 | 11 | 12 | vcf.fn=as.character(commandArgs(TRUE)[1]) 13 | gds.fn=as.character(commandArgs(TRUE)[2]) 14 | nogenotype.fn=as.character(commandArgs(TRUE)[2]) 15 | 16 | seqVCF2GDS(vcf.fn, gds.fn, header = NULL, genotype.var.name = "GT", info.import=NULL, fmt.import=NULL, ignore.chr.prefix="chr", raise.error=TRUE, verbose=TRUE) 17 | genofile<-seqOpen(gds.fn, readonly = FALSE) 18 | print("GDS built") 19 | 20 | ############################################################################# 21 | # Remove samples/genotype data from the full GDS file 22 | ############################################################################# 23 | seqSetFilter(genofile,sample.id=character(0)) 24 | seqExport(genofile,nogenotype.fn,fmt.var=character(),samp.var=character(0),optimize=TRUE,digest=TRUE,verbose=TRUE) 25 | seqClose(genofile) 26 | 27 | genofile<-seqOpen(nogenotype.fn, readonly = FALSE) 28 | ###Closing Up### 29 | genofile 30 | seqClose(genofile) 31 | -------------------------------------------------------------------------------- /Scripts/UTL/convertaGDStoVCF.r: -------------------------------------------------------------------------------- 1 | ############################################################################# 2 | #Title: convertVCFtoGDS 3 | #Function: 4 | # * Build the GDS file from VCF files 5 | #Author: Hufeng Zhou 6 | #Time: Aug 27th 2022 7 | ############################################################################# 8 | library(gdsfmt) 9 | library(SeqArray) 10 | 11 | gds.fn=as.character(commandArgs(TRUE)[1]) 12 | vcf.fn=as.character(commandArgs(TRUE)[2]) 13 | #seqVCF2GDS(vcf.fn, gds.fn, parallel=10) 14 | genofile<-seqOpen(gds.fn, readonly = FALSE) 15 | 16 | ###Closing Up### 17 | genofile 18 | 
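# (the GDS file is opened above only to print its summary; the conversion itself is done by seqGDS2VCF() after it is closed)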
seqClose(genofile) 19 | 20 | ###Write Out### 21 | seqGDS2VCF(gds.fn,vcf.fn) 22 | print("GDS built") 23 | 24 | -------------------------------------------------------------------------------- /Scripts/UTL/preProcessingVCF.sh: -------------------------------------------------------------------------------- 1 | #Fixed Headers [make sure all fields are defined in header]. 2 | #Remove Duplicated VCFs [Make sure there is no duplicated VCF files]. 3 | 4 | #Remove FORMAT variables but only keep GT [multi-core] 5 | for fl in ukb23156_c19_b*_v1.vcf.gz; do bcftools annotate -x ^FORMAT/GT $fl --threads 12 -Oz -o ./CVCF/$fl &; done 6 | 7 | #Concat the smaller VCFs (sliced by variants) within each study into one VCF file [24 mins] 8 | bcftools concat --threads 12 ./CVCF/ukb23156_c19_b*_v1.vcf.gz -Oz -o ./ConcatVCF/ukb23156_c19_c12.vcf.gz 9 | 10 | #Break the multi-allelic sites into multiple rows of all the VCFs of each study [Indexed VCFs]. 11 | bcftools norm -m -any --threads 12 ./ConcatVCF/ukb23156_c19_c12.vcf.gz -Oz -o ./ConcatVCF/ukb23156_c19_c12.bk.vcf.gz 12 | 13 | #Normalize (left) the broken multi-allelic VCFs [Indexed VCFs]. 14 | bcftools norm -f --threads 12 hg38.p13.fa ./ConcatVCF/ukb23156_c19_c12.bk.vcf.gz -Oz -o ./ConcatVCF/ukb23156_c19_c12.bk.nm.vcf.gz 15 | 16 | #Indexed cleaned VCFs 17 | bcftools index ./ConcatVCF/ukb23156_c19_c12.bk.nm.vcf.gz 18 | 19 | #Sliced the Normalized VCFs into each chromosome. [Indexed VCFs] 20 | bcftools view -r chr19 ./ConcatVCF/ukb23156_c12.bk.nm.vcf.gz -Oz -o ./ConcatVCF/ukb23156_c19_c12.bk.nm.vcf.gz 21 | 22 | #Merge the Normalized VCFs (sliced by different samples) of each study into one VCF (per chromosome). 23 | bcftools merge -m all --threads 6 ./DifferentStudies/ukbb*.bk.nm.vcf.gz -Oz -o ./MergedVCF/ukbb.merged.bk.nm.vcf.gz 24 | 25 | #Convert the merged VCFs per chromosomes into GDSs (per chromosome) [72 mins]. 26 | Rscripts ./convertVCFtoGDS.r ./MergedVCF/ukbb.merged.bk.nm.vcf.gz ./MergedGDS/ukbb.merged.bk.nm.gds 27 | --------------------------------------------------------------------------------