├── 22.04_environment.yml ├── Dockerfile ├── LICENSE ├── README.md ├── additional_scripts ├── annotate_gtf.py ├── blast_parser.py ├── dexseq_count.py ├── dexseq_prepare_annotation_fixed.py ├── download_proteome_uniprot.pl ├── homolog_parser.py └── transcriptome_metrics.sh ├── bash_scripts ├── add_ncbi_annotation.sh ├── annotate_my_genomes.sh ├── genome_download.sh ├── genome_download_macOSX.sh ├── get_transcripts.sh └── isoform_identification.sh ├── data_examples ├── braker_chr33.gtf ├── gene_counts ├── gene_counts_GSE114129 └── transcripts.gtf.gz ├── environment.yml ├── makefile.nf ├── makefile.sh ├── nextflow_scripts ├── 22.04_environment.yml ├── add-ncbi-annotation.nf ├── annotate-my-genomes.nf ├── environment.yml ├── genome-download.nf ├── isoform-identification.nf └── old │ ├── add-ncbi-annotation.nf │ ├── annotate-my-genomes.nf │ └── isoform-identification.nf └── test ├── gawn_config.sh └── stringtie_chr33.gtf /22.04_environment.yml: -------------------------------------------------------------------------------- 1 | name: annotate_my_genomes 2 | channels: 3 | - anaconda 4 | - conda-forge 5 | - bioconda 6 | - defaults 7 | - r 8 | dependencies: 9 | - _libgcc_mutex=0.1 10 | - _openmp_mutex=4.5 11 | - _r-mutex=1.0.1 12 | - argtable2=2.13 13 | - atk-1.0=2.36.0 14 | - bedtools=2.30.0 15 | - binutils_impl_linux-64=2.39 16 | - binutils_linux-64=2.39 17 | - bottleneck=1.3.5 18 | - bwidget=1.9.14 19 | - bzip2=1.0.8 20 | - c-ares=1.18.1 21 | - ca-certificates=2022.07.19 22 | - cairo=1.16.0 23 | - clustalo=1.2.4 24 | - clustalw=2.1 25 | - coreutils=9.1 26 | - curl=7.83.1 27 | - emboss=6.6.0 28 | - expat=2.5.0 29 | - fasta_ushuffle=0.2 30 | - feelnc=0.2 31 | - font-ttf-dejavu-sans-mono=2.37 32 | - font-ttf-inconsolata=3.000 33 | - font-ttf-source-code-pro=2.038 34 | - font-ttf-ubuntu=0.83 35 | - fontconfig=2.14.0 36 | - fonts-conda-ecosystem=1 37 | - fonts-conda-forge=1 38 | - freetype=2.10.4 39 | - fribidi=1.0.10 40 | - gawk=5.1.0 41 | - gcc_impl_linux-64=10.4.0 42 | - gcc_linux-64=10.4.0 43 | - gdk-pixbuf=2.42.8 44 | - gettext=0.21.1 45 | - gffcompare=0.11.2 46 | - gffread=0.12.7 47 | - gfortran_impl_linux-64=10.4.0 48 | - gfortran_linux-64=10.4.0 49 | - giflib=5.2.1 50 | - gmap=2021.08.25 51 | - graphite2=1.3.13 52 | - graphviz=2.50.0 53 | - gsl=2.7 54 | - gtk2=2.24.33 55 | - gts=0.7.6 56 | - gxx_impl_linux-64=10.4.0 57 | - gxx_linux-64=10.4.0 58 | - harfbuzz=4.4.1 59 | - htslib=1.14 60 | - icu=70.1 61 | - jpeg=9e 62 | - k8=0.2.5 63 | - kernel-headers_linux-64=2.6.32 64 | - keyutils=1.6.1 65 | - kmerinshort=1.0.1 66 | - krb5=1.19.3 67 | - ld_impl_linux-64=2.39 68 | - lerc=3.0 69 | - libblas=3.9.0 70 | - libcblas=3.9.0 71 | - libcurl=7.83.1 72 | - libdb=6.2.32 73 | - libdeflate=1.10 74 | - libedit=3.1.20191231 75 | - libev=4.33 76 | - libffi=3.4.2 77 | - libgcc=7.2.0 78 | - libgcc-devel_linux-64=10.4.0 79 | - libgcc-ng=12.2.0 80 | - libgd=2.3.3 81 | - libgfortran-ng=12.2.0 82 | - libgfortran5=12.2.0 83 | - libglib=2.70.2 84 | - libgomp=12.2.0 85 | - libiconv=1.17 86 | - liblapack=3.9.0 87 | - libnghttp2=1.47.0 88 | - libnsl=2.0.0 89 | - libopenblas=0.3.21 90 | - libpng=1.6.37 91 | - librsvg=2.54.4 92 | - libsanitizer=10.4.0 93 | - libssh2=1.10.0 94 | - libstdcxx-devel_linux-64=10.4.0 95 | - libstdcxx-ng=12.2.0 96 | - libtiff=4.3.0 97 | - libtool=2.4.6 98 | - libuuid=2.32.1 99 | - libwebp=1.2.2 100 | - libwebp-base=1.2.2 101 | - libxcb=1.13 102 | - libxml2=2.9.14 103 | - libxslt=1.1.33 104 | - libzlib=1.2.11 105 | - lz4-c=1.9.3 106 | - make=4.3 107 | - minimap2=2.24 108 | - ncurses=6.3 109 | - 
nomkl=1.0 110 | - numexpr=2.8.3 111 | - numpy=1.21.6 112 | - openssl=1.1.1q 113 | - packaging=21.3 114 | - paml=4.9 115 | - pandas=1.3.5 116 | - pango=1.50.8 117 | - parallel=20220922 118 | - pcre=8.45 119 | - pcre2=10.37 120 | - perl=5.26.2 121 | - perl-aceperl=1.92 122 | - perl-algorithm-diff=1.1903 123 | - perl-algorithm-munkres=0.08 124 | - perl-apache-test=1.40 125 | - perl-app-cpanminus=1.7044 126 | - perl-appconfig=1.71 127 | - perl-array-compare=3.0.1 128 | - perl-autoloader=5.74 129 | - perl-base=2.23 130 | - perl-bio-asn1-entrezgene=1.73 131 | - perl-bio-coordinate=1.007001 132 | - perl-bio-featureio=1.6.905 133 | - perl-bio-phylo=0.58 134 | - perl-bio-samtools=1.43 135 | - perl-bio-tools-phylo-paml=1.7.3 136 | - perl-bio-tools-run-alignment-clustalw=1.7.4 137 | - perl-bio-tools-run-alignment-tcoffee=1.7.4 138 | - perl-bioperl=1.7.2 139 | - perl-bioperl-core=1.007002 140 | - perl-bioperl-run=1.007002 141 | - perl-business-isbn=3.004 142 | - perl-business-isbn-data=20140910.003 143 | - perl-cache-cache=1.08 144 | - perl-capture-tiny=0.48 145 | - perl-carp=1.38 146 | - perl-cgi=4.44 147 | - perl-class-data-inheritable=0.08 148 | - perl-class-inspector=1.34 149 | - perl-class-load=0.25 150 | - perl-class-load-xs=0.10 151 | - perl-class-method-modifiers=2.12 152 | - perl-clone=0.42 153 | - perl-common-sense=3.74 154 | - perl-compress-raw-zlib=2.087 155 | - perl-constant=1.33 156 | - perl-convert-binary-c=0.78 157 | - perl-convert-binhex=1.125 158 | - perl-crypt-rc4=2.02 159 | - perl-data-dumper=2.173 160 | - perl-data-optlist=0.110 161 | - perl-data-stag=0.14 162 | - perl-date-format=2.30 163 | - perl-db-file=1.855 164 | - perl-dbd-sqlite=1.64 165 | - perl-dbi=1.642 166 | - perl-devel-globaldestruction=0.14 167 | - perl-devel-overloadinfo=0.005 168 | - perl-devel-stacktrace=2.04 169 | - perl-digest-hmac=1.03 170 | - perl-digest-md5=2.55 171 | - perl-digest-perl-md5=1.9 172 | - perl-digest-sha1=2.13 173 | - perl-dist-checkconflicts=0.11 174 | - perl-dynaloader=1.25 175 | - perl-email-date-format=1.005 176 | - perl-encode=2.88 177 | - perl-encode-locale=1.05 178 | - perl-error=0.17027 179 | - perl-eval-closure=0.14 180 | - perl-exception-class=1.44 181 | - perl-exporter=5.72 182 | - perl-exporter-tiny=1.002001 183 | - perl-extutils-makemaker=7.36 184 | - perl-file-listing=6.04 185 | - perl-file-path=2.16 186 | - perl-file-slurp-tiny=0.004 187 | - perl-file-sort=1.01 188 | - perl-file-temp=0.2304 189 | - perl-file-which=1.23 190 | - perl-font-afm=1.20 191 | - perl-font-ttf=1.06 192 | - perl-gd=2.68 193 | - perl-getopt-long=2.50 194 | - perl-graph=0.9704 195 | - perl-graphviz=2.24 196 | - perl-html-element-extended=1.18 197 | - perl-html-entities-numbered=0.04 198 | - perl-html-formatter=2.16 199 | - perl-html-parser=3.72 200 | - perl-html-tableextract=2.13 201 | - perl-html-tagset=3.20 202 | - perl-html-tidy=1.60 203 | - perl-html-tree=5.07 204 | - perl-html-treebuilder-xpath=0.14 205 | - perl-http-cookies=6.04 206 | - perl-http-daemon=6.01 207 | - perl-http-date=6.02 208 | - perl-http-message=6.18 209 | - perl-http-negotiate=6.01 210 | - perl-image-info=1.38 211 | - perl-image-size=3.300 212 | - perl-io-html=1.001 213 | - perl-io-sessiondata=1.03 214 | - perl-io-socket-ssl=2.066 215 | - perl-io-string=1.08 216 | - perl-io-stringy=2.111 217 | - perl-io-tty=1.12 218 | - perl-ipc-run=20180523.0 219 | - perl-ipc-sharelite=0.17 220 | - perl-jcode=2.07 221 | - perl-json=4.02 222 | - perl-json-xs=2.34 223 | - perl-lib=0.63 224 | - perl-libwww-perl=6.39 225 | - perl-libxml-perl=0.08 226 | - 
perl-list-moreutils=0.428 227 | - perl-list-moreutils-xs=0.428 228 | - perl-local-lib=2.000024 229 | - perl-lwp-mediatypes=6.04 230 | - perl-lwp-protocol-https=6.07 231 | - perl-lwp-simple=6.15 232 | - perl-mailtools=2.21 233 | - perl-math-cdf=0.1 234 | - perl-math-derivative=1.01 235 | - perl-math-random=0.72 236 | - perl-math-spline=0.02 237 | - perl-mime-base64=3.15 238 | - perl-mime-lite=3.030 239 | - perl-mime-tools=5.508 240 | - perl-mime-types=2.17 241 | - perl-mldbm=2.05 242 | - perl-module-build=0.4224 243 | - perl-module-implementation=0.09 244 | - perl-module-runtime=0.016 245 | - perl-module-runtime-conflicts=0.003 246 | - perl-moo=2.003004 247 | - perl-moose=2.2011 248 | - perl-mozilla-ca=20180117 249 | - perl-mro-compat=0.13 250 | - perl-net-http=6.19 251 | - perl-net-ssleay=1.88 252 | - perl-ntlm=1.09 253 | - perl-ole-storage_lite=0.19 254 | - perl-package-deprecationmanager=0.17 255 | - perl-package-stash=0.38 256 | - perl-package-stash-xs=0.28 257 | - perl-parallel-forkmanager=2.02 258 | - perl-params-util=1.07 259 | - perl-parent=0.236 260 | - perl-parse-recdescent=1.967015 261 | - perl-pathtools=3.75 262 | - perl-pdf-api2=2.035 263 | - perl-pod-escapes=1.07 264 | - perl-pod-usage=1.69 265 | - perl-postscript=0.06 266 | - perl-role-tiny=2.000008 267 | - perl-scalar-list-utils=1.52 268 | - perl-set-scalar=1.29 269 | - perl-soap-lite=1.19 270 | - perl-socket=2.027 271 | - perl-sort-naturally=1.03 272 | - perl-spreadsheet-parseexcel=0.65 273 | - perl-spreadsheet-writeexcel=2.40 274 | - perl-statistics-descriptive=3.0702 275 | - perl-storable=3.15 276 | - perl-sub-exporter=0.987 277 | - perl-sub-exporter-progressive=0.001013 278 | - perl-sub-identify=0.14 279 | - perl-sub-install=0.928 280 | - perl-sub-name=0.21 281 | - perl-sub-quote=2.006003 282 | - perl-sub-uplevel=0.2800 283 | - perl-svg=2.84 284 | - perl-svg-graph=0.02 285 | - perl-task-weaken=1.06 286 | - perl-template-toolkit=2.26 287 | - perl-test=1.26 288 | - perl-test-deep=1.128 289 | - perl-test-differences=0.67 290 | - perl-test-exception=0.43 291 | - perl-test-harness=3.42 292 | - perl-test-leaktrace=0.16 293 | - perl-test-most=0.35 294 | - perl-test-requiresinternet=0.05 295 | - perl-test-warn=0.36 296 | - perl-text-diff=1.45 297 | - perl-tie-ixhash=1.23 298 | - perl-time-hires=1.9760 299 | - perl-time-local=1.28 300 | - perl-timedate=2.30 301 | - perl-tree-dag_node=1.31 302 | - perl-try-tiny=0.30 303 | - perl-type-tiny=1.004004 304 | - perl-types-serialiser=1.0 305 | - perl-unicode-map=0.112 306 | - perl-uri=1.76 307 | - perl-www-robotrules=6.02 308 | - perl-xml-dom=1.46 309 | - perl-xml-dom-xpath=0.14 310 | - perl-xml-filter-buffertext=1.01 311 | - perl-xml-libxml=2.0132 312 | - perl-xml-libxslt=1.94 313 | - perl-xml-namespacesupport=1.12 314 | - perl-xml-parser=2.44_01 315 | - perl-xml-regexp=0.04 316 | - perl-xml-sax=1.02 317 | - perl-xml-sax-base=1.09 318 | - perl-xml-sax-expat=0.51 319 | - perl-xml-sax-writer=0.57 320 | - perl-xml-simple=2.25 321 | - perl-xml-twig=3.52 322 | - perl-xml-writer=0.625 323 | - perl-xml-xpath=1.44 324 | - perl-xml-xpathengine=0.14 325 | - perl-xsloader=0.24 326 | - perl-yaml=1.29 327 | - pip=22.3.1 328 | - pixman=0.40.0 329 | - pthread-stubs=0.4 330 | - pyparsing=3.0.4 331 | - python=3.7.12 332 | - python-dateutil=2.8.2 333 | - python_abi=3.7 334 | - pytz=2022.1 335 | - r-base=4.1.3 336 | - r-bitops=1.0_7 337 | - r-catools=1.18.2 338 | - r-gplots=3.1.3 339 | - r-gtools=3.9.3 340 | - r-kernsmooth=2.23_20 341 | - r-randomforest=4.7_1.1 342 | - r-rocr=1.0_11 343 | - readline=8.1.2 
344 | - sed=4.8 345 | - seqkit=2.3.1 346 | - setuptools=65.5.1 347 | - six=1.16.0 348 | - sqlite=3.38.5 349 | - stringtie=2.2.1 350 | - sysroot_linux-64=2.12 351 | - t_coffee=11.0.8 352 | - tidyp=1.04 353 | - tk=8.6.12 354 | - tktable=2.10 355 | - transdecoder=5.5.0 356 | - wheel=0.38.4 357 | - xorg-kbproto=1.0.7 358 | - xorg-libice=1.0.10 359 | - xorg-libsm=1.2.3 360 | - xorg-libx11=1.7.2 361 | - xorg-libxau=1.0.9 362 | - xorg-libxdmcp=1.1.3 363 | - xorg-libxext=1.3.4 364 | - xorg-libxrender=0.9.10 365 | - xorg-libxt=1.2.1 366 | - xorg-renderproto=0.11.1 367 | - xorg-xextproto=7.3.0 368 | - xorg-xproto=7.0.31 369 | - xz=5.2.6 370 | - zlib=1.2.11 371 | - zstd=1.5.2 372 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | ARG UBUNTU_VER=20.04 2 | ARG CONDA_VER=latest 3 | ARG OS_TYPE=x86_64 4 | ARG PY_VER=3.8.11 5 | ARG TF_VER=2.5.0 6 | 7 | FROM ubuntu:${UBUNTU_VER} 8 | 9 | # System packages 10 | ARG DEBIAN_FRONTEND=noninteractive 11 | ENV TZ=Etc 12 | RUN apt-get update && apt-get install -yq build-essential g++ python-dev autotools-dev libicu-dev libbz2-dev libboost-all-dev zlib1g-dev curl wget unzip sed jq vim nano libidn11 libnet-perl perl-doc liblmdb-dev && apt-get install -y git && apt install -y make && apt install -y autoconf 13 | RUN apt install -y parallel 14 | 15 | 16 | # Install make 17 | RUN apt update && apt install -y make && apt install -y autoconf 18 | 19 | # cmake 20 | RUN wget https://github.com/Kitware/CMake/releases/download/v3.17.3/cmake-3.17.3.tar.gz --no-check-certificate && tar -zxvf cmake-3.17.3.tar.gz && cd cmake-3.17.3 && apt-get install libssl-dev && ./bootstrap && make && make install && cd / 21 | 22 | # R and dependences 23 | RUN apt install -y dirmngr gnupg apt-transport-https ca-certificates software-properties-common && apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 && add-apt-repository 'deb https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/' && apt install -y r-base 24 | RUN R -e "install.packages('ROCR',dependencies=TRUE, repos='http://cran.rstudio.com/')" 25 | RUN R -e "install.packages('randomForest',dependencies=TRUE, repos='http://cran.rstudio.com/')" 26 | 27 | # stringtie 28 | RUN git clone https://github.com/gpertea/stringtie && cd stringtie && make release && make test && ./run_tests.sh && cp stringtie /usr/local/bin/ && cp stringtie /usr/bin/ && cd SuperReads_RNA && ./install.sh && cd / 29 | 30 | # FEELnc 31 | RUN apt-get install -y libcurl4 libcurl4-openssl-dev && apt-get install -y libxml-dom-xpath-perl && apt-get install -y cpanminus 32 | RUN cpanm Parallel::ForkManager Bio::DB::SeqFeature 33 | 34 | # Installing KmerInShort 35 | RUN git clone --recursive https://github.com/rizkg/KmerInShort && cd KmerInShort && mkdir build; cd build; cmake ..; make -j 8 && cp KmerInShort /usr/local/bin/ && cp KmerInShort /usr/bin/ && cd / 36 | 37 | # Installing fasta_ushuffle 38 | RUN wget -O fasta_ushuffle.zip https://github.com/agordon/fasta_ushuffle/archive/refs/heads/master.zip --no-check-certificate && unzip fasta_ushuffle.zip && cd fasta_ushuffle-master/ && make && cp fasta_ushuffle ushuffle /usr/local/bin/ && cp fasta_ushuffle ushuffle /usr/bin/ && cd / 39 | 40 | # Installing FEELnc 41 | RUN git clone https://github.com/tderrien/FEELnc.git && cd /FEELnc && export FEELNCPATH=$(pwd) && export PERL5LIB=$PERL5LIB:${FEELNCPATH}/lib/ && export PATH=$PATH:${FEELNCPATH}/scripts/ 
&& export PATH=$PATH:${FEELNCPATH}/utils/ && export PATH=$PATH:${FEELNCPATH}/bin/LINUX/ && cp -r ${FEELNCPATH}/bin/LINUX/ ~/bin/ 42 | ENV PATH=/FEELnc/bin/LINUX:${PATH} 43 | ENV FEELNCPATH=/FEELnc 44 | ENV PERL5LIB=:/FEELnc/lib/ 45 | ENV PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/FEELnc/bin/LINUX:/FEELnc/utils:/FEELnc/scripts/ 46 | 47 | # FEELnc Test 48 | RUN cd /FEELnc/test/ && FEELnc_filter.pl -i transcript_chr38.gtf -a annotation_chr38.gtf -b transcript_biotype=protein_coding > candidate_lncRNA.gtf && FEELnc_codpot.pl -i candidate_lncRNA.gtf -a annotation_chr38.gtf -b transcript_biotype=protein_coding -g genome_chr38.fa --mode=shuffle && FEELnc_classifier.pl -i feelnc_codpot_out/candidate_lncRNA.gtf.lncRNA.gtf -a annotation_chr38.gtf > candidate_lncRNA_classes.txt && cd / 49 | 50 | # Installing gffcompare and gclib 51 | RUN git clone https://github.com/gpertea/gclib && git clone https://github.com/gpertea/gffcompare && git clone https://github.com/gpertea/gffread 52 | RUN cd /gffcompare && make release && cp gffcompare trmap /usr/local/bin/ && cp gffcompare trmap /usr/bin/ && cd / 53 | RUN cd /gffread && make release && cp gffread /usr/local/bin/ && cd / 54 | 55 | # Installing ncbi-blast+ 56 | RUN apt-get remove -y ncbi-blast+ 57 | RUN apt-get install -y ncbi-blast+ 58 | 59 | # gmap 60 | RUN apt-get install -y gmap 61 | 62 | # bedtools 63 | RUN apt-get install -y bedtools 64 | 65 | # samtools 66 | RUN apt-get install -y samtools && apt-get install -y bcftools 67 | 68 | # transdecoder (TransDecoder.LongOrfs 5.5.0) 69 | RUN wget https://github.com/TransDecoder/TransDecoder/archive/refs/tags/TransDecoder-v5.5.0.zip --no-check-certificate && unzip TransDecoder-v5.5.0.zip && mv TransDecoder-TransDecoder-v5.5.0 TransDecoder-v5.5.0 && apt-get install -y hmmer 70 | RUN cd /TransDecoder-v5.5.0 && ln -s /TransDecoder-v5.5.0/TransDecoder.LongOrfs /usr/local/bin/ && ln -s /TransDecoder-v5.5.0/TransDecoder.Predict /usr/local/bin/ && ln -s /TransDecoder-v5.5.0/TransDecoder.LongOrfs /usr/bin/ && ln -s /TransDecoder-v5.5.0/TransDecoder.Predict /usr/bin/ 71 | 72 | # seqkit 73 | RUN wget https://github.com/shenwei356/seqkit/releases/download/v0.12.1/seqkit_linux_386.tar.gz --no-check-certificate && gunzip seqkit_linux_386.tar.gz && tar -xvf seqkit_linux_386.tar && cp seqkit /usr/local/bin/ && cp seqkit /usr/bin/ && cd / 74 | 75 | # emboss 76 | RUN apt-get install -y emboss 77 | 78 | # Clustalo 79 | RUN apt-get install -y clustalo 80 | 81 | # Cufflinks 82 | RUN apt-get install -y cufflinks 83 | 84 | # gawk 85 | RUN apt install -y gawk 86 | 87 | # minimap2 88 | RUN apt-get -y install minimap2 89 | 90 | # pandas 91 | RUN apt-get -y install python3-pip 92 | RUN pip install pandas 93 | RUN pip install numpy 94 | 95 | # annotate_my_genomes 96 | RUN git clone https://github.com/cfarkas/annotate_my_genomes.git && cd annotate_my_genomes && chmod 755 ./makefile.sh && ./makefile.sh 97 | RUN ln -s /annotate_my_genomes/bash_scripts/ /usr/local/bin/add_ncbi_annotation.sh 98 | RUN ln -s /annotate_my_genomes/bash_scripts/ /usr/local/bin/annotate_my_genomes.sh 99 | RUN ln -s /annotate_my_genomes/bash_scripts/ /usr/local/bin/genome_download.sh 100 | RUN ln -s /annotate_my_genomes/bash_scripts/ /usr/local/bin/get_transcripts.sh 101 | RUN ln -s /annotate_my_genomes/bash_scripts/ /usr/bin/add_ncbi_annotation.sh 102 | RUN ln -s /annotate_my_genomes/bash_scripts/ /usr/bin/annotate_my_genomes.sh 103 | RUN ln -s /annotate_my_genomes/bash_scripts/ /usr/bin/genome_download.sh 104 | RUN ln -s 
/annotate_my_genomes/bash_scripts/ /usr/bin/get_transcripts.sh 105 | ENV PATH=/annotate_my_genomes/bin:${PATH} 106 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 cfarkas 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # annotate_my_genomes 2 | 3 | Transcriptome annotation pipeline using short and long sequencing reads from non-model (and model) animal organisms. 4 | 5 | ![image](https://user-images.githubusercontent.com/7016350/108611599-a6319f00-73a5-11eb-89b7-3cfd44b00cc5.png) 6 | 7 | #### See publication here: https://doi.org/10.1093/gigascience/giac099 8 | 9 | ## I) Pipeline Outline 10 | ```annotate_my_genomes``` is a pipeline that aims to annotate genome-guided transcriptome assemblies from StringTie, built from long-read RNA-Seq alignments to vertebrate genomes (e.g. PacBio technology). Transcripts are classified by their coding potential and probable gene function, and are identified as novel or reconciled with the current reference annotation from RefSeq/NCBI, without losing isoform and exon information. Also, known/novel coding sequences (nucleotides) and the corresponding proteins are resolved. 11 | 12 | This pipeline requires: 13 | 14 | 1) StringTie assembled transcripts (in GTF format). Check here: https://github.com/cfarkas/annotate_my_genomes/wiki#ii-obtaining-stringtie-gtf-file-for-annotation 15 | 16 | 2) At minimum, coding UCSC/NCBI reference genome annotations (in GTF format) and a genome assembly (non-masked fasta from UCSC). All these requirements can be downloaded at once by using the ```genome-download``` program provided in this repository and inputting a genome prefix as follows: 17 | ``` 18 | ./genome-download [genome] # mm10 for mouse, hg38 for human, galGal6 for chicken, etc. Use genome-download-macOSX instead on macOSX 19 | ``` 20 | - For example, ```./genome-download mm10``` will output: ```mm10.fa```, ```mm10.gtf``` and ```mm10_ncbiRefSeq.gtf``` files. 21 | - ```mm10.gtf``` contains coding genes and ```mm10_ncbiRefSeq.gtf``` contains all NCBI annotations.
22 | 23 | - For genomes, check UCSC genome prefixes here: http://hgdownload.soe.ucsc.edu/downloads.html 24 | 25 | 3) Finally, the basic pipeline can be run using a mouse transcriptome as an example (stringtie.gtf) and 20 threads, as follows: 26 | ``` 27 | mkdir output1 28 | ./annotate-my-genomes -a /path/to/stringtie.gtf -r /path/to/mm10.gtf -g /path/to/mm10.fa -c /path/to/annotate_my_genomes/gawn_config.sh -t 20 -o /path/to/output1 29 | ``` 30 | The latter will output, inside the output1 folder: 31 | ``` 32 | - final_annotated.gtf: a GTF file annotated in the "gene_id" field, containing novel genes and lncRNA classification (second field in the GTF file). 33 | - transcripts.fa: transcripts associated with final_annotated.gtf 34 | - cds.fa: coding sequences associated with final_annotated.gtf 35 | - prot.fa: protein sequences associated with final_annotated.gtf 36 | - coding_transcripts.gtf: GTF file containing cds sequences. 37 | - novel coding sequences (novel-cds.fa) and corresponding novel protein sequences (novel-prot.fa). 38 | ``` 39 | * Users can also employ ```mm10_ncbiRefSeq.gtf``` by using the ```add-ncbi-annotation``` binary instead of ```annotate-my-genomes```. See an example here: https://github.com/cfarkas/annotate_my_genomes/blob/master/README.md#v-adding-ncbi-annotations-to-increase-annotation-of-transcripts 40 | 41 | ## II) Installation: 42 | 43 | ### Option 1: Via Nextflow (recommended) 44 | 45 | - Nextflow (https://www.nextflow.io/) is a great workflow framework and a programming DSL that eases the writing of data-intensive computational pipelines. We encourage and support the usage of this framework across different platforms for reproducibility. 46 | 47 | ### Requirements: 48 | 49 | - Nextflow can be installed as depicted here (https://www.nextflow.io/) or via anaconda as follows: 50 | 51 | ``` 52 | conda install -c bioconda nextflow 53 | ``` 54 | Also install (not through conda): 55 | 56 | - ```wget```. Comes by default with Linux/Ubuntu distros 57 | - ```sed``` editor. Comes by default with Linux/Ubuntu distros 58 | - ```ncbi-blast+``` version equal to or higher than v2.7.1. To install it, see here: https://github.com/cfarkas/annotate_my_genomes/wiki#5-installing-up-to-date-ncbi-blast-version-v271 59 | - ```SAMtools```. To install it, see here: https://github.com/cfarkas/annotate_my_genomes/wiki#9-obtaining-and-installing-up-to-date-samtools-with-htslib-version--19 60 | 61 | 62 | ### Installation: 63 | 64 | In a given directory: 65 | ``` 66 | git clone https://github.com/cfarkas/annotate_my_genomes.git # clone repository 67 | cd annotate_my_genomes # enter repository 68 | current_dir=$(pwd) # set working directory 69 | echo $current_dir # check working directory 70 | nextflow run makefile.nf --workdir $current_dir --conda ./22.04_environment.yml # make & install; use environment.yml for Ubuntu < 22.04 71 | ``` 72 | 73 | ### Option 2: Installing dependencies via anaconda (tested in Ubuntu 16.04, 18.04, 20.04 and 22.04 LTS) 74 | 75 | ### Requirements: 76 | - Requires miniconda, python2.7 and/or python>=3. To install miniconda, see: https://docs.conda.io/en/latest/miniconda.html 77 | 78 | Also install (not through conda): 79 | 80 | - ```wget```. Comes by default with Linux/Ubuntu distros 81 | - ```sed``` editor. Comes by default with Linux/Ubuntu distros 82 | - ```ncbi-blast+``` version equal to or higher than v2.7.1. To install it, see here: https://github.com/cfarkas/annotate_my_genomes/wiki#5-installing-up-to-date-ncbi-blast-version-v271 83 | - ```SAMtools```. 
To install it, see here: https://github.com/cfarkas/annotate_my_genomes/wiki#9-obtaining-and-installing-up-to-date-samtools-with-htslib-version--19 84 | 85 | ### Installation: 86 | 87 | In a given directory: 88 | ``` 89 | git clone https://github.com/cfarkas/annotate_my_genomes.git # clone repository 90 | cd annotate_my_genomes # enter repository 91 | conda config --add channels bioconda # add bioconda channel (if you haven't already done so) 92 | conda config --add channels conda-forge # add conda-forge channel (if you haven't already done so) 93 | conda env create -f 22.04_environment.yml # create and install environment; use environment.yml for Ubuntu < 22.04 94 | conda activate annotate_my_genomes # activate environment 95 | bash makefile.sh # make & install 96 | ``` 97 | - Copy binaries to ```/usr/local/bin``` 98 | ``` 99 | sudo cp ./bin/* /usr/local/bin/ 100 | ``` 101 | 102 | After these steps, a conda environment called ```annotate_my_genomes``` can be managed as follows: 103 | ``` 104 | # To activate this environment, use 105 | # 106 | # $ conda activate annotate_my_genomes 107 | # 108 | # To deactivate an active environment, use 109 | # 110 | # $ conda deactivate 111 | ``` 112 | 113 | #### Notes: 114 | 115 | - By activating the annotate_my_genomes environment, all binaries in the annotate_my_genomes repository can be executed. 116 | - To install optional programs for downstream analysis, please see here: https://github.com/cfarkas/annotate_my_genomes/wiki#optional-dependences-to-run-all-the-downstream-analysis 117 | 118 | - Uninstall the environment as follows: 119 | ``` 120 | conda remove --name annotate_my_genomes --all 121 | ``` 122 | 123 | - Inside the repository, there is a file called ```gawn_config.sh```. Optionally, edit it to increase/decrease the number of CPUs for blast processing: 124 | ``` 125 | NCPUS=10 126 | ``` 127 | Set this to a value according to the computational capacity of your machine. 128 | 129 | ### Option 3: Run through docker: 130 | - See installation and pipeline run here: https://hub.docker.com/r/carlosfarkas/annotate_my_genomes 131 | ``` 132 | # Run docker without sudo privileges as follows: 133 | sudo chmod 666 /var/run/docker.sock 134 | 135 | # Downloading the docker image 136 | docker pull carlosfarkas/annotate_my_genomes:latest 137 | 138 | # Downloading repository 139 | git clone https://github.com/cfarkas/annotate_my_genomes.git && cd annotate_my_genomes 140 | 141 | # make & install using workdir 142 | chmod 755 makefile.sh 143 | docker run --volume $HOME:$HOME --workdir $(pwd) carlosfarkas/annotate_my_genomes ./makefile.sh # make & install 144 | 145 | OR 146 | 147 | # make & install using -it (interactively) 148 | docker run -v $(pwd):/annotate_my_genomes -it carlosfarkas/annotate_my_genomes:latest 149 | cd annotate_my_genomes/ 150 | bash makefile.sh 151 | ``` 152 | ### Option 4: Without using conda, program by program: 153 | 154 | - See detailed installation steps in our wiki here: https://github.com/cfarkas/annotate_my_genomes/wiki 155 | 156 | ## III) Running the whole pipeline via nextflow (recommended) 157 | 158 | - Inside the ```annotate_my_genomes``` folder, enter the ```nextflow_scripts``` subdirectory and run the full pipeline using ```--flags``` parameters. 159 | - NOTE 1: Users **must provide full paths to inputs in the command line**. 
We recommend splitting the flags with backslashes and running the pipeline exactly as follows: 160 | - NOTE 2: Use environment.yml for Ubuntu < 22.04 161 | ``` 162 | cd nextflow_scripts/ 163 | ``` 164 | 2.1) Run ```genome-download.nf``` (e.g. output the galGal6 genome) 165 | ``` 166 | nextflow run genome-download.nf \ 167 | --genome galGal6 \ 168 | --conda /path/to/22.04_environment.yml --outdir /path/to/output_folder/ 169 | ``` 170 | 2.2) Run ```annotate-my-genomes.nf```. Details here: https://github.com/cfarkas/annotate_my_genomes/blob/master/README.md#b-simplest-usage 171 | ``` 172 | nextflow run annotate-my-genomes.nf \ 173 | --stringtie /path/to/stringtie.gtf \ 174 | --ref_annotation /path/to/galGal6.gtf \ 175 | --genome /path/to/galGal6.fa \ 176 | --config /path/to/annotate_my_genomes/gawn_config.sh \ 177 | --threads 20 \ 178 | --conda /path/to/22.04_environment.yml --outdir /path/to/output_folder/ 179 | ``` 180 | 2.3) Run ```add-ncbi-annotation.nf```. Details here: https://github.com/cfarkas/annotate_my_genomes/blob/master/README.md#c-adding-ncbi-annotations-to-increase-annotation-of-transcripts 181 | ``` 182 | nextflow run add-ncbi-annotation.nf \ 183 | --stringtie /path/to/stringtie.gtf \ 184 | --NCBI_annotation /path/to/galGal6_ncbiRefSeq.gtf \ 185 | --ref_annotation /path/to/galGal6.gtf \ 186 | --genome /path/to/galGal6.fa \ 187 | --config /path/to/annotate_my_genomes/gawn_config.sh \ 188 | --threads 20 \ 189 | --conda /path/to/22.04_environment.yml --outdir /path/to/output_folder/ 190 | ``` 191 | 2.4) Run ```isoform-identification.nf```. Details here: https://github.com/cfarkas/annotate_my_genomes/blob/master/README.md#d-post-processing-add-ncbi-annotation-outputs 192 | ``` 193 | nextflow run isoform-identification.nf \ 194 | --NCBI_tmap /path/to/gffcompare.tmap \ 195 | --NCBI_transcripts /path/to/NCBI_transcripts.fa \ 196 | --genome_name galGal6 \ 197 | --conda /path/to/22.04_environment.yml --outdir /path/to/output_folder/ 198 | ``` 199 | 200 | #### Notes: 201 | 202 | - Users must provide full paths to files when running nextflow scripts. 203 | 204 | - Inside the repository, there is a file called gawn_config.sh. Optionally, edit it to increase/decrease the number of CPUs for blast processing: 205 | ``` 206 | NCPUS=10 207 | ``` 208 | Set this to a value according to the computational capacity of your machine. 209 | 210 | 211 | ## IV) Running the whole pipeline via anaconda + binaries: 212 | 213 | ### A) Quickstart (Running the test) 214 | 215 | - Inside the ```test``` folder, run the pipeline with a provided set of transcripts from chromosome 33, Gallus gallus genome version "6", in GTF format. 216 | - Users need to specify the stringtie output (GTF format), the UCSC reference genome (GTF annotation and fasta file), the gawn_config.sh file (check NCPUS for blast, default = 10), the number of threads for text processing (20 for this example) and the output folder. 
217 | 218 | Go to the ```annotate_my_genomes/test``` directory and execute the following: 219 | 220 | ``` 221 | # Download Gallus gallus v6 fasta assembly (non masked) with matched GTF files (UCSC/Ensembl) 222 | ./genome-download galGal6 223 | 224 | # Execute pipeline on stringtie_chr33.gtf (provided file) with 20 threads: 225 | mkdir output1 226 | ./annotate-my-genomes -a stringtie_chr33.gtf -r galGal6.gtf -g galGal6.fa -c gawn_config.sh -t 20 -o output1 227 | 228 | # Include NCBI annotations on stringtie_chr33.gtf (provided file) with 20 threads: 229 | mkdir output2 230 | ./add-ncbi-annotation -a stringtie_chr33.gtf -n galGal6_ncbiRefSeq.gtf -r galGal6.gtf -g galGal6.fa -c gawn_config.sh -t 20 -o output2 231 | ``` 232 | 233 | ### B) Simplest usage 234 | (Optional) Edit the NCPUS value in the ```gawn_config.sh``` file inside the repository. Default is 10. 235 | - For example, to annotate a chicken GTF file (e.g. "target.gtf") using 20 threads for CPU processing: 236 | 237 | ``` 238 | mkdir output1 239 | ./genome-download galGal6 240 | ./annotate-my-genomes -a /path/to/target.gtf -r /path/to/galGal6.gtf -g /path/to/galGal6.fa -c /path/to/gawn_config.sh -t 20 -o /path/to/output1 241 | ``` 242 | - ```final_annotated.gtf``` (located in output1/) will contain the merged NCBI-updated annotation (in UCSC coordinates) 243 | - To produce the ```target.gtf``` assembly, check the stringtie parameters here: https://github.com/cfarkas/annotate_my_genomes/wiki#ii-obtaining-stringtie-gtf-file-for-annotation 244 | 245 | ### C) Adding NCBI annotations to increase annotation of transcripts 246 | Users can add annotations from NCBI by using the three outputs from the ./genome-download program as inputs to ./add-ncbi-annotation. 247 | - Resuming the previous example, using add-ncbi-annotation instead of annotate-my-genomes: 248 | ``` 249 | mkdir output2 250 | ./genome-download galGal6 251 | ./add-ncbi-annotation -a /path/to/target.gtf -n /path/to/galGal6_ncbiRefSeq.gtf -r /path/to/galGal6.gtf -g /path/to/galGal6.fa -c /path/to/gawn_config.sh -t 20 -o /path/to/output2 252 | ``` 253 | - ```final_annotated.gtf``` (located in output2/) will contain the merged NCBI-updated annotation (in UCSC coordinates). 254 | 255 | As an example for the mouse genome, change the galGal6 prefix to mm10. Using 30 threads to process the "mouse.gtf" assembly: 256 | ``` 257 | mkdir output3 258 | ./genome-download mm10 259 | ./add-ncbi-annotation -a /path/to/mouse.gtf -n /path/to/mm10_ncbiRefSeq.gtf -r /path/to/mm10.gtf -g /path/to/mm10.fa -c /path/to/gawn_config.sh -t 30 -o /path/to/output3 260 | ``` 261 | ### D) Post-processing add-ncbi-annotation outputs 262 | 263 | If ```stringtie.gtf``` (as an example of input GTF) was annotated with ```add-ncbi-annotation```, users can produce transcript annotation tables (csv format) using two outputs from the add-ncbi-annotation pipeline: 264 | 265 | - gffcompare.tmap (inside the ```output_files``` subdirectory) 266 | - NCBI_transcripts.fa (inside the ```gffcompare_outputs_NCBI``` subdirectory) 267 | 268 | by using the isoform-identification pipeline, as follows: 269 | 270 | ``` 271 | isoform-identification -m /path/to/gffcompare.tmap -t /path/to/NCBI_transcripts.fa -g galGal6 272 | ``` 273 | In this example: 274 | - ```gffcompare.tmap``` corresponds to the transcript map output from gffcompare 275 | - ```NCBI_transcripts.fa``` corresponds to the transcript sequences from ```stringtie.gtf```, in fasta format 276 | - ```galGal6``` corresponds to the NCBI genome name (in this example, Gallus gallus 6 genome, galGal6). 
277 | 278 | The outputs ```Ref_Transcript_Annotation.csv``` and ```Novel_Transcript_Annotation.csv``` files will contain detailed annotation of transcripts. Ref_Transcript_Annotation.csv should look like this: 279 | 280 | ``` 281 | ref_gene_id ref_id class_code qry_gene_id qry_id num_exons FPKM TPM Annotation Status NCBI RefSeq Gene ID Transcript Description NCBI RefSeq Protein ID Alternative Gene Name RefSeq Transcript Info cds_seq 282 | OR14J1L40 XM_025145345.1 x STRG.16902 STRG.16902.1 3 0.089321 0.347251 Model OR14J1L40 olfactory receptor 14J1-like 40 XP_025001113.1 AATTTCATTGGAATTAAATTTATTATACGTATGACAAACTGatatgaagaagaaacagaaacaccacATAAAATCTATCAGGCTTTTCCTAAATTTTCTGTAGTCTTGAGAGCATGATGAACATCTTTCTGATAGTGAAACCGGGTATGTTGGAGTATCTTCCTGAGGGAacccttgagctcctggttcctcatgctgtagatgagggggttcaaAGCTGGAGGCACCACTGTGTATAGAAATGACACCACCAGGTCCagagatggggaggagatggagggaggcttcaggtaggcaaacatggcagtgctgacaaacagggagagcacagccaggtgagggaggcacgtggagaaggttttgtgctgtccctgctcagagggcatcctcagcacggccctgaagatctgcacataggagaagagaatgaaagcaaagcaccCAGATGCTAAAGAGGCACTGACAATAAGAAGCCAAATGTCTTTGAGATAGGAGTGTGAGCaagagagcttgaggatctgggggatttcacagaagaactgatccacagcattgccttggcacagaggcagggaaaatgtattggcagtgtgcagcagggaattaaggacccccgtgccccaggcagctgctgccatggtggcacacgctctgctgcccagcagggtccggtagtgcaggggcttgcagatggcaac 283 | LOC100857209 XM_015272533.2 x STRG.16904 STRG.16904.1 3 0.099526 0.386921 Model LOC100857209 olfactory receptor 14A16-like XP_015128019.2 catctgcagttcctgggcatggagtcctgttcagacTGCAGGAGATAATGATGAGTCGATACCATTCTCAGAGACACTCCTCCTGCAcactttgaaaatgcatttaactCCATAGCAtgagtttattttcatgagcttcAGAATCATGTAAGAAGTAGAAACTTAAGGAGCATTTAGTTTCCTATCATTTCCTAATCATATCCCAGGCTCCTGGattttttcctcataggagCTGTTTCCACATCTCTTTTCTttacccctaaccctaacttcTATGTTCTTCAACTTCTGTTAGAGAAATCTGTTTGATTGGAGGCTAAGTACATTATTCATGACTGCAGAGAATGACAATAAtttcagctggtgctgtcctttgggggaggagaggctgaaagcacatgAGGAGATTGTTCATATAACAGCAGACTGAGAAAGGTACAATTCAGGGTACTCAGAGATGTGTTCATATTTTCTGGCTCCcttcagatttctgcctccaatccttttcccttctcttagggtataaaagaaaaatccctgccctgtctctcctcttgcaaagAGGAGCAAACACCTTTGGAAACACCCTATGGTGCAGCtgtagctgtgatACCCCTGGCTCAGGCAgaagctgtggcagcagaaggccccttCCCTGCCGGGGGGCttcttccccccacacgtctccctgcagcgccctgggcagctccccgggcaggctgagtgctgagcctggcaggcggcagagtccctgccccggcacacagcccctggggcacagcagggaccctgctctgcactacagccctgggcacccggctgcacccaaacagcacagcctgcagccgtcctgggacacgcagccctcagggctgtgctctgatgctgcagcacagaagcccTCATCTGGAACAGTAGTCTTTTTCCATAGCAAGGAAACATGAAGTACTTTCAGCCAGATCTGCTATGGGATATCCCTGATTCAGTGATCCCTCCTGGAAAAACAGCTTCATTGCCTACTGCAAGAGACTTACCCTGTCAAGCGCTGTGAGCAAtgctcctccagtgagctcacatCCTACTCACACTGTACACATCCTGtaatctctttctcttttctcttctatcTTCATGTCACCTGCAGATCATGTCTatagccctgctgtgctgtacagaagagctgctcctgtgcaCAGCTGTCTCTCCGCAGCGCTGCCTGCTTTTatgagctccctgtgtcccaggagcctggcccagctcagcagc 284 | LOC112530844 XM_025145380.1 p STRG.16906 STRG.16906.1 1 0.192245 0.747381 Model LOC112530844 olfactory receptor 14A16-like XP_025001148.1 
aaatcagcgggagacaagtctcatgctttcatgatcaacaagtctcagctttattgAAGCACACGCAGGCATTTATACGATAGTTAATGAGCTACTACATATGCCAAATTGGGTTCTCTTATTGGTTAGTTCTTTACGTGAGAAAGTAACCTTCAACGCTAGATACCGTGACAGTCCCGTGATGAATGCCCGATTGTTTACCGCATACCACTCAATTTTCTTAACTGCAGCATGTTcttatcacttccttgctcctgagtGAGGGCAGCACGACCTTGCCTGGTTTAATGAGCAGGGCCCTATctccttaccagctgcatcccatCATGGCCCCTCTCCCGGAGCCAGTGCTCCGGGTCCCAAAAGCTCTCCACACTTCCCCCGTTTTCTTTTGGTACGAGCCAGGTTGTATGAATCGCATCTTGAACCACCTTTTGCTAGCATTACAGTAAACAAAGCATGATTATCAGCATACCAATCACTATCTATAAGAATACACTAGATTTATgttacacacttctacaaagcattccttgtcagtaaactaacagtaaagactacacagcacaccagtattaactacagtttcaatatcccgatgaataaaataccacagtccCCACTCTGGATCAACCACTGTACCTGACCCCCACAATTAGTGCGCTTCTGAGTCTCATAACCGccaattgctcctggcagttcccagtgtCCAAGAGACCTTtctgatgagatgttttctgcaatCTGCTAAGGGAATACCAGTCGCAGCTCAGGAGTCACGGCACTGTATATGATGTCTTGCACACCATGCGGCTATCGCTCGCCGGAGTCGCCGTTGTTGTCATCGGGTTGAGATGGGTTGTTGATGTTCGGGGCTGGCTTAgtccatttactgggaacccataatgggccagatcctgtggAAACACAGCTCTCTCCTGGaagcctcccatgatgtttacaaaattccTATTGATTCCTAATTCactcaaagtttccacaaacccTTAACACCGTACagtgatattgttcagttataaacacttgggaacagatctcacagaagcttgTCCATGTTCCCTTACACGCTTCCATgcaatcagaacacagtactagATAAACAGGTtgacactcattccctgaaaggaacacatctcactcacaccacactcactctgacatttagaacaaaaaacatAGTTTATACATAACccacaatgctgacgacgtcttttAGCTTGTATCTTAATAACACTAGTGCATTAGTCAATTAGTTGCAATtcctaccccagccggcaatctaacctgtgagctcacgtatctcggggggggggggggaagcaggcacgctccttcataccctgcgtaggacgtctcctcacgccttacgggcacccccttttctatacacatacctgaTACACcaatggatggtccttgtctgtccctgcagtgatcgggtgaggaagggagaccttccaagaaatcttggggcgcgccaaaggtgtcccctctctcaatCGATCCCGCAGCCGAACAGAGCGGATCTATTCTCGTTGCAAAATTGAGTTGtagaaatcagaccctatatccggtaaggatatagagcaggcatgcGTCTATTGATGTCTATTGAtagtgcaagggggatcactccacctaacttgcacaccgtcaggagaaattgtactatagatataggtcaaactaatacataaccaatagttgacaggaattcagatacattttcattacgtccctgaaagacacattttcatgcagtataatgagacagaagaacagagggtAGTGCTGGCGCAGTTCTCATaatttgcagttgcttgcagcttgactcacagcacctggcacagcggtctctatcacagctctgcattcctttcgcctactcccatcattgttctgtgtgagacagtgatccatagcagctgttttacttgcactgacccagggggagaaaaacatgacctcgCTGGGTCAGCCGTCCATCCACAATTTCCCTGTTCTACTATTGCCTGGCCTGTGGGTGAGTTTGGGATACCCGTACTGTGTTTTACTCCCCATGTTTGCAGAAACTCCCCAAGCCTACGACTAGTGTAGGCTGGGccattgtctgtttttattcGTAGTGATATACCCATAACTGCAAAGCAACAACTGAGATGCTTTTCTACATACAtagccttttctccaggttgagcGGTGGCCCACATAAGATGACTATATGTATCTATAGACACGTGTACATATTTCAGCTGCCCGAACTCACCCACATGCATCACATCCATCTGCCTATTTTCGTTAGCTCTAAGTCCCCTGGGGTTAACTCCTAGCCCGAGACCCATACTGCCATTATGGTGGCTGCACACTGGGCACGATCTAACAATTACCTTAGCATCCTCATATGTTATCTGATATTCCCTTCTTAGCCCCTTGGCATTCTGGTGAAACATAGAGTACGCCTCTCGGGCCAGGACATGCCGGGAGACTAAAGGTCTCTGCGCCAGTGACACCAAGCGATCAGCTCTCGCATTTCCCTCTCCCAAGTCTATCTCCCATTTATGACCTCGAACATGTATTACTGCATATGAGTGCTCCCTAATtctgattgctctctgcaactgcacgAACAGCTTGTACAGCCGCcgattctgcacttcctttatgTAGGCTTCCTCTATTTGGTGGCATACTCCAGCTACATAAAGGGAGTCGGTGACCACATTAAGGGGGCCGATTAAGTTCATCATGGCCCATACAACGGCCACCAGCTCCAATGTTTGCAATAAGTCCTTATCATCGTCTGCAATGAGGTGATGTCTCCAGGAGCCgccctgctgccaggtcactgctgctgttctagacTTCTGTCCCGCATCCGTGTAAGCCGTGATTGTGTTCTGCAAGGGCGTCTCATGCTGCTTTGGTATCCGGAGCCAACTCCATTGACCAATCCAATGTAGCGGCACGTTCGGAATCTTTTCCACTGAAACCGTACTTCCAGCTCCTAAGAGAGCATCCTGTAACTCTGGACTATGCTGCACATACCATGTCAGAGTGTCCTTCTGCATTGGCAGCTGTACACACACAGGCTCCATACCTATGATCTGCAGGGTACGTTCTCGCCCTTTcttaatcacttctgccaggagttcagttttttgaagaagtgtttttgattgctgcagtgagggacagATCCACTCTAGTACCCATACctcccccgttttctttttagattgtgCCAACGCTCCTAAAAGGTACTTTGGTCCATACCATACCATAACCTGTATGGGGAGGTCAGGGTCACGTCTCCGAACACTGCCGTGTATAATGCAGTCCATAATCTGTTGTAGTAGACGTTTGTGCTGCGTTGTCACCGTTACAGGCTGGGCCGGGTCAGTGCCCTGTAACAAAGGTCGCAACGACTCTAAGAGTTCGTTTGGGATGCCCACCACAGGGCTCAACCACTTTAAGTCCCCCAGTAACCTTTGGGCATCATGTAGAGTCTCTAGTTTAGTATCcagttgcagtttc
tgtggggTTACTATCGTGTTAGTCAGTGTCCATCCTAAGTACTTCCGGGGCGCGGAGAGTTGTACCTTTTCAGGGGCAAACATAAGTTCTTCCCTATTTAGGGTCTTTTCTATTTGCCaaatttgttcctgtgtgaaggcCTCTGGCTGGGCAAAAAGGATGTCCTCCATGTAATGATAAAtgaccatttgtttccattctcgCCGGAGTGGTTGTAGAGCATGATCGACATATAGTTGACATCGCGTGGGGCTATTTTTCATCCTTTGAGGTAATACTGTCCATTCAAAACGTTGATCAGGGTGTTCTCGATTCAATGCAGGCAATGTGAAGGCAAATCGTTTAGTGTCCTGAGGGTGCAGGGTAATAGTaaagaaacagtcctttaaGTCACTAATTAGTAATGGCCAATTGTAAGGTAGCATGGCAGGATTAGGCAGGGCGGGTTGAAGTGCCCCCAGTTGAGAGAGCACATTGTGGCCAATTAAGCATTGAACAGTGGGGGGTAGAGGTGCCACCGAGACAGAGGTATGGACTACTTGTTCATCAAGGTGGATTTGCAGGGGAGGTGACTTTTTCGCTAAGGATAGTCCACCTGTACCCGTCACTGTGGCTATGGCCGCTTGCAGTGGCCATTGAGGCGGCCAAATTTCTGGGCTCAATATGCTGTTGTCGGCCCCTGTATCTAATAGACCttgaagtttgatttcttcctctctgtgtttAAGTGTCACTGGTTTTTTAGGTCGATCATGCAAATTTAGTGATAGCAATGCTAAGTCCCCTGAGGAGCCAAACCCTTGCTCCCCTCGGGGAGACGATTGACACGGTGTTAAGGCTTTGGTCAATTGCTCTAGGGGTACTAACTGCGCTATCCGTTGccctttctcaatttttattggAGGAAACGGGGTGTATACCATAATCTGGATCTCACCCTGAAAGTCCGCATCTATTACCCCAGGGAGGACAAAAAGTCCGAGCATCGATGCTGAAGAACGCCCCAATAAAAGGGCCCCAACAGCGGTTCCATTTATCATTACTGGTCCCCTGATCCCTGTAGACACCCGCTCAGGTTTTGTGGTCATTAAGGTCGTGGTCACTGCGGCTGCCAAGTCCAAGCCGAGGCTTCCTGGTGTGGCTgattgcagggctgctgctggctggaaacGGCTACTTGTGTCTGTGCGTGGCCGTCGTTTCTTTCTCGCGCTGGGCTGGGGGTTTCCTGACCGGCGTCGACAGGCATTGGTATTGTGGTTGTCCATACGACATGTGTGACACCATGAACCGGTGGTTTGACACTGACGACGCATATGTCCCATGCCGCCACAGCGATAGCATTTGATGCGACCAGCAACAGGCGATCTCGGGCCTAAATTTGTTATCGCAGACGCTTGTAAGGATGCAAGAGCTGCTAGCACTTGATTGTGAGAGGCCTCAGCTTGCGCCTTTAAACTTGCCCCTAACTCCTTAATAGCCTCAATCAGAAATGCTTGGGGCCCGACTGGCACGCTTGATagcttttccagtgcctcttcAATAGTCCAATTACTCCTCAAAGTACTCAGAGTACTACGTGCTGTTGAATTACAATTTTGGAGCGCGCATTGTTTTAACATTACTCCTCTCATATACTCTGGCACCCCTGCTTTTTCAATAGCCCCGGCTACCTTATCTATGAATGCCCCAAAGTCCTCATCTCTACCTTGTCGGATCCCCATATAAAATGGCAATCCATCAGGCACCTTAATCTTGTCCATGGCCTGTCTAGCTAAATACATCGTTTCTCGACATTTATCTGGCCCTAATAATGCTTGGGCTTGTGTTCTGAAAAAAGGCCCTAGCCCTAAGAGTTCTTCGATAGTTACACCATGTAGTGGGTCTCCCGGCTGCCTAGCCTTTGAGACACTCTGATGGCACAGTTCTTGCCAATATGCattaaacaacagctgttgATGTTGTGAAGAGATCAATTTTGCTATTGCCCGACAATCGGATGGCAGCAATATCTGCGTACTCCAAATATAATCCAATATCTGCTTAGCTGGCTCGCTTTTTACCCCAAACTGACTAACTGTAGATCGTAGCTGCGATAATAATTTCCAATCTAAAGCTGTGATGGTGGCCTGCATCCCTCCCGCAGGATTAGAGGCATATATCACTGGAAACGCCATGTGCCGCACGGCCTCC 285 | ``` 286 | 287 | ## V) Annotate and identify homologs in novel proteins from transcriptome 288 | 289 | - See this example: https://github.com/cfarkas/annotate_my_genomes/wiki#5-annotate-and-identify-homologs-in-novel-proteins-from-transcriptome 290 | 291 | ## VI Annotation of BRAKER2 / TSEBRA gtf output 292 | 293 | - The output ```braker.gtf``` from BRAKER2 pipeline (https://github.com/Gaius-Augustus/BRAKER) or ```tsebra.gtf``` from TSEBRA pipeline (https://github.com/Gaius-Augustus/TSEBRA) can be annotated using a few tools before running the pipeline. 294 | 295 | As a requirement, the AGAT toolkit (https://github.com/NBISweden/AGAT) must be installed: 296 | ``` 297 | conda activate annotate_my_genomes 298 | conda install -c bioconda agat 299 | ``` 300 | - Suppose you recently annotated the Gallus gallus genome (galGal6) using BRAKER2 or TSEBRA. 
The ```braker.gtf / tsebra.gtf``` output can be pre-processed as follows: 301 | 302 | #### BRAKER2 run 303 | ``` 304 | agat_convert_sp_gff2gtf.pl --gff braker.gtf -o braker_fixed.gtf # clean and fix braker.gtf with AGAT 305 | stringtie --merge -G galGal6_ncbiRefSeq.gtf braker_fixed.gtf -o braker_merged.gtf # merge braker.gtf with reference genome GTF (i.e.: galGal6_ncbiRefSeq.gtf) 306 | sed 's/ gene_name.*//'g braker_merged.gtf > braker_fixed.gtf # fix additional entries 307 | grep "StringTie" braker_fixed.gtf > braker_stringtie.gtf # Exclude reference transcripts not found in the braker annotation 308 | ``` 309 | - Now, ```braker_stringtie.gtf``` can be annotated as follows (e.g. using 30 threads for processing): 310 | ``` 311 | mkdir braker_annotated 312 | add-ncbi-annotation -a braker_stringtie.gtf -n galGal6_ncbiRefSeq.gtf -r galGal6.gtf -g galGal6.fa -c gawn_config.sh -t 30 -o braker_annotated/ 313 | ``` 314 | 315 | #### TSEBRA run 316 | ``` 317 | agat_convert_sp_gff2gtf.pl --gff tsebra.gtf -o tsebra_fixed.gtf # clean and fix tsebra.gtf with AGAT 318 | stringtie --merge -G galGal6_ncbiRefSeq.gtf tsebra_fixed.gtf -o tsebra_merged.gtf # merge tsebra.gtf with reference genome GTF (i.e.: galGal6_ncbiRefSeq.gtf) 319 | sed 's/ gene_name.*//'g tsebra_merged.gtf > tsebra_fixed.gtf # fix additional entries 320 | grep "StringTie" tsebra_fixed.gtf > tsebra_stringtie.gtf # Exclude reference transcripts not found in the tsebra annotation 321 | ``` 322 | - Now, ```tsebra_stringtie.gtf``` can be annotated as follows (e.g. using 30 threads for processing): 323 | ``` 324 | mkdir tsebra_annotated 325 | add-ncbi-annotation -a tsebra_stringtie.gtf -n galGal6_ncbiRefSeq.gtf -r galGal6.gtf -g galGal6.fa -c gawn_config.sh -t 30 -o tsebra_annotated/ 326 | ``` 327 | ### More Scenarios? 328 | 329 | - For downstream analysis and examples, please visit our wiki page: https://github.com/cfarkas/annotate_my_genomes/wiki 330 | 331 | ### Notes 332 | Compiling automatically uses the shell script compiler shc to make binaries; please check: https://github.com/neurobin/shc. 
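- For reference, a minimal sketch of what that compilation step looks like (illustrative paths only; ```makefile.sh``` performs the actual build and may use different options):
```
# sketch: shc compiles a bash script into a binary, producing <script>.x and <script>.x.c
shc -f bash_scripts/annotate_my_genomes.sh                          # illustrative input script
mv bash_scripts/annotate_my_genomes.sh.x bin/annotate-my-genomes    # rename the compiled binary
```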
333 | -------------------------------------------------------------------------------- /additional_scripts/annotate_gtf.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from tqdm import tqdm 3 | 4 | # Define file names from command line arguments 5 | gtf_file = sys.argv[1] # GTF file 6 | hits_file = sys.argv[2] # Hits file 7 | annotation_table_file = sys.argv[3] # Transcriptome annotation table file 8 | output_file = sys.argv[4] # Output GTF file with annotations 9 | 10 | # Function to append blastx information and gawn_name to GTF entries 11 | def append_annotations_to_gtf(gtf_line, blastx_info, gawn_names): 12 | try: 13 | if 'transcript' in gtf_line: 14 | # Ensure there's a semicolon at the end of the original GTF line 15 | if not gtf_line.endswith(';'): 16 | gtf_line += ';' 17 | 18 | transcript_id = gtf_line.split('transcript_id "')[1].split('"')[0] 19 | annotations = [] 20 | if transcript_id in blastx_info: 21 | annotations.append(f'blastx "{blastx_info[transcript_id]}";') 22 | if transcript_id in gawn_names: 23 | annotations.append(f'gawn_name "{gawn_names[transcript_id]}";') 24 | if annotations: 25 | gtf_line += ' ' + ' '.join(annotations) 26 | return gtf_line 27 | except IndexError as e: 28 | print(f"Warning: Malformed line skipped: {gtf_line}") 29 | return None 30 | 31 | # Read the hits file and store the blastx info in a dictionary 32 | blastx_info = {} 33 | with open(hits_file, 'r') as hits: 34 | for line in hits: 35 | transcript_id, blastx_id = line.strip().split(' ') 36 | blastx_info[transcript_id] = blastx_id 37 | 38 | # Read the annotation table and store the gawn_names in a dictionary 39 | gawn_names = {} 40 | with open(annotation_table_file, 'r') as table: 41 | for line in table: 42 | parts = line.strip().split('\t') 43 | if len(parts) > 2: 44 | transcript_id = parts[0] 45 | gawn_name = parts[2] 46 | gawn_names[transcript_id] = gawn_name 47 | 48 | # Read the GTF file, modify entries with annotations, and write to the output file 49 | with open(gtf_file, 'r') as gtf, open(output_file, 'w') as out_gtf: 50 | for line in tqdm(gtf, desc="Annotating GTF"): 51 | modified_line = append_annotations_to_gtf(line.strip(), blastx_info, gawn_names) 52 | if modified_line: 53 | out_gtf.write(modified_line + '\n') 54 | 55 | print("Annotation completed. Output is in", output_file) 56 | -------------------------------------------------------------------------------- /additional_scripts/blast_parser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Takes a .xml formatted blast results file as input and prints the query and hit ids 4 | # for sequences passing the thresholds passed via the command line arguments. For sequences 5 | # with no hits below the thresholds, the program returns "no hits below threshold" rather 6 | # than the hit id. 7 | 8 | import getopt, sys 9 | from Bio import SeqIO 10 | from Bio.Blast import NCBIXML 11 | 12 | ## Function to parse an XML format BLAST results file. 13 | 14 | def parse_results(result_file, e_val_thresh, ident_thresh, align_thresh): 15 | result_handle = open(result_file, 'r') ## The XML file to parse. 16 | blast_records = NCBIXML.parse(result_handle) 17 | print('query_id\thit_id\tpercentage_identity\tquery_length\talignment_length\te_value') 18 | 19 | for record in blast_records: ## Loop through each query. 20 | query_id = record.query 21 | if len(record.alignments) > 0: ## Check whether there are hits. 
22 | e_val = record.alignments[0].hsps[0].expect 23 | if e_val < e_val_thresh: ## Is hit below E-value? 24 | tot_ident = sum([hsp.identities for hsp in record.alignments[0].hsps]) ## Sum of all identities for all hsps. 25 | query_len = record.query_length ## Length of query 26 | align_len = sum([hsp.align_length for hsp in record.alignments[0].hsps]) ## Length of query alignment to hit. 27 | pct_ident = tot_ident/float(align_len)*100 ## Calculates percentage identity. 28 | top_hit = record.alignments[0].hit_id + record.alignments[0].hit_def 29 | if pct_ident > ident_thresh: ## Checks whether above percentage identity cutoff. 30 | if align_len > align_thresh: 31 | print('%s\t%s\t%f\t%i\t%i\t%s' % (query_id, top_hit, pct_ident, query_len, align_len, str(e_val))) 32 | else: 33 | print('%s\t%s\t%s\t%s\t%s\t%s' % (query_id, '', '', '', '', '')) 34 | else: 35 | print('%s\t%s\t%s\t%s\t%s\t%s' % (query_id, '', '', '', '', '')) 36 | else: 37 | print('%s\t%s\t%s\t%s\t%s\t%s' % (query_id, '', '', '', '', '')) 38 | else: 39 | print('%s\t%s\t%s\t%s\t%s\t%s' % (query_id, '', '', '', '', '')) 40 | 41 | result_handle.close() 42 | 43 | ## How to use this. 44 | 45 | def usage(): 46 | print(""" 47 | \nblast_parser.py.\n 48 | Takes a .xml formatted blast results file as input and prints the query and hit ids 49 | for sequences passing the thresholds passed via the command line arguments. For sequences 50 | with no hits below the thresholds, the program returns "no hits below threshold" rather 51 | than the hit id.\n 52 | Basic usage: 53 | \tpython blast_parser.py -i <blast_results.xml> -e 1e-20 -p 97 -a 100 > parsed_results.txt\n 54 | Arguments: 55 | \t-h, --help\t\t\tPrint this information. 56 | \t-i, --in \t\tXML format BLAST results file. 57 | \t-e, --evalue \t\tExpect value. 58 | \t-p, --pct_ident \t\tPercentage identity cutoff. 59 | \t-a, --align_len \t\t Minimum alignment length. 60 | """) 61 | 62 | ## The main program. 63 | 64 | def main(): 65 | try: ## Parses the command line arguments. 66 | opts, args = getopt.getopt(sys.argv[1:], 'e:i:p:a:h', ['evalue=', 'in=', 'pct_ident=', 'align_len=', 'help']) 67 | except getopt.GetoptError: 68 | usage() 69 | sys.exit(2) 70 | 71 | ## Creates variables from the arguments. 72 | 73 | for opt, arg in opts: 74 | if opt in ('-e', '--evalue'): 75 | e_val_thresh = float(arg) 76 | elif opt in ('-p', '--pct_ident'): 77 | ident_thresh = float(arg) 78 | elif opt in ('-a', '--align_len'): 79 | align_thresh = float(arg) 80 | elif opt in ('-i', '--in'): 81 | result_file = arg 82 | elif opt in ('-h', '--help'): 83 | usage() 84 | sys.exit(0) 85 | else: 86 | usage() 87 | sys.exit(2) 88 | 89 | try: ## Tries to parse the results file. 90 | parse_results(result_file, e_val_thresh, ident_thresh, align_thresh) 91 | except: ## Otherwise, shows usage. 92 | sys.exit(1) 93 | 94 | if __name__ == "__main__": 95 | main() 96 | -------------------------------------------------------------------------------- /additional_scripts/dexseq_count.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import sys, itertools, optparse, warnings 3 | 4 | optParser = optparse.OptionParser( 5 | 6 | usage = "python %prog [options] <flattened_gff_file> <alignment_file> <output_file>", 7 | 8 | description= 9 | "This script counts how many reads in <alignment_file> fall onto each exonic " + 10 | "part given in <flattened_gff_file> and outputs a list of counts in " + 11 | "<output_file>, for further analysis with the DEXSeq Bioconductor package. " + 12 | "Notes: Use dexseq_prepare_annotation.py to produce <flattened_gff_file>. 
" + 13 | " may be '-' to indicate standard input.", 14 | 15 | epilog = 16 | "Written by Simon Anders (sanders@fs.tum.de) and Alejandro Reyes (reyes@embl.de), " + 17 | "European Molecular Biology Laboratory (EMBL). (c) 2010-2013. Released under the " + 18 | " terms of the GNU General Public License v3. Part of the 'DEXSeq' package." ) 19 | 20 | optParser.add_option( "-p", "--paired", type="choice", dest="paired", 21 | choices = ( "no", "yes" ), default = "no", 22 | help = "'yes' or 'no'. Indicates whether the data is paired-end (default: no)" ) 23 | 24 | optParser.add_option( "-s", "--stranded", type="choice", dest="stranded", 25 | choices = ( "yes", "no", "reverse" ), default = "yes", 26 | help = "'yes', 'no', or 'reverse'. Indicates whether the data is " + 27 | "from a strand-specific assay (default: yes ). " + 28 | "Be sure to switch to 'no' if you use a non strand-specific RNA-Seq library " + 29 | "preparation protocol. 'reverse' inverts strands and is needed for certain " + 30 | "protocols, e.g. paired-end with circularization." ) 31 | 32 | optParser.add_option( "-a", "--minaqual", type="int", dest="minaqual", 33 | default = 10, 34 | help = "skip all reads with alignment quality lower than the given " + 35 | "minimum value (default: 10)" ) 36 | 37 | optParser.add_option( "-f", "--format", type="choice", dest="alignment", 38 | choices=("sam", "bam"), default="sam", 39 | help = "'sam' or 'bam'. Format of (default: sam)" ) 40 | 41 | optParser.add_option( "-r", "--order", type="choice", dest="order", 42 | choices=("pos", "name"), default="name", 43 | help = "'pos' or 'name'. Sorting order of (default: name). Paired-end sequencing " + 44 | "data must be sorted either by position or by read name, and the sorting order " + 45 | "must be specified. Ignored for single-end data." ) 46 | 47 | 48 | if len( sys.argv ) == 1: 49 | optParser.print_help() 50 | sys.exit(1) 51 | 52 | (opts, args) = optParser.parse_args() 53 | 54 | if len( args ) != 3: 55 | sys.stderr.write( sys.argv[0] + ": Error: Please provide three arguments.\n" ) 56 | sys.stderr.write( " Call with '-h' to get usage information.\n" ) 57 | sys.exit( 1 ) 58 | 59 | try: 60 | import HTSeq 61 | except ImportError: 62 | sys.stderr.write( "Could not import HTSeq. Please install the HTSeq Python framework\n" ) 63 | sys.stderr.write( "available from http://www-huber.embl.de/users/anders/HTSeq\n" ) 64 | sys.exit(1) 65 | 66 | gff_file = args[0] 67 | sam_file = args[1] 68 | out_file = args[2] 69 | stranded = opts.stranded == "yes" or opts.stranded == "reverse" 70 | reverse = opts.stranded == "reverse" 71 | is_PE = opts.paired == "yes" 72 | alignment = opts.alignment 73 | minaqual = opts.minaqual 74 | order = opts.order 75 | 76 | if alignment == "bam": 77 | try: 78 | import pysam 79 | except ImportError: 80 | sys.stderr.write( "Could not import pysam, which is needed to process BAM file (though\n" ) 81 | sys.stderr.write( "not to process text SAM files). 
Please install the 'pysam' library from\n" ) 82 | sys.stderr.write( "https://code.google.com/p/pysam/\n" ) 83 | sys.exit(1) 84 | 85 | 86 | 87 | if sam_file == "-": 88 | sam_file = sys.stdin 89 | 90 | 91 | # Step 1: Read in the GFF file as generated by aggregate_genes.py 92 | # and put everything into a GenomicArrayOfSets 93 | 94 | features = HTSeq.GenomicArrayOfSets( "auto", stranded=stranded ) 95 | for f in HTSeq.GFF_Reader( gff_file ): 96 | if f.type == "exonic_part": 97 | f.name = f.attr['gene_id'] + ":" + f.attr['exonic_part_number'] 98 | features[f.iv] += f.name 99 | 100 | # initialise counters 101 | num_reads = 0 102 | counts = {} 103 | counts[ '_empty' ] = 0 104 | counts[ '_ambiguous' ] = 0 105 | counts[ '_lowaqual' ] = 0 106 | counts[ '_notaligned' ] = 0 107 | counts['_ambiguous_readpair_position'] = 0 108 | 109 | # put a zero for each feature ID 110 | for iv, s in features.steps(): 111 | for f in s: 112 | counts[ f ] = 0 113 | 114 | #We need this little helper below: 115 | def reverse_strand( s ): 116 | if s == "+": 117 | return "-" 118 | elif s == "-": 119 | return "+" 120 | else: 121 | raise SystemError("illegal strand") 122 | 123 | def update_count_vector( counts, rs ): 124 | if( type(rs) == str): 125 | counts[ rs ] += 1 126 | else: 127 | for f in rs: 128 | counts[f] += 1 129 | return counts 130 | 131 | 132 | def map_read_pair(af, ar): 133 | rs = set() 134 | if af and ar and not af.aligned and not ar.aligned: 135 | return '_notaligned' 136 | if af and ar and not af.aQual < minaqual and ar.aQual < minaqual: 137 | return '_lowaqual' 138 | if af and af.aligned and af.aQual >= minaqual and af.iv.chrom in list(features.chrom_vectors.keys()): 139 | for cigop in af.cigar: 140 | if cigop.type != "M": 141 | continue 142 | if reverse: 143 | cigop.ref_iv.strand = reverse_strand( cigop.ref_iv.strand ) 144 | for iv, s in features[cigop.ref_iv].steps(): 145 | rs = rs.union( s ) 146 | if ar and ar.aligned and ar.aQual >= minaqual and ar.iv.chrom in list(features.chrom_vectors.keys()): 147 | for cigop in ar.cigar: 148 | if cigop.type != "M": 149 | continue 150 | if not reverse: 151 | cigop.ref_iv.strand = reverse_strand( cigop.ref_iv.strand ) 152 | for iv, s in features[cigop.ref_iv].steps(): 153 | rs = rs.union( s ) 154 | set_of_gene_names = set( [ f.split(":")[0] for f in rs ] ) 155 | if len( set_of_gene_names ) == 0: 156 | return '_empty' 157 | elif len( set_of_gene_names ) > 1: 158 | return '_ambiguous' 159 | else: 160 | return rs 161 | 162 | 163 | def clean_read_queue( queue, current_position ): 164 | clean_queue = dict( queue ) 165 | for i in queue: 166 | if queue[i].mate_start.pos < current_position: 167 | warnings.warn( "Read "+ i + " claims to have an aligned mate that could not be found in the same chromosome." 
) 168 | del clean_queue[i] 169 | return clean_queue 170 | 171 | 172 | if alignment == "sam": 173 | reader = HTSeq.SAM_Reader 174 | else: 175 | reader = HTSeq.BAM_Reader 176 | 177 | 178 | # Now go through the aligned reads 179 | num_reads = 0 180 | 181 | if not is_PE: 182 | for a in reader( sam_file ): 183 | if not a.aligned: 184 | counts[ '_notaligned' ] += 1 185 | continue 186 | if "NH" in a.optional_fields and a.optional_field("NH") > 1: 187 | continue 188 | if a.aQual < minaqual: 189 | counts[ '_lowaqual' ] += 1 190 | continue 191 | rs = set() 192 | for cigop in a.cigar: 193 | if cigop.type != "M": 194 | continue 195 | if reverse: 196 | cigop.ref_iv.strand = reverse_strand( cigop.ref_iv.strand ) 197 | for iv, s in features[cigop.ref_iv].steps( ): 198 | rs = rs.union( s ) 199 | set_of_gene_names = set( [ f.split(":")[0] for f in rs ] ) 200 | if len( set_of_gene_names ) == 0: 201 | counts[ '_empty' ] += 1 202 | elif len( set_of_gene_names ) > 1: 203 | counts[ '_ambiguous' ] +=1 204 | else: 205 | for f in rs: 206 | counts[ f ] += 1 207 | num_reads += 1 208 | if num_reads % 100000 == 0: 209 | sys.stderr.write( "%d reads processed.\n" % num_reads ) 210 | 211 | else: # paired-end 212 | alignments = dict() 213 | if order == "name": 214 | for af, ar in HTSeq.pair_SAM_alignments( reader( sam_file ) ): 215 | if af == None or ar == None: 216 | continue 217 | if not ar.aligned: 218 | continue 219 | if not af.aligned: 220 | continue 221 | elif ar.optional_field("NH") > 1 or af.optional_field("NH") > 1: 222 | continue 223 | elif af.iv.chrom != ar.iv.chrom: 224 | counts['_ambiguous_readpair_position'] += 1 225 | continue 226 | else: 227 | rs = map_read_pair( af, ar ) 228 | counts = update_count_vector( counts, rs ) 229 | num_reads += 1 230 | if num_reads % 100000 == 0: 231 | sys.stderr.write( "%d reads processed.\n" % num_reads ) 232 | 233 | else: 234 | processed_chromosomes = dict() 235 | num_reads = 0 236 | current_chromosome='' 237 | current_position='' 238 | for a in reader( sam_file ): 239 | if not a.aligned: 240 | continue 241 | if a.optional_field("NH") > 1: 242 | continue 243 | if current_chromosome != a.iv.chrom: 244 | if current_chromosome in processed_chromosomes: 245 | raise SystemError("A chromosome that had finished to be processed before was found again in the alignment file, is your alignment file properly sorted by position?") 246 | processed_chromosomes[current_chromosome] = 1 247 | alignments = clean_read_queue( alignments, current_position ) 248 | del alignments 249 | alignments = dict() 250 | if current_chromosome == a.iv.chrom and a.iv.start < current_position: 251 | raise SystemError("Current read position is smaller than previous reads, is your alignment file properly sorted by position?") 252 | current_chromosome = a.iv.chrom 253 | current_position = a.iv.start 254 | if a.read.name and a.mate_aligned: 255 | if a.read.name in alignments: 256 | b = alignments[ a.read.name ] 257 | if a.pe_which == "first" and b.pe_which == "second": 258 | af=a 259 | ar=b 260 | else: 261 | af=b 262 | ar=a 263 | rs = map_read_pair(af, ar) 264 | del alignments[ a.read.name ] 265 | counts = update_count_vector(counts, rs) 266 | else: 267 | if a.mate_start.chrom != a.iv.chrom: 268 | counts['_ambiguous_readpair_position'] += 1 269 | continue 270 | else: 271 | alignments[ a.read.name ] = a 272 | else: 273 | continue 274 | num_reads += 1 275 | if num_reads % 200000 == 0: 276 | alignments = clean_read_queue( alignments, current_position ) 277 | sys.stderr.write( "%d reads processed.\n" % (num_reads / 2) ) 
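# Note added for clarity (not part of the original DEXSeq script): in either branch
# above, the three positional arguments parsed earlier are the flattened GFF from the
# preparation script, the SAM/BAM alignment file and the output counts file. A typical
# paired-end, position-sorted run might look like this (file names are illustrative only):
#
#   python dexseq_count.py -p yes -s no -f bam -r pos \
#       flattened.gff sample1.sorted.bam sample1_exon_counts.txt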
278 | 279 | 280 | 281 | # Step 3: Write out the results 282 | 283 | fout = open( out_file, "w" ) 284 | for fn in sorted( counts.keys() ): 285 | fout.write( "%s\t%d\n" % ( fn, counts[fn] ) ) 286 | fout.close() 287 | -------------------------------------------------------------------------------- /additional_scripts/dexseq_prepare_annotation_fixed.py: -------------------------------------------------------------------------------- 1 | import sys, collections, itertools, os.path, optparse 2 | 3 | optParser = optparse.OptionParser( 4 | 5 | usage = "python %prog [options] ", 6 | 7 | description= 8 | "Script to prepare annotation for DEXSeq." + 9 | "This script takes an annotation file in Ensembl GTF format" + 10 | "and outputs a 'flattened' annotation file suitable for use " + 11 | "with the count_in_exons.py script ", 12 | 13 | epilog = 14 | "Written by Simon Anders (sanders@fs.tum.de), European Molecular Biology " + 15 | "Laboratory (EMBL). (c) 2010. Released under the terms of the GNU General " + 16 | "Public License v3. Part of the 'DEXSeq' package." ) 17 | 18 | optParser.add_option( "-r", "--aggregate", type="choice", dest="aggregate", 19 | choices = ( "no", "yes" ), default = "yes", 20 | help = "'yes' or 'no'. Indicates whether two or more genes sharing an exon should be merged into an 'aggregate gene'. If 'no', the exons that can not be assiged to a single gene are ignored." ) 21 | 22 | (opts, args) = optParser.parse_args() 23 | 24 | if len( args ) != 2: 25 | sys.stderr.write( "Script to prepare annotation for DEXSeq.\n\n" ) 26 | sys.stderr.write( "Usage: python %s \n\n" % os.path.basename(sys.argv[0]) ) 27 | sys.stderr.write( "This script takes an annotation file in Ensembl GTF format\n" ) 28 | sys.stderr.write( "and outputs a 'flattened' annotation file suitable for use\n" ) 29 | sys.stderr.write( "with the count_in_exons.py script.\n" ) 30 | sys.exit(1) 31 | 32 | try: 33 | import HTSeq 34 | except ImportError: 35 | sys.stderr.write( "Could not import HTSeq. Please install the HTSeq Python framework\n" ) 36 | sys.stderr.write( "available from http://www-huber.embl.de/users/anders/HTSeq\n" ) 37 | sys.exit(1) 38 | 39 | 40 | 41 | 42 | gtf_file = args[0] 43 | out_file = args[1] 44 | 45 | aggregateGenes = opts.aggregate == "yes" 46 | 47 | # Step 1: Store all exons with their gene and transcript ID 48 | # in a GenomicArrayOfSets 49 | 50 | exons = HTSeq.GenomicArrayOfSets( "auto", stranded=True ) 51 | for f in HTSeq.GFF_Reader( gtf_file ): 52 | if f.type != "exon": 53 | continue 54 | f.attr['gene_id'] = f.iv.chrom + '_' + f.attr['gene_id'].replace( ":", "_" ) + f.iv.strand 55 | exons[f.iv] += ( f.attr['gene_id'], f.attr['transcript_id'] ) 56 | 57 | 58 | # Step 2: Form sets of overlapping genes 59 | 60 | # We produce the dict 'gene_sets', whose values are sets of gene IDs. Each set 61 | # contains IDs of genes that overlap, i.e., share bases (on the same strand). 62 | # The keys of 'gene_sets' are the IDs of all genes, and each key refers to 63 | # the set that contains the gene. 64 | # Each gene set forms an 'aggregate gene'. 65 | 66 | if aggregateGenes == True: 67 | gene_sets = collections.defaultdict( lambda: set() ) 68 | for iv, s in exons.steps(): 69 | # For each step, make a set, 'full_set' of all the gene IDs occuring 70 | # in the present step, and also add all those gene IDs, whch have been 71 | # seen earlier to co-occur with each of the currently present gene IDs. 
72 | full_set = set() 73 | for gene_id, transcript_id in s: 74 | full_set.add( gene_id ) 75 | full_set |= gene_sets[ gene_id ] 76 | # Make sure that all genes that are now in full_set get associated 77 | # with full_set, i.e., get to know about their new partners 78 | for gene_id in full_set: 79 | assert gene_sets[ gene_id ] <= full_set 80 | gene_sets[ gene_id ] = full_set 81 | 82 | 83 | # Step 3: Go through the steps again to get the exonic sections. Each step 84 | # becomes an 'exonic part'. The exonic part is associated with an 85 | # aggregate gene, i.e., a gene set as determined in the previous step, 86 | # and a transcript set, containing all transcripts that occur in the step. 87 | # The results are stored in the dict 'aggregates', which contains, for each 88 | # aggregate ID, a list of all its exonic_part features. 89 | 90 | aggregates = collections.defaultdict( lambda: list() ) 91 | for iv, s in exons.steps( ): 92 | # Skip empty steps 93 | if len(s) == 0: 94 | continue 95 | gene_id = list(s)[0][0] 96 | ## if aggregateGenes=FALSE, ignore the exons associated to more than one gene ID 97 | if aggregateGenes == False: 98 | check_set = set() 99 | for geneID, transcript_id in s: 100 | check_set.add( geneID ) 101 | if( len( check_set ) > 1 ): 102 | continue 103 | else: 104 | aggregate_id = gene_id 105 | # Take one of the gene IDs, find the others via gene sets, and 106 | # form the aggregate ID from all of them 107 | else: 108 | assert set( gene_id for gene_id, transcript_id in s ) <= gene_sets[ gene_id ] 109 | aggregate_id = '+'.join( gene_sets[ gene_id ] ) 110 | # Make the feature and store it in 'aggregates' 111 | f = HTSeq.GenomicFeature( aggregate_id, "exonic_part", iv ) 112 | f.source = os.path.basename( sys.argv[0] ) 113 | # f.source = "camara" 114 | f.attr = {} 115 | f.attr[ 'gene_id' ] = aggregate_id 116 | transcript_set = set( ( transcript_id for gene_id, transcript_id in s ) ) 117 | f.attr[ 'transcripts' ] = '+'.join( transcript_set ) 118 | aggregates[ aggregate_id ].append( f ) 119 | 120 | 121 | # Step 4: For each aggregate, number the exonic parts 122 | 123 | aggregate_features = [] 124 | for l in list(aggregates.values()): 125 | for i in range( len(l)-1 ): 126 | assert l[i].name == l[i+1].name, str(l[i+1]) + " has wrong name" 127 | assert l[i].iv.end <= l[i+1].iv.start, str(l[i+1]) + " starts too early" 128 | if l[i].iv.chrom != l[i+1].iv.chrom: 129 | raise ValueError("Same name found on two chromosomes: %s, %s" % ( str(l[i]), str(l[i+1]) )) 130 | if l[i].iv.strand != l[i+1].iv.strand: 131 | raise ValueError("Same name found on two strands: %s, %s" % ( str(l[i]), str(l[i+1]) )) 132 | aggr_feat = HTSeq.GenomicFeature( l[0].name, "aggregate_gene", 133 | HTSeq.GenomicInterval( l[0].iv.chrom, l[0].iv.start, 134 | l[-1].iv.end, l[0].iv.strand ) ) 135 | aggr_feat.source = os.path.basename( sys.argv[0] ) 136 | aggr_feat.attr = { 'gene_id': aggr_feat.name } 137 | for i in range( len(l) ): 138 | l[i].attr['exonic_part_number'] = "%03d" % ( i+1 ) 139 | aggregate_features.append( aggr_feat ) 140 | 141 | 142 | # Step 5: Sort the aggregates, then write everything out 143 | 144 | aggregate_features.sort( key = lambda f: ( f.iv.chrom, f.iv.start ) ) 145 | 146 | fout = open( out_file, "w" ) 147 | for aggr_feat in aggregate_features: 148 | fout.write( aggr_feat.get_gff_line() ) 149 | for f in aggregates[ aggr_feat.name ]: 150 | fout.write( f.get_gff_line() ) 151 | 152 | fout.close() 153 | -------------------------------------------------------------------------------- 
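The two DEXSeq helper scripts above are meant to be run in sequence: the flattened annotation written by dexseq_prepare_annotation_fixed.py is the first positional argument expected by dexseq_count.py. A minimal sketch of that workflow, assuming an Ensembl-style GTF and a coordinate-sorted BAM (file names are illustrative, not shipped with this repository):

    python additional_scripts/dexseq_prepare_annotation_fixed.py -r yes genes.gtf flattened.gff
    python additional_scripts/dexseq_count.py -p yes -s no -f bam -r pos flattened.gff sample1.sorted.bam sample1_exon_counts.txt

Both scripts require the HTSeq package, and BAM input additionally requires pysam, as the import guards above check.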
/additional_scripts/download_proteome_uniprot.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use warnings; 4 | use strict; 5 | use LWP::UserAgent; 6 | use HTTP::Date; 7 | 8 | # Check that a taxonomy identifier was passed as a command line argument 9 | if (!$ARGV[0]) { 10 | die "Error: No taxonomy identifier specified.\nUsage: perl download_proteome_uniprot.pl \n"; 11 | } 12 | 13 | # Taxonomy identifier of top node for query, e.g. 2 for Bacteria, 2157 for Archaea, etc. 14 | # (see https://www.uniprot.org/taxonomy) 15 | my $top_node = $ARGV[0]; 16 | 17 | # Create a user agent for making HTTP requests 18 | my $agent = LWP::UserAgent->new; 19 | 20 | # Get a list of all reference proteomes of organisms below the given taxonomy node. 21 | my $query_list = "https://rest.uniprot.org/proteomes/stream?query=reference:true+taxonomy_id:$top_node&format=list"; 22 | 23 | my $response_list = $agent->get($query_list); 24 | 25 | # Check for HTTP errors 26 | if (!$response_list->is_success) { 27 | die 'Failed to get proteome list: ' . $response_list->status_line . 28 | ' for ' . $response_list->request->uri . "\n"; 29 | } 30 | 31 | # For each proteome, mirror its set of UniProt entries in compressed FASTA format. 32 | for my $proteome (split(/\n/, $response_list->content)) { 33 | my $file = $proteome . '.fasta.gz'; 34 | my $query_proteome = "https://rest.uniprot.org/uniprotkb/stream?query=proteome:$proteome&format=fasta&compressed=true"; 35 | my $response_proteome = $agent->mirror($query_proteome, $file); 36 | 37 | # Check for HTTP errors 38 | if ($response_proteome->is_success) { 39 | my $release = $response_proteome->header('x-uniprot-release'); 40 | my $date = $response_proteome->header('x-uniprot-release-date'); 41 | print "File $file: downloaded entries of UniProt release $release ($date)\n"; 42 | } 43 | elsif ($response_proteome->code == HTTP::Status::RC_NOT_MODIFIED) { 44 | print "File $file: up-to-date\n"; 45 | } 46 | else { 47 | die 'Failed to download proteome: ' . $response_proteome->status_line . 48 | ' for ' . $response_proteome->request->uri . 
"\n"; 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /additional_scripts/homolog_parser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | ### Libraries 5 | 6 | # conda install -c anaconda pandas 7 | # conda install --channel conda-forge --channel bioconda pybedtools 8 | # pip install xlrd==1.2.0 9 | 10 | import sys 11 | import csv 12 | import argparse 13 | import pandas as pd 14 | import numpy as np 15 | import pybedtools 16 | 17 | np.random.seed(5) 18 | 19 | parser = argparse.ArgumentParser(description="This script will annotate and add genomic coordinates to Novel proteins not included in reference GTF annotations.") 20 | parser.add_argument('--Ref', help="Ref_Transcript_Annotation.csv file, obtained with isoform-identification pipeline", type=str) 21 | parser.add_argument('--blastp', help="parsed_results.tab file, blastp result from Novel proteins against UniProt database", type=str) 22 | parser.add_argument('--bed', help="final_annotated.format.bed, which correspond to final_annotation.gtf file as BED format", type=str) 23 | parser.add_argument('--eggnog', help="out.emapper.annotations.xlsx file, which correspond to eggNOG-mapper annotations", type=str) 24 | args = parser.parse_args() 25 | 26 | name = sys.argv[0] 27 | REFERENCE = str(sys.argv[2]) 28 | BLASTP = str(sys.argv[4]) 29 | BED = str(sys.argv[6]) 30 | EGGNOG = str(sys.argv[8]) 31 | 32 | class bcolors: 33 | HEADER = '\033[95m' 34 | OKBLUE = '\033[94m' 35 | OKCYAN = '\033[96m' 36 | OKGREEN = '\033[92m' 37 | OKRED = '\033[91m' 38 | FAIL = '\033[91m' 39 | ENDC = '\033[0m' 40 | BOLD = '\033[1m' 41 | UNDERLINE = '\033[4m' 42 | 43 | 44 | print(bcolors.OKGREEN + "1 ::: Reading Ref_Transcript_Annotation.csv ::: " + bcolors.ENDC) 45 | print("") 46 | 47 | 48 | df1 = pd.read_csv(REFERENCE, sep = ',') 49 | df1 = df1.rename(columns={'qry_id': 'transcript'}) 50 | print("Number of transcripts matching reference:", df1.shape[0]) 51 | print("") 52 | 53 | 54 | df1.head(5) 55 | 56 | print(bcolors.OKGREEN + "2 ::: Reading parsed_results.tab ::: " + bcolors.ENDC) 57 | print("") 58 | 59 | 60 | df2 = pd.read_csv(BLASTP, sep = '\t') 61 | print("Number of novel proteins sharing >=90% identity with uniprot:", df2.shape[0]) 62 | print("") 63 | 64 | 65 | df2.head(5) 66 | 67 | 68 | print(bcolors.OKGREEN + " ::: formatting columns of blastp_results :::" + bcolors.ENDC) 69 | print("") 70 | 71 | df2[['hit_id','Transcript Description']] = df2['hit_id'].str.split(" ", 1 ,expand=True) 72 | df2 73 | 74 | 75 | df2[['Transcript Description', 'organism']] = df2['Transcript Description'].str.split('OS=',expand=True) 76 | df2 77 | 78 | print(df2.head(5)) 79 | 80 | blastp_results = df2.iloc[:, 0:7] 81 | blastp_results.columns = ['transcript', 'hit_id', 'percentage_identity', 'query_length', 'alignment_length', 'e_value', 'Transcript Description'] 82 | 83 | 84 | print(blastp_results.head(5)) 85 | 86 | 87 | print(bcolors.OKGREEN + "3 ::: Reading final_annotated.format.bed ::: " + bcolors.ENDC) 88 | print("") 89 | 90 | 91 | df3 = pd.read_csv(BED, sep = '\t') 92 | print("Number of trascripts in bed format:", df3.shape[0]) 93 | print("") 94 | df3.columns = ['chr', 'start', 'end', 'transcript', 'gene'] 95 | 96 | 97 | print(df3.head(5)) 98 | 99 | 100 | print(bcolors.OKGREEN + "4 ::: Adding genomic coordinates to blastp_results ::: " + bcolors.ENDC) 101 | print("") 102 | 103 | 104 | result1 = pd.merge(blastp_results, df3, on='transcript', 
how='inner') 105 | result1 106 | 107 | 108 | print(bcolors.OKGREEN + "5 ::: Adding genomic coordinates to Novel proteins and Ref_Transcript_Annotation ::: " + bcolors.ENDC) 109 | print("") 110 | 111 | 112 | result2 = pd.merge(df1, df3, on='transcript', how='inner') 113 | result2 114 | 115 | 116 | Novel_protein_hits = result1.loc[:, ['chr', 'start', 'end', 'transcript', 'Transcript Description', 'percentage_identity']] 117 | print("Novel protein hits:") 118 | print(Novel_protein_hits.head(5)) 119 | print("") 120 | 121 | Reference_annotation = result2.loc[:, ['chr', 'start', 'end', 'transcript', 'Transcript Description', 'NCBI RefSeq Gene ID']] 122 | print("Reference annotations:") 123 | print(Reference_annotation.head(5)) 124 | print("") 125 | 126 | print(bcolors.OKGREEN + " 6 ::: Creating bed files with annotations ::: " + bcolors.ENDC) 127 | print("") 128 | a = pybedtools.BedTool.from_dataframe(Novel_protein_hits) 129 | b = pybedtools.BedTool.from_dataframe(Reference_annotation) 130 | a.saveas('Novel_protein_with_coordinates.bed') 131 | b.saveas('Reference_annotation_with_coordinates.bed') 132 | 133 | print(bcolors.OKGREEN + " 7 ::: Parsing eggNOG-mapper annotations and intersect with blastp results::: " + bcolors.ENDC) 134 | print("") 135 | 136 | df1 = pd.read_excel(EGGNOG) 137 | 138 | df1.head(5) 139 | 140 | df1 = df1.rename(columns={'query': 'transcript'}) 141 | 142 | intersect1 = pd.merge(df1, blastp_results, on='transcript', how='inner') 143 | Novel_protein_hits_coords = result1.loc[:, ['chr', 'start', 'end', 'transcript']] 144 | intersect2 = pd.merge(Novel_protein_hits_coords, intersect1, on='transcript', how='inner') 145 | intersect2 = intersect2.rename(columns={'hit_id': 'blastp_hit_id', 'percentage_identity': 'blastp_percentage_identity', 'e_value': 'blastp_e_value', 'Transcript Description': 'blastp_transcript_description'}) 146 | intersect2.to_csv('eggNOG-mapper-blastp-intersections.csv') 147 | 148 | print(intersect2.head(5)) 149 | print("") 150 | 151 | print("filtering table with intersections with assigned Gene IDs: ") 152 | intersect3 = intersect2.replace('-', np.nan) 153 | intersect3 = intersect3.dropna(subset=['Preferred_name']) 154 | c = pybedtools.BedTool.from_dataframe(intersect3) 155 | c.saveas('eggNOG-mapper-blastp-with_coordinates.bed') 156 | 157 | print("") 158 | print(bcolors.OKGREEN + "Novel_protein_with_coordinates.bed ==> corresponds to a BED file containing novel proteins with mapped coordinates" + bcolors.ENDC) 159 | print("") 160 | print(bcolors.OKGREEN + "Reference_annotation_with_coordinates.bed ==> corresponds to a BED file containing annotated proteins with mapped coordinates" + bcolors.ENDC) 161 | print("") 162 | print(bcolors.OKGREEN + "eggNOG-mapper-blastp-intersections.csv ==> corresponds to a csv file containing eggNOG-mapper + blastp intersections" + bcolors.ENDC) 163 | print("") 164 | print(bcolors.OKGREEN + "eggNOG-mapper-blastp-with_coordinates.bed ==> corresponds to a BED file containing eggNOG-mapper + blastp intersections" + bcolors.ENDC) 165 | print("") 166 | print("All Done") 167 | print("") 168 | -------------------------------------------------------------------------------- /additional_scripts/transcriptome_metrics.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | usage="$(basename "$0") [-h] [-f ] [-g ] 6 | This script will obtain metrics from the annotated StringTie transcripts (final_annotated.gtf) and output them into -transcriptome_metrics- sudirectory. 
7 | Arguments: 8 | -h show this help text 9 | -f Name of the StringTie annotated GTF from the pipeline 10 | -g Reference genome (in fasta format)" 11 | options=':hf:g:' 12 | while getopts $options option; do 13 | case "$option" in 14 | h) echo "$usage"; exit;; 15 | f) f=$OPTARG;; 16 | g) g=$OPTARG;; 17 | :) printf "missing argument for -%s\n" "$OPTARG" >&2; echo "$usage" >&2; exit 1;; 18 | \?) printf "illegal option: -%s\n" "$OPTARG" >&2; echo "$usage" >&2; exit 1;; 19 | esac 20 | done 21 | 22 | # mandatory arguments 23 | if [ ! "$f" ] || [ ! "$g" ]; then 24 | echo "arguments -f and -g must be provided" 25 | echo "$usage" >&2; exit 1 26 | fi 27 | 28 | begin=`date +%s` 29 | # .---------- constant part! 30 | # vvvv vvvv-- the code from above 31 | YELLOW='\033[1;33m' 32 | PURPLE='\033[0;35m' 33 | CYAN='\033[0;36m' 34 | NC='\033[0m' # No Color 35 | echo "Cleaning directory..." 36 | rm -r -f transcriptome_metrics 37 | echo "" 38 | echo "done" 39 | echo "" 40 | echo "===> Working on transcriptome metrics" 41 | echo "" 42 | perl -lne 'print "@m" if @m=(/((?:transcript_id|gene_id)\s+\S+)/g);' ${f} > final_annotated.tab 43 | sed -i 's/transcript_id //g' final_annotated.tab 44 | sed -i 's/;/\t/g' final_annotated.tab 45 | sed -i 's/gene_id//g' final_annotated.tab 46 | sed -i 's/"//g' final_annotated.tab 47 | awk '!a[$0]++' final_annotated.tab > genes_and_transcripts.tab && rm final_annotated.tab 48 | awk '{print $1"\t"$2}' genes_and_transcripts.tab > genes-and-transcripts.tab && rm genes_and_transcripts.tab 49 | awk '{print $1}' genes-and-transcripts.tab > genes.tab 50 | # Novel genes list 51 | grep "STRG." genes.tab > novel-genes.tab 52 | # Known genes list 53 | grep -v "STRG." genes.tab > known-genes.tab 54 | echo "::: Parsing final_annotated.gtf file to obtain novel/known and coding/lncRNA transcripts, respectively." 
55 | echo "" 56 | grep -w -F -f novel-genes.tab final_annotated.gtf > novel-genes.gtf 57 | grep -w -F -f known-genes.tab final_annotated.gtf > known-genes.gtf 58 | grep "coding" known-genes.gtf > known-genes-coding.gtf 59 | grep "lncRNA" known-genes.gtf > known-genes-lncRNA.gtf 60 | grep "StringTie" known-genes.gtf > known-genes-other.gtf # other = no lncRNA and no protein-coding 61 | grep "coding" novel-genes.gtf > novel-genes-coding.gtf 62 | grep "lncRNA" novel-genes.gtf > novel-genes-lncRNA.gtf 63 | grep "StringTie" novel-genes.gtf > novel-genes-other.gtf # other = no lncRNA and no protein-coding 64 | echo "::: We will use gffread to obtain reconciled and novel transcripts in the parsed GTF file" 65 | echo "" 66 | gffread -w known-transcripts-coding.fa -g ${g} known-genes-coding.gtf 67 | gffread -w known-transcripts-lncRNA.fa -g ${g} known-genes-lncRNA.gtf 68 | gffread -w known-transcripts-other.fa -g ${g} known-genes-other.gtf 69 | gffread -w novel-transcripts-coding.fa -g ${g} novel-genes-coding.gtf 70 | gffread -w novel-transcripts-lncRNA.fa -g ${g} novel-genes-lncRNA.gtf 71 | gffread -w novel-transcripts-other.fa -g ${g} novel-genes-other.gtf 72 | exec 3<> transcriptome_metrics.txt 73 | echo "Number of reconciled coding transcripts:" >> transcriptome_metrics.txt 74 | grep ">" known-transcripts-coding.fa -c >> transcriptome_metrics.txt 75 | echo "" >> transcriptome_metrics.txt 76 | echo "Number of reconciled non-coding transcripts:" >> transcriptome_metrics.txt 77 | grep ">" known-transcripts-lncRNA.fa -c >> transcriptome_metrics.txt 78 | echo "" >> transcriptome_metrics.txt 79 | echo "Number of other expressed features, annotated:" >> transcriptome_metrics.txt 80 | grep ">" known-transcripts-other.fa -c >> transcriptome_metrics.txt 81 | echo "" >> transcriptome_metrics.txt 82 | echo "Number of non-annotated coding transcripts:" >> transcriptome_metrics.txt 83 | grep ">" novel-transcripts-coding.fa -c >> transcriptome_metrics.txt 84 | echo "" >> transcriptome_metrics.txt 85 | echo "Number of non-annotated non-coding novel transcripts:" >> transcriptome_metrics.txt 86 | grep ">" novel-transcripts-lncRNA.fa -c >> transcriptome_metrics.txt 87 | echo "" >> transcriptome_metrics.txt 88 | echo "Number of novel expressed features:" >> transcriptome_metrics.txt 89 | grep ">" novel-transcripts-other.fa -c >> transcriptome_metrics.txt 90 | exec 3>&- 91 | echo "::: Done. transcriptome_metrics.txt contains metrics of classified transcripts. Continue with gene metrics.." 
92 | echo "" 93 | echo "" 94 | echo "===> Working on gene metrics" 95 | echo "" 96 | exec 3<> gene_metrics.txt 97 | # known coding genes counts 98 | perl -lne 'print "@m" if @m=(/((?:transcript_id|gene_id)\s+\S+)/g);' known-genes-coding.gtf > known-genes-coding.tab 99 | sed -i 's/transcript_id //g' known-genes-coding.tab 100 | sed -i 's/;/\t/g' known-genes-coding.tab 101 | sed -i 's/gene_id//g' known-genes-coding.tab 102 | sed -i 's/"//g' known-genes-coding.tab 103 | awk '{print $1}' known-genes-coding.tab > known-genes-coding.tabular && rm known-genes-coding.tab 104 | awk '!a[$0]++' known-genes-coding.tabular > known-genes-coding.tab && rm known-genes-coding.tabular 105 | echo "Number of reconciled coding genes:" >> gene_metrics.txt 106 | cat known-genes-coding.tab | wc -l >> gene_metrics.txt 107 | echo "" >> gene_metrics.txt 108 | echo "Number of reconciled non-coding genes:" >> gene_metrics.txt 109 | perl -lne 'print "@m" if @m=(/((?:transcript_id|gene_id)\s+\S+)/g);' known-genes-lncRNA.gtf > known-genes-lncRNA.tab 110 | sed -i 's/transcript_id //g' known-genes-lncRNA.tab 111 | sed -i 's/;/\t/g' known-genes-lncRNA.tab 112 | sed -i 's/gene_id//g' known-genes-lncRNA.tab 113 | sed -i 's/"//g' known-genes-lncRNA.tab 114 | awk '{print $1}' known-genes-lncRNA.tab > known-genes-lncRNA.tabular && rm known-genes-lncRNA.tab 115 | awk '!a[$0]++' known-genes-lncRNA.tabular > known-genes-lncRNA.tab && rm known-genes-lncRNA.tabular 116 | cat known-genes-lncRNA.tab | wc -l >> gene_metrics.txt 117 | echo "" >> gene_metrics.txt 118 | echo "Number of other reconciled genes:" >> gene_metrics.txt 119 | perl -lne 'print "@m" if @m=(/((?:transcript_id|gene_id)\s+\S+)/g);' known-genes-other.gtf > known-genes-other.tab 120 | sed -i 's/transcript_id //g' known-genes-other.tab 121 | sed -i 's/;/\t/g' known-genes-other.tab 122 | sed -i 's/gene_id//g' known-genes-other.tab 123 | sed -i 's/"//g' known-genes-other.tab 124 | awk '{print $1}' known-genes-other.tab > known-genes-other.tabular && rm known-genes-other.tab 125 | awk '!a[$0]++' known-genes-other.tabular > known-genes-other.tab && rm known-genes-other.tabular 126 | cat known-genes-other.tab | wc -l >> gene_metrics.txt 127 | echo "" >> gene_metrics.txt 128 | echo "Number of non-annotated coding genes:" >> gene_metrics.txt 129 | perl -lne 'print "@m" if @m=(/((?:transcript_id|gene_id)\s+\S+)/g);' novel-genes-coding.gtf > novel-genes-coding.tab 130 | sed -i 's/transcript_id //g' novel-genes-coding.tab 131 | sed -i 's/;/\t/g' novel-genes-coding.tab 132 | sed -i 's/gene_id//g' novel-genes-coding.tab 133 | sed -i 's/"//g' novel-genes-coding.tab 134 | awk '{print $1}' novel-genes-coding.tab > novel-genes-coding.tabular && rm novel-genes-coding.tab 135 | awk '!a[$0]++' novel-genes-coding.tabular > novel-genes-coding.tab && rm novel-genes-coding.tabular 136 | cat novel-genes-coding.tab | wc -l >> gene_metrics.txt 137 | echo "" >> gene_metrics.txt 138 | echo "Number of non-annotated non-coding genes:" >> gene_metrics.txt 139 | perl -lne 'print "@m" if @m=(/((?:transcript_id|gene_id)\s+\S+)/g);' novel-genes-lncRNA.gtf > novel-genes-lncRNA.tab 140 | sed -i 's/transcript_id //g' novel-genes-lncRNA.tab 141 | sed -i 's/;/\t/g' novel-genes-lncRNA.tab 142 | sed -i 's/gene_id//g' novel-genes-lncRNA.tab 143 | sed -i 's/"//g' novel-genes-lncRNA.tab 144 | awk '{print $1}' novel-genes-lncRNA.tab > novel-genes-lncRNA.tabular && rm novel-genes-lncRNA.tab 145 | awk '!a[$0]++' novel-genes-lncRNA.tabular > novel-genes-lncRNA.tab && rm novel-genes-lncRNA.tabular 146 | cat 
novel-genes-lncRNA.tab | wc -l >> gene_metrics.txt 147 | echo "" >> gene_metrics.txt 148 | echo "Number of non-annotated other genes:" >> gene_metrics.txt 149 | perl -lne 'print "@m" if @m=(/((?:transcript_id|gene_id)\s+\S+)/g);' novel-genes-other.gtf > novel-genes-other.tab 150 | sed -i 's/transcript_id //g' novel-genes-other.tab 151 | sed -i 's/;/\t/g' novel-genes-other.tab 152 | sed -i 's/gene_id//g' novel-genes-other.tab 153 | sed -i 's/"//g' novel-genes-other.tab 154 | awk '{print $1}' novel-genes-other.tab > novel-genes-other.tabular && rm novel-genes-other.tab 155 | awk '!a[$0]++' novel-genes-other.tabular > novel-genes-other.tab && rm novel-genes-other.tabular 156 | cat novel-genes-other.tab | wc -l >> gene_metrics.txt 157 | exec 3>&- 158 | echo "::: gene_metrics.txt were succesfully generated." 159 | echo "" 160 | mkdir transcriptome_metrics 161 | mv genes-and-transcripts.tab genes.tab novel-genes.tab known-genes.tab novel-genes.gtf known-genes.gtf known-genes-coding.gtf known-genes-lncRNA.gtf known-genes-other.gtf novel-genes-coding.gtf novel-genes-lncRNA.gtf novel-genes-other.gtf known-transcripts-coding.fa known-transcripts-lncRNA.fa known-transcripts-other.fa novel-transcripts-coding.fa novel-transcripts-lncRNA.fa novel-transcripts-other.fa known-genes-coding.tab known-genes-lncRNA.tab known-genes-other.tab novel-genes-coding.tab novel-genes-lncRNA.tab novel-genes-other.tab ./transcriptome_metrics/ 162 | mv transcriptome_metrics.txt gene_metrics.txt ./transcriptome_metrics/ 163 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::${NC}\n" 164 | echo "All Done. The transcripts were classified in the ./transcriptome_metrics subdirectory." 165 | echo "" 166 | echo "Transcript discoveries are summarized in transcriptome_metrics.txt file." 167 | echo "" 168 | echo "Gene discoveries are summarized in gene_metrics.txt file." 169 | echo "" 170 | echo "known-genes-coding.gtf, known-genes-lncRNA.gtf and known-genes-other.gtf contains reconciled annotation with reference, in GTF format" 171 | echo "" 172 | echo "novel-genes-coding.gtf, novel-genes-lncRNA.gtf and novel-genes-other.gtf contains novel annotation with reference, in GTF format" 173 | echo "" 174 | echo "known-transcripts-coding.fa, known-transcripts-lncRNA.fa and known-transcripts-other.fa contains reconciled classified transcripts, in FASTA format" 175 | echo "" 176 | echo "novel-transcripts-coding.fa, novel-transcripts-lncRNA.fa and novel-transcripts-other.fa contains novel classified transcripts, in FASTA format" 177 | echo "" 178 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::${NC}\n" 179 | -------------------------------------------------------------------------------- /bash_scripts/add_ncbi_annotation.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | { 6 | 7 | usage="$(basename "$0") [-h] [-a ] [-n ] [-r ] [-g ] [-c ] [-t ] [-o ] 8 | This pipeline will Overlap StringTie transcripts (GTF format) with current NCBI annotation and will annotate novel transcripts. 
9 | Arguments: 10 | -h show this help text 11 | -a StringTie GTF 12 | -n NCBI gene annotation (in GTF format) 13 | -r UCSC gene annotation (in GTF format) 14 | -g Reference genome (in fasta format) 15 | -c GAWN config file (path to gawn_config.sh in annotate_my_genomes folder) 16 | -t Number of threads for processing (integer) 17 | -o output folder (must exist)" 18 | options=':ha:n:r:g:c:t:o:' 19 | while getopts $options option; do 20 | case "$option" in 21 | h) echo "$usage"; exit;; 22 | a) a=$OPTARG;; 23 | n) n=$OPTARG;; 24 | r) r=$OPTARG;; 25 | g) g=$OPTARG;; 26 | c) c=$OPTARG;; 27 | t) t=$OPTARG;; 28 | o) o=$OPTARG;; 29 | :) printf "missing argument for -%s\n" "$OPTARG" >&2; echo "$usage" >&2; exit 1;; 30 | \?) printf "illegal option: -%s\n" "$OPTARG" >&2; echo "$usage" >&2; exit 1;; 31 | esac 32 | done 33 | 34 | # mandatory arguments 35 | if [ ! "$a" ] || [ ! "$n" ] || [ ! "$r" ] || [ ! "$g" ] || [ ! "$c" ] || [ ! "$t" ] || [ ! "$o" ]; then 36 | echo "arguments -a, -n, -r, -g, -c, -t and -o must be provided" 37 | echo "$usage" >&2; exit 1 38 | fi 39 | 40 | # Conditions : output folder 41 | if [ ! -d "$o" ]; then 42 | echo "Output directory: $o not found. Please create the output directory first, before running the pipeline." 43 | exit 9999 # die with error code 9999 44 | fi 45 | 46 | # Conditions : Input existance if [ ! "$a" ] || [ ! "$n" ] || [ ! "$r" ] || [ ! "$g" ] || [ ! "$c" ] || [ ! "$t" ] || [ ! "$o" ]; then 47 | 48 | if [ ! -e "$a" ]; then 49 | echo "$a does not exist. Check your -a input" 50 | exit 9999 # die with error code 9999 51 | fi 52 | 53 | if [ ! -e "$n" ]; then 54 | echo "$n does not exist. Check your -n input" 55 | exit 9999 # die with error code 9999 56 | fi 57 | 58 | if [ ! -e "$r" ]; then 59 | echo "$r does not exist. Check your -r input" 60 | exit 9999 # die with error code 9999 61 | fi 62 | 63 | if [ ! -e "$g" ]; then 64 | echo "$g does not exist. Check your -g input" 65 | exit 9999 # die with error code 9999 66 | fi 67 | 68 | if [ ! -e "$c" ]; then 69 | echo "$c does not exist. Check your -c input" 70 | exit 9999 # die with error code 9999 71 | fi 72 | 73 | # Conditions : Getting absolute path of inputs 74 | echo "" 75 | a_DIR="$( cd "$( dirname "$a" )" && pwd )" 76 | echo "" 77 | echo "::: The absolute path of -a is $a_DIR" 78 | echo "" 79 | n_DIR="$( cd "$( dirname "$n" )" && pwd )" 80 | echo "" 81 | echo "::: The absolute path of -n is $n_DIR" 82 | echo "" 83 | r_DIR="$( cd "$( dirname "$r" )" && pwd )" 84 | echo "" 85 | echo "::: The absolute path of -r is $r_DIR" 86 | echo "" 87 | g_DIR="$( cd "$( dirname "$g" )" && pwd )" 88 | echo "" 89 | echo "::: The absolute path of -g is $g_DIR" 90 | echo "" 91 | c_DIR="$( cd "$( dirname "$c" )" && pwd )" 92 | echo "" 93 | echo "::: The absolute path of -c is $c_DIR" 94 | echo "" 95 | o_DIR="$( cd "$( dirname "$o" )" && pwd )" 96 | echo "" 97 | echo "::: The absolute path of -o is $o_DIR" 98 | echo "" 99 | 100 | 101 | begin=`date +%s` 102 | # .---------- constant part! 
103 | # vvvv vvvv-- the code from above 104 | YELLOW='\033[1;33m' 105 | PURPLE='\033[0;35m' 106 | CYAN='\033[0;36m' 107 | NC='\033[0m' # No Color 108 | 109 | 110 | printf "${YELLOW}::: Defining Variables :::\n" 111 | echo "" 112 | echo "Defining variables:" 113 | echo"" 114 | FILE1="$a" 115 | basename "$FILE1" 116 | stringtie_input="$(basename -- $FILE1)" 117 | echo "The stringtie file used as input is the following: $stringtie_input" 118 | echo "" 119 | FILE2="$n" 120 | basename "$FILE2" 121 | ncbi_reference_gtf="$(basename -- $FILE2)" 122 | echo "The NCBI reference GTF used as input is the following: $ncbi_reference_gtf" 123 | echo "" 124 | FILE3="$r" 125 | basename "$FILE3" 126 | reference_gtf="$(basename -- $FILE3)" 127 | echo "The reference GTF used as input is the following: $reference_gtf" 128 | echo "" 129 | FILE4="$g" 130 | basename "$FILE4" 131 | reference_genome="$(basename -- $FILE4)" 132 | echo "The reference genome used as input is the following: $reference_genome" 133 | echo "" 134 | FILE5="$c" 135 | basename "$FILE5" 136 | gawn_config="$(basename -- $FILE5)" 137 | echo "The gawn_config file used as input is the following: $gawn_config" 138 | echo "" 139 | FILE6="$t" 140 | basename "$FILE6" 141 | threads="$(basename -- $FILE6)" 142 | echo "The number of threads for calculation are the following: $threads" 143 | echo "" 144 | FILE7="$o" 145 | basename "$FILE7" 146 | output_folder="$(basename -- $FILE7)" 147 | echo "The output folder is the following: $output_folder" 148 | echo "" 149 | 150 | printf "${YELLOW}:::::::::::::::::::::::::::::::\n" 151 | printf "${YELLOW}::: 0. Defining directories :::\n" 152 | printf "${YELLOW}:::::::::::::::::::::::::::::::${CYAN}\n" 153 | echo "" 154 | 155 | dir0=$(cd -P -- "$(dirname -- "$0")" && pwd -P) 156 | 157 | sec=$(date "+%Y%m%d_%H%M%S") 158 | # mkdir add_ncbi_annotation_$sec 159 | 160 | if [ -z "$(ls -A ${o_DIR}/${output_folder})" ]; then 161 | echo "" 162 | echo "Output folder is empty. We will work inside the provided output folder: " 163 | cd ${o_DIR}/${output_folder} 164 | echo "" 165 | else 166 | echo "" 167 | echo "Output folder is not empty. Creating temporary folder:" 168 | sec=$(date "+%Y%m%d_%H%M%S") 169 | cd ${o_DIR}/${output_folder} 170 | mkdir add_ncbi_annotation_$sec && cd add_ncbi_annotation_$sec 171 | fi 172 | 173 | # cd ${o_DIR}/${output_folder} 174 | 175 | if [ -f $stringtie_input ]; then 176 | echo "" 177 | echo "$stringtie_input file found in output directory. Continue." 178 | echo "" 179 | : 180 | else 181 | echo "" 182 | echo "Copying $stringtie_input file into the output directory:" 183 | cp ${a_DIR}/${stringtie_input} ./ 184 | echo "" 185 | fi 186 | 187 | # cp ${a_DIR}/${stringtie_input} ${o_DIR}/${output_folder} 188 | # cd add_ncbi_annotation_$sec 189 | 190 | dir1=$(cd -P -- "$(dirname -- "$0")" && pwd -P) 191 | echo "" 192 | echo "Current Working Directory:" 193 | echo "" 194 | echo $dir1 195 | echo "" 196 | printf "${YELLOW}::: Done :::\n" 197 | echo "" 198 | 199 | printf "${YELLOW}:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n" 200 | printf "${YELLOW}::: 1. Overlapping StringTie transcripts with NCBI annotation :::\n" 201 | printf "${YELLOW}:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::${CYAN}\n" 202 | echo "" 203 | 204 | gffcompare -R -r ${n_DIR}/${ncbi_reference_gtf} -s ${g_DIR}/${reference_genome} -o NCBI_compare ${stringtie_input} 205 | echo "Done." 
206 | printf "${PURPLE}::: Done :::\n" 207 | echo "" 208 | 209 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::\n" 210 | printf "${YELLOW}::: 2. Writting novel discoveries to Stats.txt :::\n" 211 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::${CYAN}\n" 212 | 213 | echo "" 214 | # Stats 215 | exec 3<> Stats.txt 216 | echo "Number of assembled genes:" >> Stats.txt 217 | cat NCBI_compare.${stringtie_input}.tmap | sed "1d" | cut -f4 | sort | uniq | wc -l >> Stats.txt 218 | echo "" >> Stats.txt 219 | echo "Number of novel genes:" >> Stats.txt 220 | cat NCBI_compare.${stringtie_input}.tmap | awk '$3=="u"{print $0}' | cut -f4 | sort | uniq | wc -l >> Stats.txt 221 | echo "" >> Stats.txt 222 | echo "Number of novel transcripts:" >> Stats.txt 223 | cat NCBI_compare.${stringtie_input}.tmap | awk '$3=="u"{print $0}' | cut -f5 | sort | uniq | wc -l >> Stats.txt 224 | echo "" >> Stats.txt 225 | echo "Number of transcripts matching annotation:" >> Stats.txt 226 | cat NCBI_compare.${stringtie_input}.tmap | awk '$3=="="{print $0}' | cut -f5 | sort | uniq | wc -l >> Stats.txt 227 | exec 3>&- 228 | printf "${PURPLE}Done\n" 229 | echo "" 230 | 231 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n" 232 | printf "${YELLOW}::: 3. Replacing gene_id field in final_annotated.gtf file with NCBI gene_id's :::\n" 233 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::${CYAN}\n" 234 | 235 | echo "" 236 | ####################################### 237 | # Merging novel transcripts with ref. # 238 | ####################################### 239 | awk '{print $4"\t"$1}' NCBI_compare.${stringtie_input}.tmap > NCBI_compare.${stringtie_input}.tmap.1 240 | tail -n +2 NCBI_compare.${stringtie_input}.tmap.1 > NCBI_compare.${stringtie_input}.tmap.2 241 | awk '$2 != "-"' NCBI_compare.${stringtie_input}.tmap.2 > namelist 242 | awk '!a[$0]++' namelist > namelist_unique 243 | tac namelist_unique > namelist_unique_sorted 244 | rm namelist namelist_unique 245 | awk '{print $1}' namelist_unique_sorted > A 246 | awk '{print $2}' namelist_unique_sorted > B 247 | sed 's/^/"/' A > A.1 248 | sed 's/$/"/' A.1 > A.2 249 | sed 's/^/"/' B > B.1 250 | sed 's/$/"/' B.1 > B.2 251 | paste -d'\t' A.2 B.2 > namelist 252 | rm A A.1 A.2 B B.1 B.2 253 | ############################### 254 | # Getting gene names replaced # 255 | ############################### 256 | awk '{print $1}' namelist > fileA 257 | awk '{print $2}' namelist > fileB 258 | paste -d % fileA fileB > sed.script 259 | sed -i -e 's/^/s%/' sed.script 260 | sed -i -e 's/$/%/' sed.script 261 | cat ${a_DIR}/${stringtie_input} | parallel --pipe -j ${t} sed -f sed.script > final_annotated.gtf 262 | rm -f fileA fileB *tmap.1 *tmap.2 263 | # sorting GTF file 264 | rm -r -f gff3sort 265 | git clone https://github.com/cfarkas/gff3sort.git 266 | perl ./gff3sort/gff3sort.pl final_annotated.gtf > final_annotated.sorted.gtf 267 | rm final_annotated.gtf 268 | mv final_annotated.sorted.gtf final_annotated.gtf 269 | printf "${PURPLE}::: Done. 
Gene_id field was replaced in the StringTie.gtf file and final_annotated.gtf was generated with these changes\n" 270 | echo "" 271 | printf "${PURPLE}::: Moving gffcompare results to gffcompare_outputs folder ...\n" 272 | echo "" 273 | rm -r -f gffcompare_outputs_NCBI 274 | mkdir gffcompare_outputs_NCBI 275 | mv *.loci *.stats *.refmap *.tmap *.tracking ./gffcompare_outputs_NCBI 276 | echo "" 277 | printf "${PURPLE}::: Done\n" 278 | echo "" 279 | 280 | printf "${YELLOW}:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n" 281 | printf "${YELLOW}::: 4. Obtaining Transcripts in FASTA format with gffread :::\n" 282 | printf "${YELLOW}:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::${CYAN}\n" 283 | 284 | echo "" 285 | gffread -w NCBI_transcripts.fa -g ${g_DIR}/${reference_genome} final_annotated.gtf 286 | echo "" 287 | printf "${PURPLE}::: Done. NCBI_transcripts.fa are located in current directory\n" 288 | echo "" 289 | 290 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n" 291 | printf "${YELLOW}::: 5. Performing gene annotation by using GAWN pipeline :::\n" 292 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::${CYAN}\n" 293 | 294 | ################################################################ 295 | # Configuring Gawn Inputs, config file and running GAWN pipeline 296 | ################################################################ 297 | 298 | echo "" 299 | printf "${PURPLE}::: Downloading GAWN annotation folder. See https://github.com/enormandeau/gawn.git${CYAN}\n" 300 | echo "" 301 | rm -r -f gawn 302 | git clone https://github.com/cfarkas/gawn.git 303 | cd gawn/02_infos/ 304 | dir2=$(cd -P -- "$(dirname -- "$0")" && pwd -P) 305 | echo "Done" 306 | echo "" 307 | cd ${dir1} 308 | cp ${g_DIR}/${reference_genome} ${dir1}/gawn/03_data/genome.fasta 309 | cp NCBI_transcripts.fa ${dir1}/gawn/03_data/transcriptome.fasta 310 | rm ${dir2}/gawn_config.sh 311 | cp ${c_DIR}/${gawn_config} ${dir2}/gawn_config.sh 312 | echo "" 313 | printf "${PURPLE}::: Starting GAWN transcript annotation${CYAN}\n" 314 | echo "" 315 | cd ${dir1}/gawn/ 316 | ./gawn 02_infos/gawn_config.sh 317 | echo "" 318 | printf "${PURPLE}::: Done. The novel transcripts were annotated in ./gawn/04_annotation/ :::${CYAN}\n" 319 | echo "" 320 | 321 | ################################# 322 | # Extracting transcriptome hits # 323 | ################################# 324 | 325 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::\n" 326 | printf "${YELLOW}::: 6. Extracting transcriptome hits :::\n" 327 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::${CYAN}\n" 328 | 329 | echo "" 330 | cd ${dir1}/ 331 | cp ${dir1}/gawn/04_annotation/transcriptome.swissprot ${dir1} 332 | cp ${dir1}/gawn/04_annotation/transcriptome.hits ${dir1} 333 | printf "${PURPLE}::: Done. transcriptome hits were succesfully extracted :::${CYAN}\n" 334 | echo "" 335 | 336 | ############################################ 337 | # FEELnc long noncoding RNA identification # 338 | ############################################ 339 | 340 | cd ${dir1} 341 | 342 | printf "${YELLOW}:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n" 343 | printf "${YELLOW}::: 7. 
Classifying protein-coding and long non-coding transcripts with FEELnc :::\n" 344 | printf "${YELLOW}:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::${CYAN}\n" 345 | 346 | grep "NM_" ${r_DIR}/${reference_gtf} > NM_coding.gtf 347 | echo "" 348 | printf "${PURPLE}::: 1/3) Filtering transcripts :::${CYAN}\n" 349 | # Filter 350 | FEELnc_filter.pl -i final_annotated.gtf -a NM_coding.gtf -b transcript_biotype=protein_coding > candidate_lncRNA.gtf 351 | rm -r -f ${g_DIR}/${reference_genome}.index 352 | printf "${PURPLE}::: 2/3) Evaluating coding potential :::${CYAN}\n" 353 | # Coding_Potential 354 | FEELnc_codpot.pl -i candidate_lncRNA.gtf -a NM_coding.gtf -b transcript_biotype=protein_coding -g ${g_DIR}/${reference_genome} --mode=shuffle 355 | printf "${PURPLE}::: 3/3) Classifiyng lncRNA transcripts :::${CYAN}\n" 356 | # Classifier 357 | FEELnc_classifier.pl -i feelnc_codpot_out/candidate_lncRNA.gtf.lncRNA.gtf -a NM_coding.gtf > candidate_lncRNA_classes.txt 358 | echo "" 359 | printf "${PURPLE}::: FEELnc calculations were done. The output is called candidate_lncRNA_classes.txt :::\n" 360 | echo "" 361 | 362 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::\n" 363 | printf "${YELLOW}::: 8. Parsing GAWN and FEELnc outputs :::\n" 364 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::${CYAN}\n" 365 | 366 | echo "" 367 | cd ${dir1} 368 | awk '{print $3}' candidate_lncRNA_classes.txt > lncRNA_genes 369 | tail -n +2 lncRNA_genes > lncRNA_transcripts 370 | rm lncRNA_genes 371 | grep -w -F -f lncRNA_transcripts final_annotated.gtf > merged.fixed.lncRNAs.gtf 372 | grep --invert-match -F -f lncRNA_transcripts final_annotated.gtf > merged.fixed.coding.gtf 373 | rm final_annotated.gtf 374 | sed -i 's/StringTie/lncRNA/' merged.fixed.lncRNAs.gtf 375 | awk '{print $1"\t"$2}' transcriptome.hits > coding_list 376 | awk -F'\t' '$2!=""' coding_list > coding_transcripts 377 | awk '{print $1}' coding_transcripts > coding_transcripts.tab 378 | rm coding_lis* coding_transcripts lncRNA_transcripts 379 | grep -w -F -f coding_transcripts.tab merged.fixed.coding.gtf > coding-genes.gtf 380 | grep --invert-match -F -f coding_transcripts.tab merged.fixed.coding.gtf > other-genes.gtf 381 | cat coding-genes.gtf merged.fixed.lncRNAs.gtf other-genes.gtf > final_annotated.gtf 382 | rm coding_transcripts.tab 383 | # sorting GTF file 384 | perl ./gff3sort/gff3sort.pl coding-genes.gtf > coding-genes.sorted.gtf 385 | rm coding-genes.gtf 386 | mv coding-genes.sorted.gtf coding-genes.gtf 387 | echo "All done" 388 | echo "" 389 | ########################################## 390 | # Gene Prediction Step with TransDecoder # 391 | ########################################## 392 | cd ${dir1} 393 | echo "" 394 | 395 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n" 396 | printf "${YELLOW}::: 9. 
Predicting coding regions from transcripts with coding potential using TransDecoder :::\n" 397 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::${CYAN}\n" 398 | 399 | echo "" 400 | gffread -w coding-transcripts.fa -g ${g_DIR}/${reference_genome} coding-genes.gtf 401 | TransDecoder.LongOrfs -m 60 -t coding-transcripts.fa 402 | TransDecoder.Predict -t coding-transcripts.fa --single_best_only 403 | awk '{print $1}' coding-transcripts.fa.transdecoder.bed > coding.sequences 404 | tail -n +2 coding.sequences > coding.hits && rm coding.sequences 405 | echo "" 406 | printf "${PURPLE}::: Done. coding-transcripts.fa.transdecoder.gff3 file is present in current directory...${CYAN}\n" 407 | echo "" 408 | 409 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n" 410 | printf "${YELLOW}::: 10. Converting gff3 to GTF format and formatting coding sequences and proteins :::\n" 411 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::${CYAN}\n" 412 | 413 | echo "" 414 | sed 's/Name=.*$//' coding-transcripts.fa.transdecoder.gff3 > coding-transcripts.fa.test.gff3 415 | sed -i 's/ID=GENE[.]/ID=/'g coding-transcripts.fa.test.gff3 416 | sed -i 's/Parent=GENE[.]/Parent=/'g coding-transcripts.fa.test.gff3 417 | sed -i 's/~~/;protein_id=/'g coding-transcripts.fa.test.gff3 418 | gffread coding-transcripts.fa.test.gff3 -T -P -g NCBI_transcripts.fa -o coding_transcripts.gtf 419 | rm coding-transcripts.fa.test.gff3 420 | # removing protein id by expansion 421 | sed -i 's/[.]p[0-9]//'g coding_transcripts.gtf 422 | sed -i 's/[.]p[0-9][0-9]//'g coding_transcripts.gtf 423 | sed -i 's/[.]p[0-9][0-9][0-9]//'g coding_transcripts.gtf 424 | sed -i 's/[.]p[0-9][0-9][0-9][0-9]//'g coding_transcripts.gtf 425 | sed -i 's/[.]p[0-9][0-9][0-9][0-9][0-9]//'g coding_transcripts.gtf 426 | # 427 | # obtaining cds.fa and prot.fa from coding_transcripts.gtf 428 | echo "" 429 | echo "::: Obtaining cds.fa and prot.fa from coding_transcripts.gtf" 430 | echo "" 431 | gffread -x cds.fa -g NCBI_transcripts.fa coding_transcripts.gtf 432 | gffread -y prot.fa -g NCBI_transcripts.fa coding_transcripts.gtf 433 | echo "done" 434 | rm coding-transcripts.fa coding-genes.gtf merged.fixed.lncRNAs.gtf other-genes.gtf 435 | grep "StringTie" final_annotated.gtf > genes.gtf 436 | grep "lncRNA" final_annotated.gtf > lncRNAs.gtf 437 | grep -w -F -f coding.hits genes.gtf > coding-genes.gtf 438 | grep --invert-match -F -f coding.hits genes.gtf > other-genes.gtf 439 | sed -i 's/StringTie/coding/' coding-genes.gtf 440 | cat coding-genes.gtf lncRNAs.gtf other-genes.gtf > final_annotated.gtf 441 | echo "" 442 | echo "::: Parsing transcriptome hits" 443 | echo "" 444 | grep -w -F -f coding.hits transcriptome.swissprot > coding.annotation 445 | rm transcriptome.swissprot 446 | mv coding.annotation transcriptome.swissprot 447 | echo "done" 448 | # sorting GTF file 449 | echo "" 450 | echo "::: Sorting final_annotated.gtf" 451 | echo "" 452 | perl ./gff3sort/gff3sort.pl final_annotated.gtf > final_annotated.sorted.gtf 453 | echo "done" 454 | rm final_annotated.gtf 455 | mv final_annotated.sorted.gtf final_annotated.gtf 456 | rm coding-genes.gtf lncRNAs.gtf other-genes.gtf transcriptome.hits 457 | ### Novel coding genes and correspondent proteins 458 | echo "" 459 | echo "::: Obtaining novel coding transcripts (cds) and correspondent proteins" 460 | echo "" 461 | # 462 | wget 
https://raw.githubusercontent.com/cfarkas/annotate_my_genomes/master/additional_scripts/transcriptome_metrics.sh 463 | bash transcriptome_metrics.sh -f final_annotated.gtf -g ${g_DIR}/${reference_genome} 464 | cp ./transcriptome_metrics/known-genes-coding.gtf ./ 465 | cp ./transcriptome_metrics/novel-genes-coding.gtf ./ 466 | cp ./transcriptome_metrics/novel-transcripts-lncRNA.fa ./ 467 | cp ./transcriptome_metrics/known-transcripts-lncRNA.fa ./ 468 | # 469 | perl -lne 'print "@m" if @m=(/((?:transcript_id|gene_id)\s+\S+)/g);' novel-genes-coding.gtf > novel_annotated.tab 470 | awk '{print $(NF)}' novel_annotated.tab > novel-coding-transcripts.matches 471 | sed -i 's/;//g' novel-coding-transcripts.matches 472 | sed -i 's/"//g' novel-coding-transcripts.matches 473 | awk '!a[$0]++' novel-coding-transcripts.matches > novel-coding-transcripts.tab && rm novel-coding-transcripts.matches 474 | mv novel-coding-transcripts.tab novel-coding-transcripts.matches 475 | # 476 | seqkit fx2tab cds.fa > cds.tab 477 | seqkit fx2tab prot.fa > prot.tab 478 | grep -w -F -f novel-coding-transcripts.matches cds.tab > novel-coding-cds.tab 479 | grep -w -F -f novel-coding-transcripts.matches prot.tab > novel-coding-prot.tab 480 | seqkit tab2fx novel-coding-cds.tab > novel-cds.fa && seqkit tab2fx novel-coding-prot.tab > novel-prot.fa 481 | rm -r -f novel-coding-cds.tab novel-coding-prot.tab novel-coding-transcripts.matches cds.tab prot.tab 482 | # obtaining final gff file 483 | echo "" 484 | echo "::: Obtaining final gff file" 485 | echo "" 486 | gffread -E -F --merge final_annotated.gtf -o final_annotated.gff 487 | rm -r -f gff3sort 488 | echo "done" 489 | echo "" 490 | rm merged.fixed.coding.gtf namelist namelist_unique_sorted coding.hits 491 | 492 | ############################### 493 | # Configuring Summary Results # 494 | ############################### 495 | 496 | printf "${YELLOW}:::::::::::::::::::::::::::::::::::::::::::::::::::::\n" 497 | printf "${YELLOW}::: 11. 
Moving results to the specified directory :::\n" 498 | printf "${YELLOW}:::::::::::::::::::::::::::::::::::::::::::::::::::::${CYAN}\n" 499 | 500 | echo "" 501 | printf "${PURPLE}::: Moving results to the specified directory :::${CYAN}\n" 502 | rm -r -f output_files 503 | mkdir output_files 504 | mv candidate_lncRNA_classes.txt final_annotated.gtf final_annotated.gff NCBI_transcripts.fa cds.fa prot.fa Stats.txt coding_transcripts.gtf transcriptome.swissprot novel-cds.fa novel-prot.fa sed.script novel-transcripts-lncRNA.fa known-transcripts-lncRNA.fa known-genes-coding.gtf novel-genes-coding.gtf ./output_files 505 | rm -r -f *feelncfilter.log genes.gtf pipeliner* NM_coding.gtf candidate_lncRNA.gtf* coding-transcripts.fa.transdecoder_dir.__* NCBI_transcripts.fa.fai 506 | rm -r -f transdecoder 507 | mkdir transdecoder 508 | mv coding-transcripts.fa.transdecoder.* ./transdecoder 509 | mv NCBI_compare.annotated.gtf ./gffcompare_outputs_NCBI 510 | cp ${dir1}/gffcompare_outputs_NCBI/NCBI_compare.${stringtie_input}.tmap ./ 511 | mv NCBI_compare.${stringtie_input}.tmap gffcompare.tmap 512 | mv gffcompare.tmap ./output_files/ 513 | 514 | # cd ${dir0} 515 | # mv add_ncbi_annotation_$sec ${o_DIR}/${output_folder} 516 | 517 | cd ${dir0} 518 | 519 | echo "Done" 520 | echo "" 521 | printf "${YELLOW}::: Done:::\n" 522 | echo "" 523 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n" 524 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n" 525 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::${CYAN}\n" 526 | echo "" 527 | echo "The following files are available in ${dir1}/output_files : " 528 | echo "" 529 | echo "Transcript discoveries are summarized in Stats.txt file. GAWN protein annotation is called transcriptome.hits" 530 | echo "" 531 | echo "gffcompare.tmap file contains Best Reference Transcript for each assembled transcript" 532 | echo "" 533 | echo "GTF file named final_annotated.gtf (and correspondent gff file) contain novel genes and lncRNA classification (second field in GTF file)" 534 | echo "" 535 | echo "candidate_lncRNA_classes.txt contained detailed long non-coding classification of transcripts" 536 | echo "" 537 | echo "Associated FASTA file to this GTF correspond to NCBI_transcripts.fa file" 538 | echo "" 539 | echo "TransDecoder GTF file suitable to parse NCBI_transcripts.fa (coding_transcripts.gtf), contains all coding transcripts resolved by TransDecoder" 540 | echo "" 541 | echo "Predicted coding sequences and correspondent protein sequences were named cds.fa and prot.fa, respectively" 542 | echo "" 543 | echo "Novel predicted coding sequences and correspondent protein sequences were named novel-cds.fa and novel-prot.fa, respectively" 544 | echo "" 545 | echo "Novel and Known predicted lncRNAs were named novel-transcripts-lncRNA.fa and known-transcripts-lncRNA.fa, respectively" 546 | echo "" 547 | echo "Novel and Known coding genes were named novel-genes-coding.gtf and known-genes-coding.gtf, respectively" 548 | echo "" 549 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n" 550 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n" 551 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::${NC}\n" 552 | end=`date +%s` 553 | elapsed=`expr $end - $begin` 554 | echo Time taken: $elapsed 555 | # 556 | } | tee logfile_add_ncbi_annotation_$seconds 557 | # 558 | 
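The add_ncbi_annotation.sh pipeline above expects a StringTie assembly, the NCBI and UCSC reference GTFs, the genome FASTA, a GAWN configuration file, a thread count and an existing output directory. A minimal sketch of an invocation, assuming the bundled test/gawn_config.sh is used; all other paths are illustrative:

    mkdir -p output_dir
    bash bash_scripts/add_ncbi_annotation.sh \
        -a stringtie.gtf \
        -n ncbi_annotation.gtf \
        -r ucsc_annotation.gtf \
        -g genome.fasta \
        -c test/gawn_config.sh \
        -t 8 \
        -o output_dir

The script checks that the output directory exists before running, so it must be created beforehand.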
-------------------------------------------------------------------------------- /bash_scripts/annotate_my_genomes.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | { 6 | 7 | usage="$(basename "$0") [-h] [-a ] [-r ] [-g ] [-c ] [-t ] 8 | This pipeline will Overlap StringTie transcripts (GTF format) with current UCSC annotation and will annotate novel transcripts. 9 | Arguments: 10 | -h show this help text 11 | -a StringTie GTF 12 | -r UCSC gene annotation (in GTF format) 13 | -g Reference genome (in fasta format) 14 | -c GAWN config file (path to gawn_config.sh in annotate_my_genomes folder) 15 | -t Number of threads for processing (integer) 16 | -o output folder (must exist)" 17 | options=':ha:r:g:c:t:o:' 18 | while getopts $options option; do 19 | case "$option" in 20 | h) echo "$usage"; exit;; 21 | a) a=$OPTARG;; 22 | r) r=$OPTARG;; 23 | g) g=$OPTARG;; 24 | c) c=$OPTARG;; 25 | t) t=$OPTARG;; 26 | o) o=$OPTARG;; 27 | :) printf "missing argument for -%s\n" "$OPTARG" >&2; echo "$usage" >&2; exit 1;; 28 | \?) printf "illegal option: -%s\n" "$OPTARG" >&2; echo "$usage" >&2; exit 1;; 29 | esac 30 | done 31 | 32 | # mandatory arguments 33 | if [ ! "$a" ] || [ ! "$r" ] || [ ! "$g" ] || [ ! "$c" ] || [ ! "$t" ] || [ ! "$o" ]; then 34 | echo "" 35 | echo "arguments -a, -r, -g, -c, -t and -o must be provided" 36 | echo "" 37 | echo "$usage" >&2; exit 1 38 | fi 39 | 40 | # Conditions : output folder 41 | if [ ! -d "$o" ]; then 42 | echo "" 43 | echo "Output directory: $o not found. Please create the output directory first, before running the pipeline." 44 | echo "" 45 | exit 9999 # die with error code 9999 46 | fi 47 | 48 | # Conditions : Input existance 49 | 50 | if [ ! -e "$a" ]; then 51 | echo "" 52 | echo "$a does not exist. Check your -a input" 53 | echo "" 54 | exit 9999 # die with error code 9999 55 | fi 56 | 57 | if [ ! -e "$r" ]; then 58 | echo "" 59 | echo "$r does not exist. Check your -r input" 60 | echo "" 61 | exit 9999 # die with error code 9999 62 | fi 63 | 64 | if [ ! -e "$g" ]; then 65 | echo "" 66 | echo "$g does not exist. Check your -g input" 67 | echo "" 68 | exit 9999 # die with error code 9999 69 | fi 70 | 71 | if [ ! -e "$c" ]; then 72 | echo "" 73 | echo "$c does not exist. Check your -c input" 74 | echo "" 75 | exit 9999 # die with error code 9999 76 | fi 77 | 78 | # Conditions : Getting absolute path of inputs 79 | echo "" 80 | a_DIR="$( cd "$( dirname "$a" )" && pwd )" 81 | echo "" 82 | echo "::: The absolute path of -a is $a_DIR" 83 | echo "" 84 | r_DIR="$( cd "$( dirname "$r" )" && pwd )" 85 | echo "" 86 | echo "::: The absolute path of -r is $r_DIR" 87 | echo "" 88 | g_DIR="$( cd "$( dirname "$g" )" && pwd )" 89 | echo "" 90 | echo "::: The absolute path of -g is $g_DIR" 91 | echo "" 92 | c_DIR="$( cd "$( dirname "$c" )" && pwd )" 93 | echo "" 94 | echo "::: The absolute path of -c is $c_DIR" 95 | echo "" 96 | o_DIR="$( cd "$( dirname "$o" )" && pwd )" 97 | echo "" 98 | echo "::: The absolute path of -o is $o_DIR" 99 | echo "" 100 | 101 | begin=`date +%s` 102 | # .---------- constant part! 
103 | # vvvv vvvv-- the code from above 104 | YELLOW='\033[1;33m' 105 | PURPLE='\033[0;35m' 106 | CYAN='\033[0;36m' 107 | NC='\033[0m' # No Color 108 | 109 | printf "${YELLOW}::: Defining Variables :::\n" 110 | echo "" 111 | echo "Defining variables:" 112 | echo"" 113 | FILE1="$a" 114 | basename "$FILE1" 115 | stringtie_input="$(basename -- $FILE1)" 116 | echo "The stringtie file used as input is the following: $stringtie_input" 117 | echo "" 118 | FILE2="$r" 119 | basename "$FILE2" 120 | reference_gtf="$(basename -- $FILE2)" 121 | echo "The reference GTF used as input is the following: $reference_gtf" 122 | echo "" 123 | FILE3="$g" 124 | basename "$FILE3" 125 | reference_genome="$(basename -- $FILE3)" 126 | echo "The reference genome used as input is the following: $reference_genome" 127 | echo "" 128 | FILE4="$c" 129 | basename "$FILE4" 130 | gawn_config="$(basename -- $FILE4)" 131 | echo "The gawn_config file used as input is the following: $gawn_config" 132 | echo "" 133 | FILE5="$t" 134 | basename "$FILE5" 135 | threads="$(basename -- $FILE5)" 136 | echo "The number of threads for calculation are the following: $threads" 137 | echo "" 138 | FILE6="$o" 139 | basename "$FILE6" 140 | output_folder="$(basename -- $FILE6)" 141 | echo "The output folder is the following: $output_folder" 142 | echo "" 143 | 144 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n" 145 | printf "${YELLOW}::: 0. Defining directories and StringTie input data :::\n" 146 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::${CYAN}\n" 147 | echo "" 148 | 149 | dir0=$(cd -P -- "$(dirname -- "$0")" && pwd -P) 150 | 151 | sec=$(date "+%Y%m%d_%H%M%S") 152 | # mkdir annotate_my_genomes_$sec 153 | 154 | if [ -z "$(ls -A ${o_DIR}/${output_folder})" ]; then 155 | echo "" 156 | echo "Output folder is empty. We will work inside the provided output folder:" 157 | echo "" 158 | cd ${o_DIR}/${output_folder} 159 | else 160 | echo "" 161 | echo "Output folder is not empty. Creating temporary folder:" 162 | echo "" 163 | sec=$(date "+%Y%m%d_%H%M%S") 164 | cd ${o_DIR}/${output_folder} 165 | mkdir annotate_my_genomes_$sec && cd annotate_my_genomes_$sec 166 | fi 167 | 168 | # cd ${o_DIR}/${output_folder} 169 | 170 | if [ -f $stringtie_input ]; then 171 | echo "" 172 | echo "$stringtie_input file found in output directory. Continue." 173 | echo "" 174 | : 175 | else 176 | echo "" 177 | echo "Copying $stringtie_input file into the output directory:" 178 | cp ${a_DIR}/${stringtie_input} ./ 179 | echo "" 180 | fi 181 | 182 | # cp ${a_DIR}/${stringtie_input} ${o_DIR}/${output_folder} 183 | # cd annotate_my_genomes_$sec 184 | 185 | dir1=$(cd -P -- "$(dirname -- "$0")" && pwd -P) 186 | echo "" 187 | echo "Current Working Directory:" 188 | echo "" 189 | echo $dir1 190 | echo "" 191 | printf "${YELLOW}::: Done :::\n" 192 | echo "" 193 | 194 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n" 195 | printf "${YELLOW}::: 1. Overlapping StringTie transcripts with UCSC GTF :::\n" 196 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::${CYAN}\n" 197 | echo "" 198 | 199 | gffcompare -R -r ${r_DIR}/${reference_gtf} -s ${g_DIR}/${reference_genome} -o UCSC_compare ${stringtie_input} 200 | echo "Done." 201 | printf "${PURPLE}::: Done :::\n" 202 | echo "" 203 | 204 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::\n" 205 | printf "${YELLOW}::: 2. 
Writting novel discoveries to Stats.txt :::\n" 206 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::${CYAN}\n" 207 | echo "" 208 | 209 | # Stats 210 | exec 3<> Stats.txt 211 | echo "Number of assembled genes:" >> Stats.txt 212 | cat UCSC_compare.${stringtie_input}.tmap | sed "1d" | cut -f4 | sort | uniq | wc -l >> Stats.txt 213 | echo "" >> Stats.txt 214 | echo "Number of novel genes:" >> Stats.txt 215 | cat UCSC_compare.${stringtie_input}.tmap | awk '$3=="u"{print $0}' | cut -f4 | sort | uniq | wc -l >> Stats.txt 216 | echo "" >> Stats.txt 217 | echo "Number of novel transcripts:" >> Stats.txt 218 | cat UCSC_compare.${stringtie_input}.tmap | awk '$3=="u"{print $0}' | cut -f5 | sort | uniq | wc -l >> Stats.txt 219 | echo "" >> Stats.txt 220 | echo "Number of transcripts matching annotation:" >> Stats.txt 221 | cat UCSC_compare.${stringtie_input}.tmap | awk '$3=="="{print $0}' | cut -f5 | sort | uniq | wc -l >> Stats.txt 222 | exec 3>&- 223 | printf "${PURPLE}Done\n" 224 | echo "" 225 | 226 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n" 227 | printf "${YELLOW}::: 3. Replacing gene_id field in final_annotated.gtf file with UCSC gene_id's :::\n" 228 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::${CYAN}\n" 229 | 230 | echo "" 231 | ####################################### 232 | # Merging novel transcripts with ref. # 233 | ####################################### 234 | awk '{print $4"\t"$1}' UCSC_compare.${stringtie_input}.tmap > UCSC_compare.${stringtie_input}.tmap.1 235 | tail -n +2 UCSC_compare.${stringtie_input}.tmap.1 > UCSC_compare.${stringtie_input}.tmap.2 236 | awk '$2 != "-"' UCSC_compare.${stringtie_input}.tmap.2 > namelist 237 | awk '!a[$0]++' namelist > namelist_unique 238 | tac namelist_unique > namelist_unique_sorted 239 | rm namelist namelist_unique 240 | awk '{print $1}' namelist_unique_sorted > A 241 | awk '{print $2}' namelist_unique_sorted > B 242 | sed 's/^/"/' A > A.1 243 | sed 's/$/"/' A.1 > A.2 244 | sed 's/^/"/' B > B.1 245 | sed 's/$/"/' B.1 > B.2 246 | paste -d'\t' A.2 B.2 > namelist 247 | rm A A.1 A.2 B B.1 B.2 248 | ############################### 249 | # Getting gene names replaced # 250 | ############################### 251 | awk '{print $1}' namelist > fileA 252 | awk '{print $2}' namelist > fileB 253 | paste -d % fileA fileB > sed.script 254 | sed -i -e 's/^/s%/' sed.script 255 | sed -i -e 's/$/%/' sed.script 256 | cat ${a_DIR}/${stringtie_input} | parallel --pipe -j ${t} sed -f sed.script > final_annotated.gtf 257 | rm -f fileA fileB *tmap.1 *tmap.2 258 | # sorting GTF file 259 | rm -r -f gff3sort 260 | git clone https://github.com/cfarkas/gff3sort.git 261 | perl ./gff3sort/gff3sort.pl final_annotated.gtf > final_annotated.sorted.gtf 262 | rm final_annotated.gtf 263 | mv final_annotated.sorted.gtf final_annotated.gtf 264 | printf "${PURPLE}::: Done. Gene_id field was replaced in the StringTie.gtf file and final_annotated.gtf was generated with these changes\n" 265 | echo "" 266 | printf "${PURPLE}::: Moving gffcompare results to gffcompare_outputs folder ...\n" 267 | echo "" 268 | rm -r -f gffcompare_outputs_UCSC 269 | mkdir gffcompare_outputs_UCSC 270 | mv *.loci *.stats *.refmap *.tmap *.tracking ./gffcompare_outputs_UCSC 271 | echo "" 272 | 273 | printf "${YELLOW}:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n" 274 | printf "${YELLOW}::: 4. 
Obtaining Transcripts in FASTA format with gffread :::\n" 275 | printf "${YELLOW}:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::${CYAN}\n" 276 | 277 | echo "" 278 | gffread -w transcripts.fa -g ${g_DIR}/${reference_genome} final_annotated.gtf 279 | echo "" 280 | printf "${PURPLE}::: Done. transcripts.fa are located in current directory\n" 281 | echo "" 282 | 283 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n" 284 | printf "${YELLOW}::: 5. Performing gene annotation by using GAWN pipeline :::\n" 285 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::${CYAN}\n" 286 | 287 | ################################################################ 288 | # Configuring Gawn Inputs, config file and running GAWN pipeline 289 | ################################################################ 290 | 291 | echo "" 292 | printf "${PURPLE}::: Downloading GAWN annotation folder. See https://github.com/enormandeau/gawn.git${CYAN}\n" 293 | echo "" 294 | rm -r -f gawn 295 | git clone https://github.com/cfarkas/gawn.git 296 | cd gawn/02_infos/ 297 | dir2=$(cd -P -- "$(dirname -- "$0")" && pwd -P) 298 | echo "Done" 299 | echo "" 300 | cd ${dir1} 301 | cp ${g_DIR}/${reference_genome} ${dir1}/gawn/03_data/genome.fasta 302 | cp transcripts.fa ${dir1}/gawn/03_data/transcriptome.fasta 303 | rm ${dir2}/gawn_config.sh 304 | cp ${c_DIR}/${gawn_config} ${dir2}/gawn_config.sh 305 | echo "" 306 | printf "${PURPLE}::: Starting GAWN transcript annotation${CYAN}\n" 307 | echo "" 308 | cd ${dir1}/gawn/ 309 | ./gawn 02_infos/gawn_config.sh 310 | echo "" 311 | printf "${PURPLE}::: Done. The novel transcripts were annotated in ./gawn/04_annotation/ :::${CYAN}\n" 312 | echo "" 313 | 314 | ################################# 315 | # Extracting transcriptome hits # 316 | ################################# 317 | 318 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::\n" 319 | printf "${YELLOW}::: 6. Extracting transcriptome hits :::\n" 320 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::${CYAN}\n" 321 | 322 | echo "" 323 | cd ${dir1} 324 | cp ${dir1}/gawn/04_annotation/transcriptome.swissprot ${dir1} 325 | cp ${dir1}/gawn/04_annotation/transcriptome.hits ${dir1} 326 | printf "${PURPLE}::: Done. transcriptome hits were succesfully extracted :::${CYAN}\n" 327 | echo "" 328 | 329 | ############################################ 330 | # FEELnc long noncoding RNA identification # 331 | ############################################ 332 | 333 | cd ${dir1} 334 | 335 | printf "${YELLOW}:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n" 336 | printf "${YELLOW}::: 7. 
Classifying protein-coding and long non-coding transcripts with FEELnc :::\n" 337 | printf "${YELLOW}:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::${CYAN}\n" 338 | 339 | grep "NM_" ${r_DIR}/${reference_gtf} > NM_coding.gtf 340 | echo "" 341 | printf "${PURPLE}::: 1/3) Filtering transcripts :::${CYAN}\n" 342 | # Filter 343 | FEELnc_filter.pl -i final_annotated.gtf -a NM_coding.gtf -b transcript_biotype=protein_coding > candidate_lncRNA.gtf 344 | rm -r -f ${g_DIR}/${reference_genome}.index 345 | printf "${PURPLE}::: 2/3) Evaluating coding potential :::${CYAN}\n" 346 | # Coding_Potential 347 | FEELnc_codpot.pl -i candidate_lncRNA.gtf -a NM_coding.gtf -b transcript_biotype=protein_coding -g ${g_DIR}/${reference_genome} --mode=shuffle 348 | printf "${PURPLE}::: 3/3) Classifiyng lncRNA transcripts :::${CYAN}\n" 349 | # Classifier 350 | FEELnc_classifier.pl -i feelnc_codpot_out/candidate_lncRNA.gtf.lncRNA.gtf -a NM_coding.gtf > candidate_lncRNA_classes.txt 351 | echo "" 352 | printf "${PURPLE}::: FEELnc calculations were done. The output is called candidate_lncRNA_classes.txt :::\n" 353 | echo "" 354 | 355 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::\n" 356 | printf "${YELLOW}::: 8. Parsing GAWN and FEELnc outputs :::\n" 357 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::${CYAN}\n" 358 | 359 | echo "" 360 | cd ${dir1} 361 | awk '{print $3}' candidate_lncRNA_classes.txt > lncRNA_genes 362 | tail -n +2 lncRNA_genes > lncRNA_transcripts 363 | rm lncRNA_genes 364 | grep -w -F -f lncRNA_transcripts final_annotated.gtf > merged.fixed.lncRNAs.gtf 365 | grep --invert-match -F -f lncRNA_transcripts final_annotated.gtf > merged.fixed.coding.gtf 366 | rm final_annotated.gtf 367 | sed -i 's/StringTie/lncRNA/' merged.fixed.lncRNAs.gtf 368 | awk '{print $1"\t"$2}' transcriptome.hits > coding_list 369 | awk -F'\t' '$2!=""' coding_list > coding_transcripts 370 | awk '{print $1}' coding_transcripts > coding_transcripts.tab 371 | rm coding_lis* coding_transcripts lncRNA_transcripts 372 | grep -w -F -f coding_transcripts.tab merged.fixed.coding.gtf > coding-genes.gtf 373 | grep --invert-match -F -f coding_transcripts.tab merged.fixed.coding.gtf > other-genes.gtf 374 | cat coding-genes.gtf merged.fixed.lncRNAs.gtf other-genes.gtf > final_annotated.gtf 375 | rm coding_transcripts.tab 376 | # sorting GTF file 377 | perl ./gff3sort/gff3sort.pl coding-genes.gtf > coding-genes.sorted.gtf 378 | rm coding-genes.gtf 379 | mv coding-genes.sorted.gtf coding-genes.gtf 380 | echo "All done" 381 | echo "" 382 | ########################################## 383 | # Gene Prediction Step with TransDecoder # 384 | ########################################## 385 | cd ${dir1} 386 | echo "" 387 | 388 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n" 389 | printf "${YELLOW}::: 9. 
Predicting coding regions from transcripts with coding potential using TransDecoder :::\n" 390 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::${CYAN}\n" 391 | 392 | echo "" 393 | echo "" 394 | gffread -w coding-transcripts.fa -g ${g_DIR}/${reference_genome} coding-genes.gtf 395 | TransDecoder.LongOrfs -m 60 -t coding-transcripts.fa 396 | TransDecoder.Predict -t coding-transcripts.fa --single_best_only 397 | awk '{print $1}' coding-transcripts.fa.transdecoder.bed > coding.sequences 398 | tail -n +2 coding.sequences > coding.hits && rm coding.sequences 399 | echo "" 400 | printf "${PURPLE}::: Done. coding-transcripts.fa.transdecoder.gff3 file is present in current directory...${CYAN}\n" 401 | echo "" 402 | 403 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n" 404 | printf "${YELLOW}::: 10. Converting gff3 to GTF format and formatting coding sequences and proteins :::\n" 405 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::${CYAN}\n" 406 | 407 | echo "" 408 | sed 's/Name=.*$//' coding-transcripts.fa.transdecoder.gff3 > coding-transcripts.fa.test.gff3 409 | sed -i 's/ID=GENE[.]/ID=/'g coding-transcripts.fa.test.gff3 410 | sed -i 's/Parent=GENE[.]/Parent=/'g coding-transcripts.fa.test.gff3 411 | sed -i 's/~~/;protein_id=/'g coding-transcripts.fa.test.gff3 412 | gffread coding-transcripts.fa.test.gff3 -T -P -g transcripts.fa -o coding_transcripts.gtf 413 | rm coding-transcripts.fa.test.gff3 414 | # removing protein id by expansion 415 | sed -i 's/[.]p[0-9]//'g coding_transcripts.gtf 416 | sed -i 's/[.]p[0-9][0-9]//'g coding_transcripts.gtf 417 | sed -i 's/[.]p[0-9][0-9][0-9]//'g coding_transcripts.gtf 418 | sed -i 's/[.]p[0-9][0-9][0-9][0-9]//'g coding_transcripts.gtf 419 | sed -i 's/[.]p[0-9][0-9][0-9][0-9][0-9]//'g coding_transcripts.gtf 420 | # 421 | # obtaining cds.fa and prot.fa from coding_transcripts.gtf 422 | echo "" 423 | echo "::: Obtaining cds.fa and prot.fa from coding_transcripts.gtf" 424 | echo "" 425 | gffread -x cds.fa -g transcripts.fa coding_transcripts.gtf 426 | gffread -y prot.fa -g transcripts.fa coding_transcripts.gtf 427 | echo "done" 428 | rm coding-transcripts.fa coding-genes.gtf merged.fixed.lncRNAs.gtf other-genes.gtf 429 | grep "StringTie" final_annotated.gtf > genes.gtf 430 | grep "lncRNA" final_annotated.gtf > lncRNAs.gtf 431 | grep -w -F -f coding.hits genes.gtf > coding-genes.gtf 432 | grep --invert-match -F -f coding.hits genes.gtf > other-genes.gtf 433 | sed -i 's/StringTie/coding/' coding-genes.gtf 434 | cat coding-genes.gtf lncRNAs.gtf other-genes.gtf > final_annotated.gtf 435 | echo "" 436 | echo "::: Parsing transcriptome hits" 437 | echo "" 438 | grep -w -F -f coding.hits transcriptome.swissprot > coding.annotation 439 | rm transcriptome.swissprot 440 | mv coding.annotation transcriptome.swissprot 441 | echo "done" 442 | # sorting GTF file 443 | echo "" 444 | echo "::: Sorting final_annotated.gtf" 445 | echo "" 446 | perl ./gff3sort/gff3sort.pl final_annotated.gtf > final_annotated.sorted.gtf 447 | echo "done" 448 | rm final_annotated.gtf 449 | mv final_annotated.sorted.gtf final_annotated.gtf 450 | rm coding-genes.gtf lncRNAs.gtf other-genes.gtf transcriptome.hits 451 | ### Novel coding genes and correspondent proteins 452 | echo "" 453 | echo "::: Obtaining novel coding transcripts (cds) and correspondent proteins" 454 | echo "" 455 | # 456 | wget 
https://raw.githubusercontent.com/cfarkas/annotate_my_genomes/master/additional_scripts/transcriptome_metrics.sh 457 | bash transcriptome_metrics.sh -f final_annotated.gtf -g ${g_DIR}/${reference_genome} 458 | cp ./transcriptome_metrics/known-genes-coding.gtf ./ 459 | cp ./transcriptome_metrics/novel-genes-coding.gtf ./ 460 | cp ./transcriptome_metrics/novel-transcripts-lncRNA.fa ./ 461 | cp ./transcriptome_metrics/known-transcripts-lncRNA.fa ./ 462 | # 463 | perl -lne 'print "@m" if @m=(/((?:transcript_id|gene_id)\s+\S+)/g);' novel-genes-coding.gtf > novel_annotated.tab 464 | awk '{print $(NF)}' novel_annotated.tab > novel-coding-transcripts.matches 465 | sed -i 's/;//g' novel-coding-transcripts.matches 466 | sed -i 's/"//g' novel-coding-transcripts.matches 467 | awk '!a[$0]++' novel-coding-transcripts.matches > novel-coding-transcripts.tab && rm novel-coding-transcripts.matches 468 | mv novel-coding-transcripts.tab novel-coding-transcripts.matches 469 | # 470 | seqkit fx2tab cds.fa > cds.tab 471 | seqkit fx2tab prot.fa > prot.tab 472 | grep -w -F -f novel-coding-transcripts.matches cds.tab > novel-coding-cds.tab 473 | grep -w -F -f novel-coding-transcripts.matches prot.tab > novel-coding-prot.tab 474 | seqkit tab2fx novel-coding-cds.tab > novel-cds.fa && seqkit tab2fx novel-coding-prot.tab > novel-prot.fa 475 | rm -r -f novel-coding-cds.tab novel-coding-prot.tab novel-coding-transcripts.matches cds.tab prot.tab 476 | # obtaining final gff file 477 | echo "" 478 | echo "::: Obtaining final gff file" 479 | echo "" 480 | gffread -E -F --merge final_annotated.gtf -o final_annotated.gff 481 | rm -r -f gff3sort 482 | echo "done" 483 | echo "" 484 | rm -r -f merged.fixed.coding.gtf namelist namelist_unique_sorted coding.hits 485 | 486 | ############################### 487 | # Configuring Summary Results # 488 | ############################### 489 | 490 | printf "${YELLOW}:::::::::::::::::::::::::::::::::::::::::::::::::::::\n" 491 | printf "${YELLOW}::: 11. 
Moving results to the specified directory :::\n" 492 | printf "${YELLOW}:::::::::::::::::::::::::::::::::::::::::::::::::::::${CYAN}\n" 493 | 494 | echo "" 495 | printf "${PURPLE}::: Moving results to the specified directory :::${CYAN}\n" 496 | rm -r -f output_files 497 | mkdir output_files 498 | mv candidate_lncRNA_classes.txt final_annotated.gtf final_annotated.gff transcripts.fa cds.fa prot.fa coding_transcripts.gtf Stats.txt transcriptome.swissprot novel-cds.fa novel-prot.fa sed.script novel-transcripts-lncRNA.fa known-transcripts-lncRNA.fa known-genes-coding.gtf novel-genes-coding.gtf ./output_files 499 | rm -r -f *feelncfilter.log genes.gtf pipeliner* NM_coding.gtf candidate_lncRNA.gtf* coding-transcripts.fa.transdecoder_dir.__* transcripts.fa.fai 500 | rm -r -f transdecoder 501 | mkdir transdecoder 502 | mv coding-transcripts.fa.transdecoder.* ./transdecoder 503 | mv UCSC_compare.annotated.gtf ./gffcompare_outputs_UCSC 504 | cp ${dir1}/gffcompare_outputs_UCSC/UCSC_compare.${stringtie_input}.tmap ./ 505 | mv UCSC_compare.${stringtie_input}.tmap gffcompare.tmap 506 | mv gffcompare.tmap ./output_files/ 507 | 508 | # cd ${dir0} 509 | # mv annotate_my_genomes_$sec ${o_DIR}/${output_folder} 510 | 511 | cd ${dir0} 512 | 513 | echo "Done" 514 | echo "" 515 | printf "${YELLOW}::: Done :::\n" 516 | echo "" 517 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n" 518 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n" 519 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::${CYAN}\n" 520 | echo "" 521 | echo "The following files are available in ${dir1}/output_files :" 522 | echo "" 523 | echo "Transcript discoveries are summarized in the Stats.txt file. The GAWN protein annotation is kept in the transcriptome.swissprot file" 524 | echo "" 525 | echo "The gffcompare.tmap file contains the best reference transcript for each assembled transcript" 526 | echo "" 527 | echo "The GTF file named final_annotated.gtf (and the corresponding gff file) contains novel genes and the lncRNA classification (second field in the GTF file)" 528 | echo "" 529 | echo "candidate_lncRNA_classes.txt contains the detailed long non-coding classification of transcripts."
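# Illustrative follow-up (not executed by this script): the second GTF field of
# final_annotated.gtf carries the coding/lncRNA classification mentioned above, so
# either class can be pulled out with awk once the run has finished, e.g.:
#   awk -F'\t' '$2 == "lncRNA"' output_files/final_annotated.gtf > lncRNA_records.gtf
#   awk -F'\t' '$2 == "coding"' output_files/final_annotated.gtf > coding_records.gtf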
530 | echo "" 531 | echo "Associated FASTA file to this GTF correspond to transcripts.fa file" 532 | echo "" 533 | echo "TransDecoder GTF file suitable to parse transcripts.fa (coding_transcripts.gtf), contains all coding transcripts resolved by TransDecoder" 534 | echo "" 535 | echo "Predicted coding sequences and correspondent protein sequences were named cds.fa and prot.fa, respectively" 536 | echo "" 537 | echo "Novel predicted coding sequences and correspondent protein sequences were named novel-cds.fa and novel-prot.fa, respectively" 538 | echo "" 539 | echo "Novel and Known predicted lncRNAs were named novel-transcripts-lncRNA.fa and known-transcripts-lncRNA.fa, respectively" 540 | echo "" 541 | echo "Novel and Known coding genes were named novel-genes-coding.gtf and known-genes-coding.gtf, respectively" 542 | echo "" 543 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n" 544 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n" 545 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::${NC}\n" 546 | end=`date +%s` 547 | elapsed=`expr $end - $begin` 548 | echo Time taken: $elapsed 549 | # 550 | } | tee logfile_annotate_my_genomes_${sec} 551 | # 552 | -------------------------------------------------------------------------------- /bash_scripts/genome_download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | genome=${1} 6 | 7 | if [ "$1" == "-h" ]; then 8 | echo "" 9 | echo "Usage: ./`basename $0` [genome]" 10 | echo "" 11 | echo "This program will download and index the specified genome from the uscs server (goldenpath) including annotation" 12 | echo "" 13 | echo "[genome]: UCSC Prefix of the genome assembly. Check names here: https://genome.ucsc.edu/cgi-bin/hgGateway" 14 | echo "" 15 | exit 0 16 | fi 17 | 18 | if [ "$1" == "-help" ]; then 19 | echo "" 20 | echo "Usage: ./`basename $0` [genome]" 21 | echo "" 22 | echo "This program will download and index the specified genome from the uscs server (goldenpath) including annotation" 23 | echo "" 24 | echo "[genome]: UCSC Prefix of the genome assembly. Check names here: https://genome.ucsc.edu/cgi-bin/hgGateway" 25 | echo "" 26 | exit 0 27 | fi 28 | if [ "$1" == "--h" ]; then 29 | echo "" 30 | echo "Usage: ./`basename $0` [genome]" 31 | echo "" 32 | echo "This program will download and index the specified genome from the uscs server (goldenpath) including annotation" 33 | echo "" 34 | echo "[genome]: UCSC Prefix of the genome assembly. Check names here: https://genome.ucsc.edu/cgi-bin/hgGateway" 35 | echo "" 36 | exit 0 37 | fi 38 | 39 | if [ "$1" == "--help" ]; then 40 | echo "" 41 | echo "Usage: ./`basename $0` [genome]" 42 | echo "" 43 | echo "This program will download and index the specified genome from the uscs server (goldenpath) including annotation" 44 | echo "" 45 | echo "[genome]: UCSC Prefix of the genome assembly. 
Check names here: https://genome.ucsc.edu/cgi-bin/hgGateway" 46 | echo "" 47 | exit 0 48 | fi 49 | 50 | [ $# -eq 0 ] && { echo "Usage: ./`basename $0` [genome]"; exit 1; } 51 | 52 | if [ $# -ne 1 ]; then 53 | echo 1>&2 "Usage: ./`basename $0` [genome]" 54 | exit 3 55 | fi 56 | 57 | # Obtaining {genome}.fa genome, and indexing 58 | wget http://hgdownload.cse.ucsc.edu/goldenpath/${genome}/bigZips/${genome}.2bit 59 | wget http://hgdownload.soe.ucsc.edu/goldenPath/${genome}/database/refGene.txt.gz 60 | wget http://hgdownload.soe.ucsc.edu/goldenPath/${genome}/database/ncbiRefSeq.txt.gz 61 | if [ -f twoBitToFa ]; then 62 | echo "twoBitToFa script found. Continue:" 63 | echo "" 64 | : 65 | else 66 | echo "Downloading twoBitToFa script" 67 | wget http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/twoBitToFa 68 | fi 69 | chmod 755 twoBitToFa 70 | ./twoBitToFa ${genome}.2bit ${genome}.fa 71 | samtools faidx ${genome}.fa 72 | 73 | if [ -f genePredToGtf ]; then 74 | echo "genePredToGtf script found. Continue:" 75 | echo "" 76 | : 77 | else 78 | echo "Downloading genePredToGtf script" 79 | wget http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/genePredToGtf 80 | fi 81 | chmod 755 genePredToGtf 82 | gunzip refGene.txt.gz 83 | gunzip ncbiRefSeq.txt.gz 84 | cut -f 2- refGene.txt | ./genePredToGtf file stdin -source=${genome}_Ref ${genome}.gtf 85 | cut -f 2- ncbiRefSeq.txt | ./genePredToGtf file stdin -source=${genome}_Ref ${genome}_ncbiRefSeq.gtf 86 | echo "" 87 | echo "All done. ${genome} FASTA and GTF files are located in the current directory" 88 | -------------------------------------------------------------------------------- /bash_scripts/genome_download_macOSX.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | genome=${1} 6 | 7 | if [ "$1" == "-h" ]; then 8 | echo "" 9 | echo "Usage: ./`basename $0` [genome]" 10 | echo "" 11 | echo "This program will download and index the specified genome from the uscs server (goldenpath) including annotation" 12 | echo "" 13 | echo "[genome]: UCSC Prefix of the genome assembly. Check names here: https://genome.ucsc.edu/cgi-bin/hgGateway" 14 | echo "" 15 | exit 0 16 | fi 17 | 18 | if [ "$1" == "-help" ]; then 19 | echo "" 20 | echo "Usage: ./`basename $0` [genome]" 21 | echo "" 22 | echo "This program will download and index the specified genome from the uscs server (goldenpath) including annotation" 23 | echo "" 24 | echo "[genome]: UCSC Prefix of the genome assembly. Check names here: https://genome.ucsc.edu/cgi-bin/hgGateway" 25 | echo "" 26 | exit 0 27 | fi 28 | if [ "$1" == "--h" ]; then 29 | echo "" 30 | echo "Usage: ./`basename $0` [genome]" 31 | echo "" 32 | echo "This program will download and index the specified genome from the uscs server (goldenpath) including annotation" 33 | echo "" 34 | echo "[genome]: UCSC Prefix of the genome assembly. Check names here: https://genome.ucsc.edu/cgi-bin/hgGateway" 35 | echo "" 36 | exit 0 37 | fi 38 | 39 | if [ "$1" == "--help" ]; then 40 | echo "" 41 | echo "Usage: ./`basename $0` [genome]" 42 | echo "" 43 | echo "This program will download and index the specified genome from the uscs server (goldenpath) including annotation" 44 | echo "" 45 | echo "[genome]: UCSC Prefix of the genome assembly. 
Check names here: https://genome.ucsc.edu/cgi-bin/hgGateway" 46 | echo "" 47 | exit 0 48 | fi 49 | 50 | [ $# -eq 0 ] && { echo "Usage: ./`basename $0` [genome]"; exit 1; } 51 | 52 | if [ $# -ne 1 ]; then 53 | echo 1>&2 "Usage: ./`basename $0` [genome]" 54 | exit 3 55 | fi 56 | 57 | # Obtaining {genome}.fa genome, and indexing 58 | wget http://hgdownload.cse.ucsc.edu/goldenpath/${genome}/bigZips/${genome}.2bit 59 | wget http://hgdownload.soe.ucsc.edu/goldenPath/${genome}/database/refGene.txt.gz 60 | wget http://hgdownload.soe.ucsc.edu/goldenPath/${genome}/database/ncbiRefSeq.txt.gz 61 | if [ -f twoBitToFa ]; then 62 | echo "twoBitToFa script found. Continue:" 63 | echo "" 64 | : 65 | else 66 | echo "Downloading twoBitToFa script" 67 | wget http://hgdownload.cse.ucsc.edu/admin/exe/macOSX.x86_64/twoBitToFa 68 | fi 69 | chmod 755 twoBitToFa 70 | ./twoBitToFa ${genome}.2bit ${genome}.fa 71 | samtools faidx ${genome}.fa 72 | 73 | if [ -f genePredToGtf ]; then 74 | echo "genePredToGtf script found. Continue:" 75 | echo "" 76 | : 77 | else 78 | echo "Downloading genePredToGtf script" 79 | wget http://hgdownload.cse.ucsc.edu/admin/exe/macOSX.x86_64/genePredToGtf 80 | fi 81 | chmod 755 genePredToGtf 82 | gunzip refGene.txt.gz 83 | gunzip ncbiRefSeq.txt.gz 84 | cut -f 2- refGene.txt | ./genePredToGtf file stdin -source=${genome}_Ref ${genome}.gtf 85 | cut -f 2- ncbiRefSeq.txt | ./genePredToGtf file stdin -source=${genome}_Ref ${genome}_ncbiRefSeq.gtf 86 | echo "" 87 | echo "All done. ${genome} FASTA and GTF files are located in the current directory" 88 | -------------------------------------------------------------------------------- /bash_scripts/get_transcripts.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | dir=$(cd -P -- "$(dirname -- "$0")" && pwd -P) 6 | usage="$(basename "$0") [-h] [-f ] [-g ] [-i ] 7 | This program will obtain and align all transcripts coming from a given gene, in order to obtain a consensus. 8 | Arguments: 9 | -h show this help text 10 | -f Name of the StringTie annotated GTF from the pipeline 11 | -g Reference genome (in fasta format) 12 | -i Gene Symbol" 13 | options=':hf:g:i:' 14 | while getopts $options option; do 15 | case "$option" in 16 | h) echo "$usage"; exit;; 17 | f) f=$OPTARG;; 18 | g) g=$OPTARG;; 19 | i) i=$OPTARG;; 20 | :) printf "missing argument for -%s\n" "$OPTARG" >&2; echo "$usage" >&2; exit 1;; 21 | \?) printf "illegal option: -%s\n" "$OPTARG" >&2; echo "$usage" >&2; exit 1;; 22 | esac 23 | done 24 | 25 | # mandatory arguments 26 | if [ ! "$f" ] || [ ! "$g" ] || [ ! "$i" ]; then 27 | echo "arguments -f, -g and -i must be provided" 28 | echo "$usage" >&2; exit 1 29 | fi 30 | 31 | echo "Working in $dir" 32 | echo "" 33 | echo "Obtaining GTF for the given gene_name using final_annotated.gtf file" 34 | grep "\<${i}\>" ${f} > ${i}.gtf 35 | echo "Done." 36 | echo "" 37 | echo "Obtaining gene-associated transcripts in fasta format" 38 | gffread -w ${i}.fa -g ${g} ${i}.gtf 39 | echo "Done." 40 | echo "" 41 | echo "Aligning transcript sequences with Clustal Omega" 42 | clustalo -i ${i}.fa -o ${i}.aln 43 | echo "Done" 44 | echo "" 45 | echo "Obtaining consensus sequence from alignment with EMBOSS consensus" 46 | em_cons -sequence ${i}.aln -outseq ${i}.cons 47 | echo "" 48 | echo "All done. ${i}.cons file contain suitable sequence to be validated for qPCR." 
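# Illustrative invocation (file names and the gene symbol are placeholders only):
#   ./get_transcripts.sh -f final_annotated.gtf -g galGal6.fa -i GAPDH
# This would collect all GAPDH-associated transcripts from final_annotated.gtf,
# align them with Clustal Omega and write the consensus sequence to GAPDH.cons.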
49 | echo "" 50 | echo "${i}.fa contain transcript sequences in fasta format associated with ${i} gene" 51 | -------------------------------------------------------------------------------- /bash_scripts/isoform_identification.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | { 6 | 7 | usage="$(basename "$0") [-h] [-m ] [-t ] [-g ] 8 | This script will produce an annotated csv table of transcripts, by using the tmap output file from add-ncbi-annotation pipeline. 9 | Arguments: 10 | -h show this help text 11 | -m NCBI gffcompare tmap output file. As example: gffcompare.tmap 12 | -t transcripts file, output of add-ncbi-annotation program. As example: NCBI_transcripts.fa 13 | -g UCSC genome name. In example: mm10, galGal6, hg38, rn6." 14 | options=':hm:t:g:' 15 | while getopts $options option; do 16 | case "$option" in 17 | h) echo "$usage"; exit;; 18 | m) m=$OPTARG;; 19 | t) t=$OPTARG;; 20 | g) g=$OPTARG;; 21 | :) printf "missing argument for -%s\n" "$OPTARG" >&2; echo "$usage" >&2; exit 1;; 22 | \?) printf "illegal option: -%s\n" "$OPTARG" >&2; echo "$usage" >&2; exit 1;; 23 | esac 24 | done 25 | 26 | # mandatory arguments 27 | if [ ! "$m" ] || [ ! "$t" ] || [ ! "$g" ]; then 28 | echo "" 29 | echo "arguments -m, -t and -g must be provided" 30 | echo "" 31 | echo "$usage" >&2; exit 1 32 | fi 33 | 34 | # Conditions : Input existance 35 | 36 | if [ ! -e "$m" ]; then 37 | echo "" 38 | echo "$m does not exist. Check your -m input" 39 | echo "" 40 | exit 9999 # die with error code 9999 41 | fi 42 | 43 | if [ ! -e "$t" ]; then 44 | echo "" 45 | echo "$t does not exist. Check your -t input" 46 | echo "" 47 | exit 9999 # die with error code 9999 48 | fi 49 | 50 | # Conditions : Getting absolute path of inputs 51 | echo "" 52 | m_DIR="$( cd "$( dirname "$m" )" && pwd )" 53 | echo "" 54 | echo "::: The absolute path of -m is $m_DIR" 55 | echo "" 56 | t_DIR="$( cd "$( dirname "$t" )" && pwd )" 57 | echo "" 58 | echo "::: The absolute path of -t is $t_DIR" 59 | echo "" 60 | echo "" 61 | printf "${YELLOW}::: Defining Inputs :::\n" 62 | echo "" 63 | FILE1="$m" 64 | basename "$FILE1" 65 | tmap_input="$(basename -- $FILE1)" 66 | echo "The tmap file used as input is the following: $tmap_input" 67 | echo"" 68 | FILE2="$t" 69 | basename "$FILE2" 70 | transcripts_input="$(basename -- $FILE2)" 71 | echo "The transcript file used as input is the following: $transcripts_input" 72 | echo "" 73 | if [ -f ncbiRefSeqLink.txt ]; then 74 | echo "::: ncbiRefSeqLink.txt file found. 
Continue:" 75 | echo "" 76 | : 77 | else 78 | echo "::: Downloading ncbiRefSeqLink.txt file" 79 | wget http://hgdownload.cse.ucsc.edu/goldenpath/${g}/database/ncbiRefSeqLink.txt.gz 80 | gunzip ncbiRefSeqLink.txt.gz 81 | fi 82 | 83 | # Inputs for python 84 | cp ${m_DIR}/${tmap_input} ./stringtie_for_script.tmap 85 | seqkit fx2tab ${t_DIR}/${transcripts_input} > transcripts_Isoform.tab 86 | # Formatting transcripts_Isoform.tab if gene= is present in file 87 | sed -i 's/gene=/\t/'g transcripts_Isoform.tab 88 | awk '{print $1"\t"$NF}' transcripts_Isoform.tab > transcripts_Isoform2.tab 89 | 90 | # Execute gffcompare_parser.py 91 | python << END 92 | 93 | import sys 94 | import pandas as pd 95 | 96 | class bcolors: 97 | HEADER = '\033[95m' 98 | OKBLUE = '\033[94m' 99 | OKCYAN = '\033[96m' 100 | OKGREEN = '\033[92m' 101 | OKRED = '\033[91m' 102 | FAIL = '\033[91m' 103 | ENDC = '\033[0m' 104 | BOLD = '\033[1m' 105 | UNDERLINE = '\033[4m' 106 | 107 | df = pd.read_csv('stringtie_for_script.tmap', sep = '\t') 108 | print(df.sample(10)) 109 | print("Total number of transcripts:", df.shape[0]) 110 | print("") 111 | 112 | df2 = df[~df.ref_id.astype(str).str.contains('-')] 113 | novel_transcripts = df[df.ref_id.astype(str).str.contains('-')] 114 | 115 | df3 = df2[["ref_gene_id", "ref_id", "class_code", "qry_gene_id", "qry_id", "num_exons", "FPKM", "TPM"]] 116 | df_novel_transcripts = novel_transcripts[["ref_gene_id", "ref_id", "class_code", "qry_gene_id", "qry_id", "num_exons", "FPKM", "TPM"]] 117 | 118 | print("Reference transcripts:") 119 | print(df3.sample(10)) 120 | print("") 121 | 122 | print("Novel transcripts:") 123 | print(df_novel_transcripts.sample(10)) 124 | print("") 125 | 126 | colnames=['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18'] 127 | dfA1 = pd.read_csv('ncbiRefSeqLink.txt', sep = '\t', low_memory=False, names=colnames, header=None) 128 | print(dfA1.head(10)) 129 | 130 | dfA2 = dfA1[['0', '1', '2', '3', '5', '14', '16']] 131 | 132 | dfA2 = dfA2.rename(columns={'0': 'ref_id', '1': 'Annotation Status', '2' : 'NCBI RefSeq Gene ID', '3' : 'Transcript Description', '5' : 'NCBI RefSeq Protein ID', '14' : 'Alternative Gene Name', '16' : 'RefSeq Transcript Info'}) 133 | print("ncbiRefSeqLink annotation:") 134 | print(dfA2.sample(10)) 135 | print("") 136 | 137 | colnames = ['qry_id', 'cds_seq', 'none'] 138 | cds = pd.read_csv('transcripts_Isoform2.tab', sep = '\t', names=colnames) 139 | cds2 = cds[["qry_id", "cds_seq"]] 140 | print("transcripts file:") 141 | print(cds2.sample(10)) 142 | print("") 143 | 144 | result1 = pd.merge(df3, dfA2, on='ref_id', how='inner') 145 | result1.sample(10) 146 | result2 = pd.merge(result1, cds2, on='qry_id', how='inner') 147 | result2.sample(10) 148 | result3 = pd.merge(df_novel_transcripts, cds2, on='qry_id', how='inner') 149 | result3.sample(10) 150 | print("Number of Joined Transcripts (reference):", result2.shape[0]) 151 | print("") 152 | print("Number of Joined Transcripts (novel):", result3.shape[0]) 153 | print("") 154 | result2.to_csv('Ref_Transcript_Annotation.csv', index=False) 155 | result3.to_csv('Novel_Transcript_Annotation.csv', index=False) 156 | print(bcolors.OKGREEN + "::: Done. Ref_Transcript_Annotation.csv and Novel_Transcript_Annotation.csv were succesfully produced" + bcolors.ENDC) 157 | print("") 158 | END 159 | 160 | rm -r -f transcripts_Isoform.tab transcripts_Isoform2.tab stringtie_for_script.tmap ncbiRefSeqLink.txt 161 | echo "::: All done. 
:::" 162 | } 163 | -------------------------------------------------------------------------------- /data_examples/transcripts.gtf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cfarkas/annotate_my_genomes/401f5ebae995aed4f07184601edc0c8a368221be/data_examples/transcripts.gtf.gz -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: annotate_my_genomes 2 | channels: 3 | - anaconda 4 | - conda-forge 5 | - bioconda 6 | - defaults 7 | dependencies: 8 | - _libgcc_mutex=0.1=conda_forge 9 | - _openmp_mutex=4.5=1_gnu 10 | - _r-mutex=1.0.1=anacondar_1 11 | - argtable2=2.13=h14c3975_1001 12 | - atk-1.0=2.36.0=h3371d22_4 13 | - bedtools=2.30.0=h468198e_3 14 | - binutils_impl_linux-64=2.36.1=h193b22a_2 15 | - binutils_linux-64=2.36=hf3e587d_7 16 | - bioconductor-seqlogo=1.60.0=r41hdfd78af_0 17 | - bwidget=1.9.14=ha770c72_1 18 | - bzip2=1.0.8=h7f98852_4 19 | - c-ares=1.18.1=h7f98852_0 20 | - ca-certificates=2020.10.14=0 21 | - cairo=1.16.0=ha12eb4b_1010 22 | - certifi=2020.6.20=py36_0 23 | - clustalo=1.2.4=h87f3376_5 24 | - coreutils=9.0=h7f98852_0 25 | - curl=7.82.0=h7bff187_0 26 | - emboss=6.6.0=h5a44aac_5 27 | - expat=2.4.7=h27087fc_0 28 | - fasta_ushuffle=0.2=hec16e2b_4 29 | - feelnc=0.2=pl526_0 30 | - font-ttf-dejavu-sans-mono=2.37=hab24e00_0 31 | - font-ttf-inconsolata=3.000=h77eed37_0 32 | - font-ttf-source-code-pro=2.038=h77eed37_0 33 | - font-ttf-ubuntu=0.83=hab24e00_0 34 | - fontconfig=2.13.96=h8e229c2_2 35 | - fonts-conda-ecosystem=1=0 36 | - fonts-conda-forge=1=0 37 | - freetype=2.10.4=h0708190_1 38 | - fribidi=1.0.10=h36c2ea0_0 39 | - gawk=5.1.0=h7b6447c_0 40 | - gcc_impl_linux-64=9.4.0=h03d3576_13 41 | - gcc_linux-64=9.4.0=h391b98a_7 42 | - gdk-pixbuf=2.42.6=h04a7f16_0 43 | - gettext=0.19.8.1=h73d1719_1008 44 | - gffcompare=0.11.2=h9f5acd7_3 45 | - gffread=0.12.7=hd03093a_1 46 | - gfortran_impl_linux-64=9.4.0=h0003116_13 47 | - gfortran_linux-64=9.4.0=hf0ab688_7 48 | - giflib=5.2.1=h36c2ea0_2 49 | - gmap=2021.08.25=pl5262h36cd882_0 50 | - graphite2=1.3.13=h58526e2_1001 51 | - graphviz=3.0.0=h5abf519_0 52 | - gsl=2.7=he838d99_0 53 | - gtk2=2.24.33=h90689f9_2 54 | - gts=0.7.6=h64030ff_2 55 | - gxx_impl_linux-64=9.4.0=h03d3576_13 56 | - gxx_linux-64=9.4.0=h0316aca_7 57 | - harfbuzz=3.4.0=hb4a5f5f_0 58 | - htslib=1.14=h9753748_2 59 | - icu=69.1=h9c3ff4c_0 60 | - jbig=2.1=h7f98852_2003 61 | - jpeg=9e=h7f98852_0 62 | - k8=0.2.5=hd03093a_2 63 | - kernel-headers_linux-64=2.6.32=he073ed8_15 64 | - keyutils=1.6.1=h166bdaf_0 65 | - kmerinshort=1.0.1=0 66 | - krb5=1.19.2=h3790be6_4 67 | - ld_impl_linux-64=2.36.1=hea4e1c9_2 68 | - lerc=3.0=h9c3ff4c_0 69 | - libblas=3.9.0=13_linux64_openblas 70 | - libcblas=3.9.0=13_linux64_openblas 71 | - libcurl=7.82.0=h7bff187_0 72 | - libdb=6.2.32=h9c3ff4c_0 73 | - libdeflate=1.10=h7f98852_0 74 | - libedit=3.1.20191231=he28a2e2_2 75 | - libev=4.33=h516909a_1 76 | - libffi=3.4.2=h7f98852_5 77 | - libgcc=7.2.0=h69d50b8_2 78 | - libgcc-devel_linux-64=9.4.0=hd854feb_13 79 | - libgcc-ng=11.2.0=h1d223b6_13 80 | - libgd=2.3.3=h283352f_2 81 | - libgfortran-ng=11.2.0=h69a702a_13 82 | - libgfortran5=11.2.0=h5c6108e_13 83 | - libglib=2.70.2=h174f98d_4 84 | - libgomp=11.2.0=h1d223b6_13 85 | - libiconv=1.16=h516909a_0 86 | - liblapack=3.9.0=13_linux64_openblas 87 | - libnghttp2=1.47.0=h727a467_0 88 | - libnsl=2.0.0=h7f98852_0 89 | - libopenblas=0.3.18=pthreads_h8fe5266_0 90 
| - libpng=1.6.37=h21135ba_2 91 | - librsvg=2.52.5=h0a9e6e8_2 92 | - libsanitizer=9.4.0=h79bfe98_13 93 | - libssh2=1.10.0=ha56f1ee_2 94 | - libstdcxx-devel_linux-64=9.4.0=hd854feb_13 95 | - libstdcxx-ng=11.2.0=he4da1e4_13 96 | - libtiff=4.3.0=h542a066_3 97 | - libtool=2.4.6=h9c3ff4c_1008 98 | - libuuid=2.32.1=h7f98852_1000 99 | - libwebp=1.2.2=h3452ae3_0 100 | - libwebp-base=1.2.2=h7f98852_1 101 | - libxcb=1.13=h7f98852_1004 102 | - libxml2=2.9.12=h885dcf4_1 103 | - libxslt=1.1.33=h0ef7038_3 104 | - libzlib=1.2.11=h36c2ea0_1013 105 | - lz4-c=1.9.3=h9c3ff4c_1 106 | - make=4.3=hd18ef5c_1 107 | - minimap2=2.24=h7132678_1 108 | - ncurses=6.2=h58526e2_4 109 | - numpy=1.19.5=py36hfc0c790_2 110 | - openssl=1.1.1l=h7f98852_0 111 | - pandas=1.1.3=py36he6710b0_0 112 | - pango=1.50.5=h4dcc4a0_0 113 | - parallel=20220222=ha770c72_0 114 | - pcre=8.45=h9c3ff4c_0 115 | - pcre2=10.37=h032f7d1_0 116 | - perl=5.26.2=h36c2ea0_1008 117 | - perl-aceperl=1.92=pl526_2 118 | - perl-algorithm-diff=1.1903=pl526_2 119 | - perl-algorithm-munkres=0.08=pl526_1 120 | - perl-apache-test=1.40=pl526_1 121 | - perl-app-cpanminus=1.7044=pl526_1 122 | - perl-appconfig=1.71=pl526_1 123 | - perl-array-compare=3.0.1=pl526_1 124 | - perl-autoloader=5.74=pl526_2 125 | - perl-base=2.23=pl526_1 126 | - perl-bio-asn1-entrezgene=1.73=pl5262hdfd78af_2 127 | - perl-bio-featureio=1.6.905=pl5262hdfd78af_3 128 | - perl-bio-phylo=0.58=pl5262hdfd78af_3 129 | - perl-bio-samtools=1.43=pl526h1341992_1 130 | - perl-bioperl=1.6.924=6 131 | - perl-bioperl-core=1.007002=pl5262hdfd78af_3 132 | - perl-bioperl-run=1.007002=pl5262hdfd78af_5 133 | - perl-business-isbn=3.004=pl526_0 134 | - perl-business-isbn-data=20140910.003=pl526_0 135 | - perl-cache-cache=1.08=pl526_0 136 | - perl-capture-tiny=0.48=pl526_0 137 | - perl-carp=1.38=pl526_3 138 | - perl-cgi=4.44=pl526h14c3975_1 139 | - perl-class-data-inheritable=0.08=pl526_1 140 | - perl-class-inspector=1.34=pl526_0 141 | - perl-class-load=0.25=pl526_0 142 | - perl-class-load-xs=0.10=pl526h6bb024c_2 143 | - perl-class-method-modifiers=2.12=pl526_0 144 | - perl-clone=0.42=pl526h516909a_0 145 | - perl-common-sense=3.74=pl526_2 146 | - perl-compress-raw-zlib=2.087=pl526hc9558a2_0 147 | - perl-constant=1.33=pl526_1 148 | - perl-convert-binary-c=0.78=pl526h6bb024c_3 149 | - perl-convert-binhex=1.125=pl526_1 150 | - perl-crypt-rc4=2.02=pl526_1 151 | - perl-data-dumper=2.173=pl526_0 152 | - perl-data-optlist=0.110=pl526_2 153 | - perl-data-stag=0.14=pl526_1 154 | - perl-date-format=2.30=pl526_2 155 | - perl-db-file=1.855=pl526h516909a_0 156 | - perl-dbd-sqlite=1.64=pl526h516909a_0 157 | - perl-dbi=1.642=pl526_0 158 | - perl-devel-globaldestruction=0.14=pl526_0 159 | - perl-devel-overloadinfo=0.005=pl526_0 160 | - perl-devel-stacktrace=2.04=pl526_0 161 | - perl-digest-hmac=1.03=pl526_3 162 | - perl-digest-md5=2.55=pl526_0 163 | - perl-digest-perl-md5=1.9=pl526_1 164 | - perl-digest-sha1=2.13=pl526h6bb024c_1 165 | - perl-dist-checkconflicts=0.11=pl526_2 166 | - perl-dynaloader=1.25=pl526_1 167 | - perl-email-date-format=1.005=pl526_2 168 | - perl-encode=2.88=pl526_1 169 | - perl-encode-locale=1.05=pl526_6 170 | - perl-error=0.17027=pl526_1 171 | - perl-eval-closure=0.14=pl526h6bb024c_4 172 | - perl-exception-class=1.44=pl526_0 173 | - perl-exporter=5.72=pl526_1 174 | - perl-exporter-tiny=1.002001=pl526_0 175 | - perl-extutils-makemaker=7.36=pl526_1 176 | - perl-file-listing=6.04=pl526_1 177 | - perl-file-path=2.16=pl526_0 178 | - perl-file-slurp-tiny=0.004=pl526_1 179 | - perl-file-sort=1.01=pl526_2 180 | - 
perl-file-temp=0.2304=pl526_2 181 | - perl-file-which=1.23=pl526_0 182 | - perl-font-afm=1.20=pl526_2 183 | - perl-font-ttf=1.06=pl526_0 184 | - perl-gd=2.68=pl526he941832_0 185 | - perl-getopt-long=2.50=pl526_1 186 | - perl-graph=0.9704=pl526_1 187 | - perl-graphviz=2.20=1 188 | - perl-html-element-extended=1.18=pl526_1 189 | - perl-html-entities-numbered=0.04=pl526_1 190 | - perl-html-formatter=2.16=pl526_0 191 | - perl-html-parser=3.72=pl526h6bb024c_5 192 | - perl-html-tableextract=2.13=pl526_2 193 | - perl-html-tagset=3.20=pl526_3 194 | - perl-html-tidy=1.60=pl526_0 195 | - perl-html-tree=5.07=pl526_1 196 | - perl-html-treebuilder-xpath=0.14=pl526_1 197 | - perl-http-cookies=6.04=pl526_0 198 | - perl-http-daemon=6.01=pl526_1 199 | - perl-http-date=6.02=pl526_3 200 | - perl-http-message=6.18=pl526_0 201 | - perl-http-negotiate=6.01=pl526_3 202 | - perl-image-info=1.38=pl526_1 203 | - perl-image-size=3.300=pl526_2 204 | - perl-io-html=1.001=pl526_2 205 | - perl-io-sessiondata=1.03=pl526_1 206 | - perl-io-socket-ssl=2.066=pl526_0 207 | - perl-io-string=1.08=pl526_3 208 | - perl-io-stringy=2.111=pl526_1 209 | - perl-io-tty=1.12=pl526_1 210 | - perl-ipc-run=20180523.0=pl526_0 211 | - perl-ipc-sharelite=0.17=pl526h6bb024c_1 212 | - perl-jcode=2.07=pl526_2 213 | - perl-json=4.02=pl526_0 214 | - perl-json-xs=2.34=pl526h6bb024c_3 215 | - perl-libwww-perl=6.39=pl526_0 216 | - perl-libxml-perl=0.08=pl526_2 217 | - perl-list-moreutils=0.428=pl526_1 218 | - perl-list-moreutils-xs=0.428=pl526_0 219 | - perl-local-lib=2.000024=pl526_0 220 | - perl-lwp-mediatypes=6.04=pl526_0 221 | - perl-lwp-protocol-https=6.07=pl526_4 222 | - perl-lwp-simple=6.15=pl526h470a237_4 223 | - perl-mailtools=2.21=pl526_0 224 | - perl-math-cdf=0.1=pl526h14c3975_5 225 | - perl-math-derivative=1.01=pl526_0 226 | - perl-math-random=0.72=pl526h14c3975_2 227 | - perl-math-spline=0.02=pl526_2 228 | - perl-mime-base64=3.15=pl526_1 229 | - perl-mime-lite=3.030=pl526_1 230 | - perl-mime-tools=5.508=pl526_1 231 | - perl-mime-types=2.17=pl526_0 232 | - perl-mldbm=2.05=pl526_1 233 | - perl-module-build=0.4224=pl526h470a237_1 234 | - perl-module-implementation=0.09=pl526_2 235 | - perl-module-runtime=0.016=pl526_1 236 | - perl-module-runtime-conflicts=0.003=pl526_0 237 | - perl-moo=2.003004=pl526_0 238 | - perl-moose=2.2011=pl526hf484d3e_1 239 | - perl-mozilla-ca=20180117=pl526_1 240 | - perl-mro-compat=0.13=pl526_0 241 | - perl-net-http=6.19=pl526_0 242 | - perl-net-ssleay=1.88=pl526h90d6eec_0 243 | - perl-ntlm=1.09=pl526_4 244 | - perl-ole-storage_lite=0.19=pl526_3 245 | - perl-package-deprecationmanager=0.17=pl526_0 246 | - perl-package-stash=0.38=pl526hf484d3e_1 247 | - perl-package-stash-xs=0.28=pl526hf484d3e_1 248 | - perl-parallel-forkmanager=2.02=pl526_0 249 | - perl-params-util=1.07=pl526h6bb024c_4 250 | - perl-parent=0.236=pl526_1 251 | - perl-parse-recdescent=1.967015=pl526_0 252 | - perl-pathtools=3.75=pl526h14c3975_1 253 | - perl-pdf-api2=2.035=pl526_0 254 | - perl-postscript=0.06=pl526_2 255 | - perl-role-tiny=2.000008=pl526_0 256 | - perl-scalar-list-utils=1.52=pl526h516909a_0 257 | - perl-set-scalar=1.29=pl526_2 258 | - perl-soap-lite=1.19=pl526_1 259 | - perl-socket=2.027=pl526_1 260 | - perl-sort-naturally=1.03=pl526_2 261 | - perl-spreadsheet-parseexcel=0.65=pl526_2 262 | - perl-spreadsheet-writeexcel=2.40=pl526_2 263 | - perl-statistics-descriptive=3.0702=pl526_0 264 | - perl-storable=3.15=pl526h14c3975_0 265 | - perl-sub-exporter=0.987=pl526_2 266 | - perl-sub-exporter-progressive=0.001013=pl526_0 267 | - 
perl-sub-identify=0.14=pl526h14c3975_0 268 | - perl-sub-install=0.928=pl526_2 269 | - perl-sub-name=0.21=pl526_1 270 | - perl-sub-quote=2.006003=pl526_1 271 | - perl-sub-uplevel=0.2800=pl526h14c3975_2 272 | - perl-svg=2.84=pl526_0 273 | - perl-svg-graph=0.02=pl526_3 274 | - perl-task-weaken=1.06=pl526_0 275 | - perl-template-toolkit=2.26=pl526_1 276 | - perl-test-deep=1.128=pl526_1 277 | - perl-test-differences=0.67=pl526_0 278 | - perl-test-exception=0.43=pl526_2 279 | - perl-test-leaktrace=0.16=pl526h14c3975_2 280 | - perl-test-most=0.35=pl526_0 281 | - perl-test-pod=1.52=pl526_0 282 | - perl-test-requiresinternet=0.05=pl526_0 283 | - perl-test-warn=0.36=pl526_1 284 | - perl-text-diff=1.45=pl526_0 285 | - perl-threaded=5.32.1=hdfd78af_1 286 | - perl-tie-ixhash=1.23=pl526_2 287 | - perl-time-local=1.28=pl526_1 288 | - perl-timedate=2.30=pl526_1 289 | - perl-tree-dag_node=1.31=pl526_0 290 | - perl-try-tiny=0.30=pl526_1 291 | - perl-type-tiny=1.004004=pl526_0 292 | - perl-types-serialiser=1.0=pl526_2 293 | - perl-unicode-map=0.112=pl526h6bb024c_3 294 | - perl-uri=1.76=pl526_0 295 | - perl-www-robotrules=6.02=pl526_3 296 | - perl-xml-dom=1.46=pl526_0 297 | - perl-xml-dom-xpath=0.14=pl526_1 298 | - perl-xml-filter-buffertext=1.01=pl526_2 299 | - perl-xml-libxml=2.0132=pl526h7ec2d77_1 300 | - perl-xml-libxslt=1.94=pl526_1 301 | - perl-xml-namespacesupport=1.12=pl526_0 302 | - perl-xml-parser=2.44_01=pl5262hc3e0081_1002 303 | - perl-xml-regexp=0.04=pl526_2 304 | - perl-xml-sax=1.02=pl526_0 305 | - perl-xml-sax-base=1.09=pl526_0 306 | - perl-xml-sax-expat=0.51=pl526_3 307 | - perl-xml-sax-writer=0.57=pl526_0 308 | - perl-xml-simple=2.25=pl526_1 309 | - perl-xml-twig=3.52=pl526_2 310 | - perl-xml-writer=0.625=pl526_2 311 | - perl-xml-xpath=1.44=pl526_0 312 | - perl-xml-xpathengine=0.14=pl526_2 313 | - perl-xsloader=0.24=pl526_0 314 | - perl-yaml=1.29=pl526_0 315 | - pip=20.0.2=py36_1 316 | - pixman=0.40.0=h36c2ea0_0 317 | - pthread-stubs=0.4=h36c2ea0_1001 318 | - python=3.6.15=hb7a2778_0_cpython 319 | - python-dateutil=2.8.1=py_0 320 | - python_abi=3.6=2_cp36m 321 | - pytz=2020.1=py_0 322 | - r-assertthat=0.2.1=r41hc72bb7e_2 323 | - r-backports=1.4.1=r41hcfec24a_0 324 | - r-base=4.1.2=h2553ce4_1 325 | - r-bitops=1.0_7=r41hcfec24a_0 326 | - r-brio=1.1.3=r41hcfec24a_0 327 | - r-callr=3.7.0=r41hc72bb7e_0 328 | - r-catools=1.18.2=r41h03ef668_0 329 | - r-cli=3.2.0=r41h03ef668_0 330 | - r-colorspace=2.0_3=r41h06615bd_0 331 | - r-crayon=1.5.0=r41hc72bb7e_0 332 | - r-desc=1.4.0=r41hc72bb7e_0 333 | - r-diffobj=0.3.5=r41hcfec24a_0 334 | - r-digest=0.6.29=r41h03ef668_0 335 | - r-ellipsis=0.3.2=r41hcfec24a_0 336 | - r-evaluate=0.15=r41hc72bb7e_0 337 | - r-fansi=1.0.2=r41hcfec24a_0 338 | - r-farver=2.1.0=r41h03ef668_0 339 | - r-ggplot2=3.3.5=r41hc72bb7e_0 340 | - r-glue=1.6.2=r41h06615bd_0 341 | - r-gplots=3.1.1=r41hc72bb7e_0 342 | - r-gtable=0.3.0=r41hc72bb7e_3 343 | - r-gtools=3.9.2=r41hcfec24a_0 344 | - r-isoband=0.2.5=r41h03ef668_0 345 | - r-jsonlite=1.8.0=r41h06615bd_0 346 | - r-kernsmooth=2.23_20=r41h742201e_0 347 | - r-labeling=0.4.2=r41hc72bb7e_1 348 | - r-lattice=0.20_45=r41hcfec24a_0 349 | - r-lifecycle=1.0.1=r41hc72bb7e_0 350 | - r-magrittr=2.0.2=r41hcfec24a_0 351 | - r-mass=7.3_55=r41hcfec24a_0 352 | - r-matrix=1.4_0=r41he454529_0 353 | - r-mgcv=1.8_39=r41h0154571_0 354 | - r-munsell=0.5.0=r41hc72bb7e_1004 355 | - r-nlme=3.1_155=r41h859d828_0 356 | - r-pillar=1.7.0=r41hc72bb7e_0 357 | - r-pkgconfig=2.0.3=r41hc72bb7e_1 358 | - r-pkgload=1.2.4=r41h03ef668_0 359 | - r-praise=1.0.0=r41hc72bb7e_1005 
360 | - r-processx=3.5.2=r41hcfec24a_0 361 | - r-ps=1.6.0=r41hcfec24a_0 362 | - r-r6=2.5.1=r41hc72bb7e_0 363 | - r-randomforest=4.6_14=r41h859d828_1004 364 | - r-rcolorbrewer=1.1_2=r41h785f33e_1003 365 | - r-rcpp=1.0.8=r41h03ef668_0 366 | - r-rematch2=2.1.2=r41hc72bb7e_1 367 | - r-rlang=0.4.12=r41hcfec24a_0 368 | - r-rocr=1.0_11=r41hc72bb7e_1 369 | - r-rprojroot=2.0.2=r41hc72bb7e_0 370 | - r-rstudioapi=0.13=r41hc72bb7e_0 371 | - r-scales=1.1.1=r41hc72bb7e_0 372 | - r-testthat=3.1.2=r41h03ef668_0 373 | - r-tibble=3.1.6=r41hcfec24a_0 374 | - r-utf8=1.2.2=r41hcfec24a_0 375 | - r-vctrs=0.3.8=r41hcfec24a_1 376 | - r-viridislite=0.4.0=r41hc72bb7e_0 377 | - r-waldo=0.3.1=r41hc72bb7e_0 378 | - r-withr=2.5.0=r41hc72bb7e_0 379 | - readline=8.1=h46c0cb4_0 380 | - seqkit=2.1.0=h9ee0642_0 381 | - setuptools=49.6.0=py36h5fab9bb_3 382 | - six=1.15.0=py_0 383 | - sqlite=3.37.0=h9cd32fc_0 384 | - stringtie=2.2.1=hecb563c_2 385 | - sysroot_linux-64=2.12=he073ed8_15 386 | - tidyp=1.04=hec16e2b_4 387 | - tk=8.6.12=h27826a3_0 388 | - tktable=2.10=hb7b940f_3 389 | - transdecoder=5.5.0=pl5262hdfd78af_4 390 | - wheel=0.37.1=pyhd8ed1ab_0 391 | - xorg-kbproto=1.0.7=h7f98852_1002 392 | - xorg-libice=1.0.10=h7f98852_0 393 | - xorg-libsm=1.2.3=hd9c2040_1000 394 | - xorg-libx11=1.7.2=h7f98852_0 395 | - xorg-libxau=1.0.9=h7f98852_0 396 | - xorg-libxdmcp=1.1.3=h7f98852_0 397 | - xorg-libxext=1.3.4=h7f98852_1 398 | - xorg-libxrender=0.9.10=h7f98852_1003 399 | - xorg-libxt=1.2.1=h7f98852_2 400 | - xorg-renderproto=0.11.1=h7f98852_1002 401 | - xorg-xextproto=7.3.0=h7f98852_1002 402 | - xorg-xproto=7.0.31=h7f98852_1007 403 | - xz=5.2.5=h516909a_1 404 | - zlib=1.2.11=h36c2ea0_1013 405 | - zstd=1.5.2=ha95c52a_0 406 | -------------------------------------------------------------------------------- /makefile.nf: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env nextflow 2 | 3 | /* 4 | * pipeline input parameters 5 | */ 6 | 7 | params.workdir = './' 8 | params.conda = './environment.yml' 9 | 10 | println """\ 11 | M A K E F I L E 12 | ======================================= 13 | working_directory : ${params.workdir} 14 | environment : ${params.conda} 15 | """ 16 | .stripIndent() 17 | 18 | 19 | process make_and_install { 20 | echo true 21 | stageInMode 'copy' 22 | conda "${params.conda}" 23 | 24 | shell: 25 | ''' 26 | cd "!{params.workdir}" 27 | dir=$(cd -P -- "$(dirname -- "$0")" && pwd -P) 28 | rm -r -f swissprot 29 | mkdir swissprot 30 | cd swissprot 31 | wget --ignore-length ftp://ftp.ncbi.nlm.nih.gov/blast/db/swissprot.tar.gz 32 | gunzip swissprot.tar.gz 33 | tar -xvf swissprot.tar 34 | SWISSPROT_PATH=$PWD 35 | echo "$SWISSPROT_PATH" 36 | cd "!{params.workdir}"/test/ 37 | exec 3<> gawn_config.sh 38 | #!/bin/bash >> gawn_config.sh 39 | echo "" >> gawn_config.sh 40 | # Modify the following parameter values according to your experiment >> gawn_config.sh 41 | # Do not modify the parameter names or remove parameters >> gawn_config.sh 42 | # Do not add spaces around the equal (=) sign >> gawn_config.sh 43 | echo "" >> gawn_config.sh 44 | # Global parameters >> gawn_config.sh 45 | NCPUS=10 # Number of CPUs to use for analyses (int, 1+) >> gawn_config.sh 46 | echo "" >> gawn_config.sh 47 | # Genome indexing >> gawn_config.sh 48 | SKIP_GENOME_INDEXING=1 # 1 to skip genome indexing, 0 to index it >> gawn_config.sh 49 | echo "" >> gawn_config.sh 50 | # Genome annotation with transcriptome >> gawn_config.sh 51 | # NOTE: do not use compressed fasta files >> gawn_config.sh 52 | 
GENOME_NAME="genome.fasta" # Name of genome fasta file found in 03_data >> gawn_config.sh 53 | TRANSCRIPTOME_NAME="transcriptome.fasta" # Name of transcriptome fasta file found in 03_data >> gawn_config.sh 54 | echo "" >> gawn_config.sh 55 | # Path to swissprot database >> gawn_config.sh 56 | echo 'SWISSPROT_DB="'$SWISSPROT_PATH'/swissprot"' >> gawn_config.sh 57 | echo '#' >> gawn_config.sh 58 | exec 3>&- 59 | cd "!{params.workdir}" 60 | mkdir bin 61 | mkdir genome_1 62 | mkdir get_transcripts 63 | cp "!{params.workdir}"/test/gawn_config.sh "!{params.workdir}"/genome_1/ 64 | cp "!{params.workdir}"/test/gawn_config.sh "!{params.workdir}"/bash_scripts/ 65 | cp "!{params.workdir}"/test/gawn_config.sh "!{params.workdir}" 66 | cd "!{params.workdir}" 67 | git clone https://github.com/cfarkas/shc.git 68 | cd shc/ 69 | ./autogen.sh 70 | ./configure 71 | make 72 | # Install 73 | cd "!{params.workdir}" 74 | "!{params.workdir}"/shc/src/shc -f "!{params.workdir}"/bash_scripts/annotate_my_genomes.sh -o ./annotate-my-genomes 75 | "!{params.workdir}"/shc/src/shc -f "!{params.workdir}"/bash_scripts/annotate_my_genomes.sh -o ./annotate-my-genomes 76 | "!{params.workdir}"/shc/src/shc -f "!{params.workdir}"/bash_scripts/get_transcripts.sh -o ./get-transcripts 77 | "!{params.workdir}"/shc/src/shc -f "!{params.workdir}"/bash_scripts/genome_download.sh -o ./genome-download 78 | "!{params.workdir}"/shc/src/shc -f "!{params.workdir}"/bash_scripts/genome_download_macOSX.sh -o ./genome-download-macOSX 79 | "!{params.workdir}"/shc/src/shc -f "!{params.workdir}"/bash_scripts/add_ncbi_annotation.sh -o ./add-ncbi-annotation 80 | "!{params.workdir}"/shc/src/shc -f "!{params.workdir}"/bash_scripts/isoform_identification.sh -o ./isoform-identification 81 | mv "!{params.workdir}"/annotate-my-genomes "!{params.workdir}"/get-transcripts "!{params.workdir}"/genome-download "!{params.workdir}"/genome-download-macOSX "!{params.workdir}"/add-ncbi-annotation "!{params.workdir}"/isoform-identification "!{params.workdir}"/bin/ 82 | cp "!{params.workdir}"/bin/annotate-my-genomes "!{params.workdir}"/test/ 83 | cp "!{params.workdir}"/bin/annotate-my-genomes "!{params.workdir}"/genome_1/ 84 | cp "!{params.workdir}"/bin/genome-download "!{params.workdir}"/test/ 85 | cp "!{params.workdir}"/bin/genome-download "!{params.workdir}"/genome_1/ 86 | cp "!{params.workdir}"/bin/genome-download-macOSX "!{params.workdir}"/test/ 87 | cp "!{params.workdir}"/bin/genome-download-macOSX "!{params.workdir}"/genome_1/ 88 | cp "!{params.workdir}"/bin/add-ncbi-annotation "!{params.workdir}"/test/ 89 | cp "!{params.workdir}"/bin/add-ncbi-annotation "!{params.workdir}"/genome_1/ 90 | cp "!{params.workdir}"/bin/isoform-identification "!{params.workdir}"/test/ 91 | cp "!{params.workdir}"/bin/isoform-identification "!{params.workdir}"/genome_1/ 92 | cp "!{params.workdir}"/bin/get-transcripts "!{params.workdir}"/get_transcripts/ 93 | cp "!{params.workdir}"/bin/genome-download "!{params.workdir}"/get_transcripts/ 94 | echo "" 95 | echo "::: All done. Binaries are located in "!{params.workdir}"/bin/ folder. 
:::" 96 | echo "" 97 | echo "::: With sudo privileges, users can do : sudo cp ./bin/* /usr/local/bin/ :::" 98 | echo "" 99 | echo "" 100 | ''' 101 | } 102 | -------------------------------------------------------------------------------- /makefile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | dir=$(cd -P -- "$(dirname -- "$0")" && pwd -P) 4 | rm -r -f swissprot 5 | mkdir swissprot 6 | cd swissprot 7 | wget ftp://ftp.ncbi.nlm.nih.gov/blast/db/swissprot.tar.gz 8 | gunzip swissprot.tar.gz 9 | tar -xvf swissprot.tar 10 | SWISSPROT_PATH=$PWD 11 | echo "$SWISSPROT_PATH" 12 | cd .. 13 | cd test 14 | echo 'SWISSPROT_DB="'$SWISSPROT_PATH'/swissprot"' >> gawn_config.sh 15 | echo '#' >> gawn_config.sh 16 | cd .. 17 | mkdir bin 18 | mkdir genome_1 19 | mkdir get_transcripts 20 | cp ./test/gawn_config.sh ./genome_1/ 21 | cp ./test/gawn_config.sh ./bash_scripts/ 22 | cp ./test/gawn_config.sh ./ 23 | git clone https://github.com/cfarkas/shc.git 24 | cd shc/ 25 | ./autogen.sh 26 | ./configure 27 | make 28 | cd .. 29 | echo "" 30 | echo "make done. Continue with install" 31 | # Install 32 | ./shc/src/shc -f ./bash_scripts/annotate_my_genomes.sh -o ./annotate-my-genomes 33 | ./shc/src/shc -f ./bash_scripts/get_transcripts.sh -o ./get-transcripts 34 | ./shc/src/shc -f ./bash_scripts/genome_download.sh -o ./genome-download 35 | ./shc/src/shc -f ./bash_scripts/genome_download_macOSX.sh -o ./genome-download-macOSX 36 | ./shc/src/shc -f ./bash_scripts/add_ncbi_annotation.sh -o ./add-ncbi-annotation 37 | ./shc/src/shc -f ./bash_scripts/isoform_identification.sh -o ./isoform-identification 38 | mv annotate-my-genomes get-transcripts genome-download genome-download-macOSX add-ncbi-annotation isoform-identification ./bin/ 39 | cp ./bin/annotate-my-genomes ./test/ 40 | cp ./bin/annotate-my-genomes ./genome_1/ 41 | cp ./bin/genome-download ./test/ 42 | cp ./bin/genome-download ./genome_1/ 43 | cp ./bin/genome-download-macOSX ./test/ 44 | cp ./bin/genome-download-macOSX ./genome_1/ 45 | cp ./bin/add-ncbi-annotation ./test/ 46 | cp ./bin/add-ncbi-annotation ./genome_1/ 47 | cp ./bin/isoform-identification ./test/ 48 | cp ./bin/isoform-identification ./genome_1/ 49 | cp ./bin/get-transcripts ./get_transcripts/ 50 | cp ./bin/genome-download ./get_transcripts/ 51 | echo "::: All done. Binaries are located in ./bin/ folder. 
:::" 52 | echo "" 53 | echo "::: With sudo privileges, users can do : sudo cp ./bin/* /usr/local/bin/ :::" 54 | echo "" 55 | # 56 | -------------------------------------------------------------------------------- /nextflow_scripts/22.04_environment.yml: -------------------------------------------------------------------------------- 1 | name: annotate_my_genomes 2 | channels: 3 | - anaconda 4 | - conda-forge 5 | - bioconda 6 | - defaults 7 | - r 8 | dependencies: 9 | - _libgcc_mutex=0.1 10 | - _openmp_mutex=4.5 11 | - _r-mutex=1.0.1 12 | - argtable2=2.13 13 | - atk-1.0=2.36.0 14 | - bedtools=2.30.0 15 | - binutils_impl_linux-64=2.39 16 | - binutils_linux-64=2.39 17 | - bottleneck=1.3.5 18 | - bwidget=1.9.14 19 | - bzip2=1.0.8 20 | - c-ares=1.18.1 21 | - ca-certificates=2022.07.19 22 | - cairo=1.16.0 23 | - clustalo=1.2.4 24 | - clustalw=2.1 25 | - coreutils=9.1 26 | - curl=7.83.1 27 | - emboss=6.6.0 28 | - expat=2.5.0 29 | - fasta_ushuffle=0.2 30 | - feelnc=0.2 31 | - font-ttf-dejavu-sans-mono=2.37 32 | - font-ttf-inconsolata=3.000 33 | - font-ttf-source-code-pro=2.038 34 | - font-ttf-ubuntu=0.83 35 | - fontconfig=2.14.0 36 | - fonts-conda-ecosystem=1 37 | - fonts-conda-forge=1 38 | - freetype=2.10.4 39 | - fribidi=1.0.10 40 | - gawk=5.1.0 41 | - gcc_impl_linux-64=10.4.0 42 | - gcc_linux-64=10.4.0 43 | - gdk-pixbuf=2.42.8 44 | - gettext=0.21.1 45 | - gffcompare=0.11.2 46 | - gffread=0.12.7 47 | - gfortran_impl_linux-64=10.4.0 48 | - gfortran_linux-64=10.4.0 49 | - giflib=5.2.1 50 | - gmap=2021.08.25 51 | - graphite2=1.3.13 52 | - graphviz=2.50.0 53 | - gsl=2.7 54 | - gtk2=2.24.33 55 | - gts=0.7.6 56 | - gxx_impl_linux-64=10.4.0 57 | - gxx_linux-64=10.4.0 58 | - harfbuzz=4.4.1 59 | - htslib=1.14 60 | - icu=70.1 61 | - jpeg=9e 62 | - k8=0.2.5 63 | - kernel-headers_linux-64=2.6.32 64 | - keyutils=1.6.1 65 | - kmerinshort=1.0.1 66 | - krb5=1.19.3 67 | - ld_impl_linux-64=2.39 68 | - lerc=3.0 69 | - libblas=3.9.0 70 | - libcblas=3.9.0 71 | - libcurl=7.83.1 72 | - libdb=6.2.32 73 | - libdeflate=1.10 74 | - libedit=3.1.20191231 75 | - libev=4.33 76 | - libffi=3.4.2 77 | - libgcc=7.2.0 78 | - libgcc-devel_linux-64=10.4.0 79 | - libgcc-ng=12.2.0 80 | - libgd=2.3.3 81 | - libgfortran-ng=12.2.0 82 | - libgfortran5=12.2.0 83 | - libglib=2.70.2 84 | - libgomp=12.2.0 85 | - libiconv=1.17 86 | - liblapack=3.9.0 87 | - libnghttp2=1.47.0 88 | - libnsl=2.0.0 89 | - libopenblas=0.3.21 90 | - libpng=1.6.37 91 | - librsvg=2.54.4 92 | - libsanitizer=10.4.0 93 | - libssh2=1.10.0 94 | - libstdcxx-devel_linux-64=10.4.0 95 | - libstdcxx-ng=12.2.0 96 | - libtiff=4.3.0 97 | - libtool=2.4.6 98 | - libuuid=2.32.1 99 | - libwebp=1.2.2 100 | - libwebp-base=1.2.2 101 | - libxcb=1.13 102 | - libxml2=2.9.14 103 | - libxslt=1.1.33 104 | - libzlib=1.2.11 105 | - lz4-c=1.9.3 106 | - make=4.3 107 | - minimap2=2.24 108 | - ncurses=6.3 109 | - nomkl=1.0 110 | - numexpr=2.8.3 111 | - numpy=1.21.6 112 | - openssl=1.1.1q 113 | - packaging=21.3 114 | - paml=4.9 115 | - pandas=1.3.5 116 | - pango=1.50.8 117 | - parallel=20220922 118 | - pcre=8.45 119 | - pcre2=10.37 120 | - perl=5.26.2 121 | - perl-aceperl=1.92 122 | - perl-algorithm-diff=1.1903 123 | - perl-algorithm-munkres=0.08 124 | - perl-apache-test=1.40 125 | - perl-app-cpanminus=1.7044 126 | - perl-appconfig=1.71 127 | - perl-array-compare=3.0.1 128 | - perl-autoloader=5.74 129 | - perl-base=2.23 130 | - perl-bio-asn1-entrezgene=1.73 131 | - perl-bio-coordinate=1.007001 132 | - perl-bio-featureio=1.6.905 133 | - perl-bio-phylo=0.58 134 | - perl-bio-samtools=1.43 135 | - 
perl-bio-tools-phylo-paml=1.7.3 136 | - perl-bio-tools-run-alignment-clustalw=1.7.4 137 | - perl-bio-tools-run-alignment-tcoffee=1.7.4 138 | - perl-bioperl=1.7.2 139 | - perl-bioperl-core=1.007002 140 | - perl-bioperl-run=1.007002 141 | - perl-business-isbn=3.004 142 | - perl-business-isbn-data=20140910.003 143 | - perl-cache-cache=1.08 144 | - perl-capture-tiny=0.48 145 | - perl-carp=1.38 146 | - perl-cgi=4.44 147 | - perl-class-data-inheritable=0.08 148 | - perl-class-inspector=1.34 149 | - perl-class-load=0.25 150 | - perl-class-load-xs=0.10 151 | - perl-class-method-modifiers=2.12 152 | - perl-clone=0.42 153 | - perl-common-sense=3.74 154 | - perl-compress-raw-zlib=2.087 155 | - perl-constant=1.33 156 | - perl-convert-binary-c=0.78 157 | - perl-convert-binhex=1.125 158 | - perl-crypt-rc4=2.02 159 | - perl-data-dumper=2.173 160 | - perl-data-optlist=0.110 161 | - perl-data-stag=0.14 162 | - perl-date-format=2.30 163 | - perl-db-file=1.855 164 | - perl-dbd-sqlite=1.64 165 | - perl-dbi=1.642 166 | - perl-devel-globaldestruction=0.14 167 | - perl-devel-overloadinfo=0.005 168 | - perl-devel-stacktrace=2.04 169 | - perl-digest-hmac=1.03 170 | - perl-digest-md5=2.55 171 | - perl-digest-perl-md5=1.9 172 | - perl-digest-sha1=2.13 173 | - perl-dist-checkconflicts=0.11 174 | - perl-dynaloader=1.25 175 | - perl-email-date-format=1.005 176 | - perl-encode=2.88 177 | - perl-encode-locale=1.05 178 | - perl-error=0.17027 179 | - perl-eval-closure=0.14 180 | - perl-exception-class=1.44 181 | - perl-exporter=5.72 182 | - perl-exporter-tiny=1.002001 183 | - perl-extutils-makemaker=7.36 184 | - perl-file-listing=6.04 185 | - perl-file-path=2.16 186 | - perl-file-slurp-tiny=0.004 187 | - perl-file-sort=1.01 188 | - perl-file-temp=0.2304 189 | - perl-file-which=1.23 190 | - perl-font-afm=1.20 191 | - perl-font-ttf=1.06 192 | - perl-gd=2.68 193 | - perl-getopt-long=2.50 194 | - perl-graph=0.9704 195 | - perl-graphviz=2.24 196 | - perl-html-element-extended=1.18 197 | - perl-html-entities-numbered=0.04 198 | - perl-html-formatter=2.16 199 | - perl-html-parser=3.72 200 | - perl-html-tableextract=2.13 201 | - perl-html-tagset=3.20 202 | - perl-html-tidy=1.60 203 | - perl-html-tree=5.07 204 | - perl-html-treebuilder-xpath=0.14 205 | - perl-http-cookies=6.04 206 | - perl-http-daemon=6.01 207 | - perl-http-date=6.02 208 | - perl-http-message=6.18 209 | - perl-http-negotiate=6.01 210 | - perl-image-info=1.38 211 | - perl-image-size=3.300 212 | - perl-io-html=1.001 213 | - perl-io-sessiondata=1.03 214 | - perl-io-socket-ssl=2.066 215 | - perl-io-string=1.08 216 | - perl-io-stringy=2.111 217 | - perl-io-tty=1.12 218 | - perl-ipc-run=20180523.0 219 | - perl-ipc-sharelite=0.17 220 | - perl-jcode=2.07 221 | - perl-json=4.02 222 | - perl-json-xs=2.34 223 | - perl-lib=0.63 224 | - perl-libwww-perl=6.39 225 | - perl-libxml-perl=0.08 226 | - perl-list-moreutils=0.428 227 | - perl-list-moreutils-xs=0.428 228 | - perl-local-lib=2.000024 229 | - perl-lwp-mediatypes=6.04 230 | - perl-lwp-protocol-https=6.07 231 | - perl-lwp-simple=6.15 232 | - perl-mailtools=2.21 233 | - perl-math-cdf=0.1 234 | - perl-math-derivative=1.01 235 | - perl-math-random=0.72 236 | - perl-math-spline=0.02 237 | - perl-mime-base64=3.15 238 | - perl-mime-lite=3.030 239 | - perl-mime-tools=5.508 240 | - perl-mime-types=2.17 241 | - perl-mldbm=2.05 242 | - perl-module-build=0.4224 243 | - perl-module-implementation=0.09 244 | - perl-module-runtime=0.016 245 | - perl-module-runtime-conflicts=0.003 246 | - perl-moo=2.003004 247 | - perl-moose=2.2011 248 | - 
perl-mozilla-ca=20180117 249 | - perl-mro-compat=0.13 250 | - perl-net-http=6.19 251 | - perl-net-ssleay=1.88 252 | - perl-ntlm=1.09 253 | - perl-ole-storage_lite=0.19 254 | - perl-package-deprecationmanager=0.17 255 | - perl-package-stash=0.38 256 | - perl-package-stash-xs=0.28 257 | - perl-parallel-forkmanager=2.02 258 | - perl-params-util=1.07 259 | - perl-parent=0.236 260 | - perl-parse-recdescent=1.967015 261 | - perl-pathtools=3.75 262 | - perl-pdf-api2=2.035 263 | - perl-pod-escapes=1.07 264 | - perl-pod-usage=1.69 265 | - perl-postscript=0.06 266 | - perl-role-tiny=2.000008 267 | - perl-scalar-list-utils=1.52 268 | - perl-set-scalar=1.29 269 | - perl-soap-lite=1.19 270 | - perl-socket=2.027 271 | - perl-sort-naturally=1.03 272 | - perl-spreadsheet-parseexcel=0.65 273 | - perl-spreadsheet-writeexcel=2.40 274 | - perl-statistics-descriptive=3.0702 275 | - perl-storable=3.15 276 | - perl-sub-exporter=0.987 277 | - perl-sub-exporter-progressive=0.001013 278 | - perl-sub-identify=0.14 279 | - perl-sub-install=0.928 280 | - perl-sub-name=0.21 281 | - perl-sub-quote=2.006003 282 | - perl-sub-uplevel=0.2800 283 | - perl-svg=2.84 284 | - perl-svg-graph=0.02 285 | - perl-task-weaken=1.06 286 | - perl-template-toolkit=2.26 287 | - perl-test=1.26 288 | - perl-test-deep=1.128 289 | - perl-test-differences=0.67 290 | - perl-test-exception=0.43 291 | - perl-test-harness=3.42 292 | - perl-test-leaktrace=0.16 293 | - perl-test-most=0.35 294 | - perl-test-requiresinternet=0.05 295 | - perl-test-warn=0.36 296 | - perl-text-diff=1.45 297 | - perl-tie-ixhash=1.23 298 | - perl-time-hires=1.9760 299 | - perl-time-local=1.28 300 | - perl-timedate=2.30 301 | - perl-tree-dag_node=1.31 302 | - perl-try-tiny=0.30 303 | - perl-type-tiny=1.004004 304 | - perl-types-serialiser=1.0 305 | - perl-unicode-map=0.112 306 | - perl-uri=1.76 307 | - perl-www-robotrules=6.02 308 | - perl-xml-dom=1.46 309 | - perl-xml-dom-xpath=0.14 310 | - perl-xml-filter-buffertext=1.01 311 | - perl-xml-libxml=2.0132 312 | - perl-xml-libxslt=1.94 313 | - perl-xml-namespacesupport=1.12 314 | - perl-xml-parser=2.44_01 315 | - perl-xml-regexp=0.04 316 | - perl-xml-sax=1.02 317 | - perl-xml-sax-base=1.09 318 | - perl-xml-sax-expat=0.51 319 | - perl-xml-sax-writer=0.57 320 | - perl-xml-simple=2.25 321 | - perl-xml-twig=3.52 322 | - perl-xml-writer=0.625 323 | - perl-xml-xpath=1.44 324 | - perl-xml-xpathengine=0.14 325 | - perl-xsloader=0.24 326 | - perl-yaml=1.29 327 | - pip=22.3.1 328 | - pixman=0.40.0 329 | - pthread-stubs=0.4 330 | - pyparsing=3.0.4 331 | - python=3.7.12 332 | - python-dateutil=2.8.2 333 | - python_abi=3.7 334 | - pytz=2022.1 335 | - r-base=4.1.3 336 | - r-bitops=1.0_7 337 | - r-catools=1.18.2 338 | - r-gplots=3.1.3 339 | - r-gtools=3.9.3 340 | - r-kernsmooth=2.23_20 341 | - r-randomforest=4.7_1.1 342 | - r-rocr=1.0_11 343 | - readline=8.1.2 344 | - sed=4.8 345 | - seqkit=2.3.1 346 | - setuptools=65.5.1 347 | - six=1.16.0 348 | - sqlite=3.38.5 349 | - stringtie=2.2.1 350 | - sysroot_linux-64=2.12 351 | - t_coffee=11.0.8 352 | - tidyp=1.04 353 | - tk=8.6.12 354 | - tktable=2.10 355 | - transdecoder=5.5.0 356 | - wheel=0.38.4 357 | - xorg-kbproto=1.0.7 358 | - xorg-libice=1.0.10 359 | - xorg-libsm=1.2.3 360 | - xorg-libx11=1.7.2 361 | - xorg-libxau=1.0.9 362 | - xorg-libxdmcp=1.1.3 363 | - xorg-libxext=1.3.4 364 | - xorg-libxrender=0.9.10 365 | - xorg-libxt=1.2.1 366 | - xorg-renderproto=0.11.1 367 | - xorg-xextproto=7.3.0 368 | - xorg-xproto=7.0.31 369 | - xz=5.2.6 370 | - zlib=1.2.11 371 | - zstd=1.5.2 372 | 
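Note on using the pinned environment above: it can be created once, up front, rather than letting each pipeline build it. A minimal sketch, assuming conda (or mamba) is already installed and the command is run from the repository root; the environment name (annotate_my_genomes) is taken from the name: field of the file:

    conda env create -f nextflow_scripts/22.04_environment.yml   # or: mamba env create -f nextflow_scripts/22.04_environment.yml
    conda activate annotate_my_genomes

The Nextflow scripts in this directory can instead receive this file via their --conda parameter, in which case Nextflow creates and caches the environment itself on first run.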
-------------------------------------------------------------------------------- /nextflow_scripts/environment.yml: -------------------------------------------------------------------------------- 1 | name: annotate_my_genomes 2 | channels: 3 | - anaconda 4 | - conda-forge 5 | - bioconda 6 | - defaults 7 | dependencies: 8 | - _libgcc_mutex=0.1=conda_forge 9 | - _openmp_mutex=4.5=1_gnu 10 | - _r-mutex=1.0.1=anacondar_1 11 | - argtable2=2.13=h14c3975_1001 12 | - atk-1.0=2.36.0=h3371d22_4 13 | - bedtools=2.30.0=h468198e_3 14 | - binutils_impl_linux-64=2.36.1=h193b22a_2 15 | - binutils_linux-64=2.36=hf3e587d_7 16 | - bioconductor-seqlogo=1.60.0=r41hdfd78af_0 17 | - bwidget=1.9.14=ha770c72_1 18 | - bzip2=1.0.8=h7f98852_4 19 | - c-ares=1.18.1=h7f98852_0 20 | - ca-certificates=2020.10.14=0 21 | - cairo=1.16.0=ha12eb4b_1010 22 | - certifi=2020.6.20=py36_0 23 | - clustalo=1.2.4=h87f3376_5 24 | - coreutils=9.0=h7f98852_0 25 | - curl=7.82.0=h7bff187_0 26 | - emboss=6.6.0=h5a44aac_5 27 | - expat=2.4.7=h27087fc_0 28 | - fasta_ushuffle=0.2=hec16e2b_4 29 | - feelnc=0.2=pl526_0 30 | - font-ttf-dejavu-sans-mono=2.37=hab24e00_0 31 | - font-ttf-inconsolata=3.000=h77eed37_0 32 | - font-ttf-source-code-pro=2.038=h77eed37_0 33 | - font-ttf-ubuntu=0.83=hab24e00_0 34 | - fontconfig=2.13.96=h8e229c2_2 35 | - fonts-conda-ecosystem=1=0 36 | - fonts-conda-forge=1=0 37 | - freetype=2.10.4=h0708190_1 38 | - fribidi=1.0.10=h36c2ea0_0 39 | - gawk=5.1.0=h7b6447c_0 40 | - gcc_impl_linux-64=9.4.0=h03d3576_13 41 | - gcc_linux-64=9.4.0=h391b98a_7 42 | - gdk-pixbuf=2.42.6=h04a7f16_0 43 | - gettext=0.19.8.1=h73d1719_1008 44 | - gffcompare=0.11.2=h9f5acd7_3 45 | - gffread=0.12.7=hd03093a_1 46 | - gfortran_impl_linux-64=9.4.0=h0003116_13 47 | - gfortran_linux-64=9.4.0=hf0ab688_7 48 | - giflib=5.2.1=h36c2ea0_2 49 | - gmap=2021.08.25=pl5262h36cd882_0 50 | - graphite2=1.3.13=h58526e2_1001 51 | - graphviz=3.0.0=h5abf519_0 52 | - gsl=2.7=he838d99_0 53 | - gtk2=2.24.33=h90689f9_2 54 | - gts=0.7.6=h64030ff_2 55 | - gxx_impl_linux-64=9.4.0=h03d3576_13 56 | - gxx_linux-64=9.4.0=h0316aca_7 57 | - harfbuzz=3.4.0=hb4a5f5f_0 58 | - htslib=1.14=h9753748_2 59 | - icu=69.1=h9c3ff4c_0 60 | - jbig=2.1=h7f98852_2003 61 | - jpeg=9e=h7f98852_0 62 | - k8=0.2.5=hd03093a_2 63 | - kernel-headers_linux-64=2.6.32=he073ed8_15 64 | - keyutils=1.6.1=h166bdaf_0 65 | - kmerinshort=1.0.1=0 66 | - krb5=1.19.2=h3790be6_4 67 | - ld_impl_linux-64=2.36.1=hea4e1c9_2 68 | - lerc=3.0=h9c3ff4c_0 69 | - libblas=3.9.0=13_linux64_openblas 70 | - libcblas=3.9.0=13_linux64_openblas 71 | - libcurl=7.82.0=h7bff187_0 72 | - libdb=6.2.32=h9c3ff4c_0 73 | - libdeflate=1.10=h7f98852_0 74 | - libedit=3.1.20191231=he28a2e2_2 75 | - libev=4.33=h516909a_1 76 | - libffi=3.4.2=h7f98852_5 77 | - libgcc=7.2.0=h69d50b8_2 78 | - libgcc-devel_linux-64=9.4.0=hd854feb_13 79 | - libgcc-ng=11.2.0=h1d223b6_13 80 | - libgd=2.3.3=h283352f_2 81 | - libgfortran-ng=11.2.0=h69a702a_13 82 | - libgfortran5=11.2.0=h5c6108e_13 83 | - libglib=2.70.2=h174f98d_4 84 | - libgomp=11.2.0=h1d223b6_13 85 | - libiconv=1.16=h516909a_0 86 | - liblapack=3.9.0=13_linux64_openblas 87 | - libnghttp2=1.47.0=h727a467_0 88 | - libnsl=2.0.0=h7f98852_0 89 | - libopenblas=0.3.18=pthreads_h8fe5266_0 90 | - libpng=1.6.37=h21135ba_2 91 | - librsvg=2.52.5=h0a9e6e8_2 92 | - libsanitizer=9.4.0=h79bfe98_13 93 | - libssh2=1.10.0=ha56f1ee_2 94 | - libstdcxx-devel_linux-64=9.4.0=hd854feb_13 95 | - libstdcxx-ng=11.2.0=he4da1e4_13 96 | - libtiff=4.3.0=h542a066_3 97 | - libtool=2.4.6=h9c3ff4c_1008 98 | - libuuid=2.32.1=h7f98852_1000 99 | - 
libwebp=1.2.2=h3452ae3_0 100 | - libwebp-base=1.2.2=h7f98852_1 101 | - libxcb=1.13=h7f98852_1004 102 | - libxml2=2.9.12=h885dcf4_1 103 | - libxslt=1.1.33=h0ef7038_3 104 | - libzlib=1.2.11=h36c2ea0_1013 105 | - lz4-c=1.9.3=h9c3ff4c_1 106 | - make=4.3=hd18ef5c_1 107 | - minimap2=2.24=h7132678_1 108 | - ncurses=6.2=h58526e2_4 109 | - numpy=1.19.5=py36hfc0c790_2 110 | - openssl=1.1.1l=h7f98852_0 111 | - pandas=1.1.3=py36he6710b0_0 112 | - pango=1.50.5=h4dcc4a0_0 113 | - parallel=20220222=ha770c72_0 114 | - pcre=8.45=h9c3ff4c_0 115 | - pcre2=10.37=h032f7d1_0 116 | - perl=5.26.2=h36c2ea0_1008 117 | - perl-aceperl=1.92=pl526_2 118 | - perl-algorithm-diff=1.1903=pl526_2 119 | - perl-algorithm-munkres=0.08=pl526_1 120 | - perl-apache-test=1.40=pl526_1 121 | - perl-app-cpanminus=1.7044=pl526_1 122 | - perl-appconfig=1.71=pl526_1 123 | - perl-array-compare=3.0.1=pl526_1 124 | - perl-autoloader=5.74=pl526_2 125 | - perl-base=2.23=pl526_1 126 | - perl-bio-asn1-entrezgene=1.73=pl5262hdfd78af_2 127 | - perl-bio-featureio=1.6.905=pl5262hdfd78af_3 128 | - perl-bio-phylo=0.58=pl5262hdfd78af_3 129 | - perl-bio-samtools=1.43=pl526h1341992_1 130 | - perl-bioperl=1.6.924=6 131 | - perl-bioperl-core=1.007002=pl5262hdfd78af_3 132 | - perl-bioperl-run=1.007002=pl5262hdfd78af_5 133 | - perl-business-isbn=3.004=pl526_0 134 | - perl-business-isbn-data=20140910.003=pl526_0 135 | - perl-cache-cache=1.08=pl526_0 136 | - perl-capture-tiny=0.48=pl526_0 137 | - perl-carp=1.38=pl526_3 138 | - perl-cgi=4.44=pl526h14c3975_1 139 | - perl-class-data-inheritable=0.08=pl526_1 140 | - perl-class-inspector=1.34=pl526_0 141 | - perl-class-load=0.25=pl526_0 142 | - perl-class-load-xs=0.10=pl526h6bb024c_2 143 | - perl-class-method-modifiers=2.12=pl526_0 144 | - perl-clone=0.42=pl526h516909a_0 145 | - perl-common-sense=3.74=pl526_2 146 | - perl-compress-raw-zlib=2.087=pl526hc9558a2_0 147 | - perl-constant=1.33=pl526_1 148 | - perl-convert-binary-c=0.78=pl526h6bb024c_3 149 | - perl-convert-binhex=1.125=pl526_1 150 | - perl-crypt-rc4=2.02=pl526_1 151 | - perl-data-dumper=2.173=pl526_0 152 | - perl-data-optlist=0.110=pl526_2 153 | - perl-data-stag=0.14=pl526_1 154 | - perl-date-format=2.30=pl526_2 155 | - perl-db-file=1.855=pl526h516909a_0 156 | - perl-dbd-sqlite=1.64=pl526h516909a_0 157 | - perl-dbi=1.642=pl526_0 158 | - perl-devel-globaldestruction=0.14=pl526_0 159 | - perl-devel-overloadinfo=0.005=pl526_0 160 | - perl-devel-stacktrace=2.04=pl526_0 161 | - perl-digest-hmac=1.03=pl526_3 162 | - perl-digest-md5=2.55=pl526_0 163 | - perl-digest-perl-md5=1.9=pl526_1 164 | - perl-digest-sha1=2.13=pl526h6bb024c_1 165 | - perl-dist-checkconflicts=0.11=pl526_2 166 | - perl-dynaloader=1.25=pl526_1 167 | - perl-email-date-format=1.005=pl526_2 168 | - perl-encode=2.88=pl526_1 169 | - perl-encode-locale=1.05=pl526_6 170 | - perl-error=0.17027=pl526_1 171 | - perl-eval-closure=0.14=pl526h6bb024c_4 172 | - perl-exception-class=1.44=pl526_0 173 | - perl-exporter=5.72=pl526_1 174 | - perl-exporter-tiny=1.002001=pl526_0 175 | - perl-extutils-makemaker=7.36=pl526_1 176 | - perl-file-listing=6.04=pl526_1 177 | - perl-file-path=2.16=pl526_0 178 | - perl-file-slurp-tiny=0.004=pl526_1 179 | - perl-file-sort=1.01=pl526_2 180 | - perl-file-temp=0.2304=pl526_2 181 | - perl-file-which=1.23=pl526_0 182 | - perl-font-afm=1.20=pl526_2 183 | - perl-font-ttf=1.06=pl526_0 184 | - perl-gd=2.68=pl526he941832_0 185 | - perl-getopt-long=2.50=pl526_1 186 | - perl-graph=0.9704=pl526_1 187 | - perl-graphviz=2.20=1 188 | - perl-html-element-extended=1.18=pl526_1 189 | - 
perl-html-entities-numbered=0.04=pl526_1 190 | - perl-html-formatter=2.16=pl526_0 191 | - perl-html-parser=3.72=pl526h6bb024c_5 192 | - perl-html-tableextract=2.13=pl526_2 193 | - perl-html-tagset=3.20=pl526_3 194 | - perl-html-tidy=1.60=pl526_0 195 | - perl-html-tree=5.07=pl526_1 196 | - perl-html-treebuilder-xpath=0.14=pl526_1 197 | - perl-http-cookies=6.04=pl526_0 198 | - perl-http-daemon=6.01=pl526_1 199 | - perl-http-date=6.02=pl526_3 200 | - perl-http-message=6.18=pl526_0 201 | - perl-http-negotiate=6.01=pl526_3 202 | - perl-image-info=1.38=pl526_1 203 | - perl-image-size=3.300=pl526_2 204 | - perl-io-html=1.001=pl526_2 205 | - perl-io-sessiondata=1.03=pl526_1 206 | - perl-io-socket-ssl=2.066=pl526_0 207 | - perl-io-string=1.08=pl526_3 208 | - perl-io-stringy=2.111=pl526_1 209 | - perl-io-tty=1.12=pl526_1 210 | - perl-ipc-run=20180523.0=pl526_0 211 | - perl-ipc-sharelite=0.17=pl526h6bb024c_1 212 | - perl-jcode=2.07=pl526_2 213 | - perl-json=4.02=pl526_0 214 | - perl-json-xs=2.34=pl526h6bb024c_3 215 | - perl-libwww-perl=6.39=pl526_0 216 | - perl-libxml-perl=0.08=pl526_2 217 | - perl-list-moreutils=0.428=pl526_1 218 | - perl-list-moreutils-xs=0.428=pl526_0 219 | - perl-local-lib=2.000024=pl526_0 220 | - perl-lwp-mediatypes=6.04=pl526_0 221 | - perl-lwp-protocol-https=6.07=pl526_4 222 | - perl-lwp-simple=6.15=pl526h470a237_4 223 | - perl-mailtools=2.21=pl526_0 224 | - perl-math-cdf=0.1=pl526h14c3975_5 225 | - perl-math-derivative=1.01=pl526_0 226 | - perl-math-random=0.72=pl526h14c3975_2 227 | - perl-math-spline=0.02=pl526_2 228 | - perl-mime-base64=3.15=pl526_1 229 | - perl-mime-lite=3.030=pl526_1 230 | - perl-mime-tools=5.508=pl526_1 231 | - perl-mime-types=2.17=pl526_0 232 | - perl-mldbm=2.05=pl526_1 233 | - perl-module-build=0.4224=pl526h470a237_1 234 | - perl-module-implementation=0.09=pl526_2 235 | - perl-module-runtime=0.016=pl526_1 236 | - perl-module-runtime-conflicts=0.003=pl526_0 237 | - perl-moo=2.003004=pl526_0 238 | - perl-moose=2.2011=pl526hf484d3e_1 239 | - perl-mozilla-ca=20180117=pl526_1 240 | - perl-mro-compat=0.13=pl526_0 241 | - perl-net-http=6.19=pl526_0 242 | - perl-net-ssleay=1.88=pl526h90d6eec_0 243 | - perl-ntlm=1.09=pl526_4 244 | - perl-ole-storage_lite=0.19=pl526_3 245 | - perl-package-deprecationmanager=0.17=pl526_0 246 | - perl-package-stash=0.38=pl526hf484d3e_1 247 | - perl-package-stash-xs=0.28=pl526hf484d3e_1 248 | - perl-parallel-forkmanager=2.02=pl526_0 249 | - perl-params-util=1.07=pl526h6bb024c_4 250 | - perl-parent=0.236=pl526_1 251 | - perl-parse-recdescent=1.967015=pl526_0 252 | - perl-pathtools=3.75=pl526h14c3975_1 253 | - perl-pdf-api2=2.035=pl526_0 254 | - perl-postscript=0.06=pl526_2 255 | - perl-role-tiny=2.000008=pl526_0 256 | - perl-scalar-list-utils=1.52=pl526h516909a_0 257 | - perl-set-scalar=1.29=pl526_2 258 | - perl-soap-lite=1.19=pl526_1 259 | - perl-socket=2.027=pl526_1 260 | - perl-sort-naturally=1.03=pl526_2 261 | - perl-spreadsheet-parseexcel=0.65=pl526_2 262 | - perl-spreadsheet-writeexcel=2.40=pl526_2 263 | - perl-statistics-descriptive=3.0702=pl526_0 264 | - perl-storable=3.15=pl526h14c3975_0 265 | - perl-sub-exporter=0.987=pl526_2 266 | - perl-sub-exporter-progressive=0.001013=pl526_0 267 | - perl-sub-identify=0.14=pl526h14c3975_0 268 | - perl-sub-install=0.928=pl526_2 269 | - perl-sub-name=0.21=pl526_1 270 | - perl-sub-quote=2.006003=pl526_1 271 | - perl-sub-uplevel=0.2800=pl526h14c3975_2 272 | - perl-svg=2.84=pl526_0 273 | - perl-svg-graph=0.02=pl526_3 274 | - perl-task-weaken=1.06=pl526_0 275 | - 
perl-template-toolkit=2.26=pl526_1 276 | - perl-test-deep=1.128=pl526_1 277 | - perl-test-differences=0.67=pl526_0 278 | - perl-test-exception=0.43=pl526_2 279 | - perl-test-leaktrace=0.16=pl526h14c3975_2 280 | - perl-test-most=0.35=pl526_0 281 | - perl-test-pod=1.52=pl526_0 282 | - perl-test-requiresinternet=0.05=pl526_0 283 | - perl-test-warn=0.36=pl526_1 284 | - perl-text-diff=1.45=pl526_0 285 | - perl-threaded=5.32.1=hdfd78af_1 286 | - perl-tie-ixhash=1.23=pl526_2 287 | - perl-time-local=1.28=pl526_1 288 | - perl-timedate=2.30=pl526_1 289 | - perl-tree-dag_node=1.31=pl526_0 290 | - perl-try-tiny=0.30=pl526_1 291 | - perl-type-tiny=1.004004=pl526_0 292 | - perl-types-serialiser=1.0=pl526_2 293 | - perl-unicode-map=0.112=pl526h6bb024c_3 294 | - perl-uri=1.76=pl526_0 295 | - perl-www-robotrules=6.02=pl526_3 296 | - perl-xml-dom=1.46=pl526_0 297 | - perl-xml-dom-xpath=0.14=pl526_1 298 | - perl-xml-filter-buffertext=1.01=pl526_2 299 | - perl-xml-libxml=2.0132=pl526h7ec2d77_1 300 | - perl-xml-libxslt=1.94=pl526_1 301 | - perl-xml-namespacesupport=1.12=pl526_0 302 | - perl-xml-parser=2.44_01=pl5262hc3e0081_1002 303 | - perl-xml-regexp=0.04=pl526_2 304 | - perl-xml-sax=1.02=pl526_0 305 | - perl-xml-sax-base=1.09=pl526_0 306 | - perl-xml-sax-expat=0.51=pl526_3 307 | - perl-xml-sax-writer=0.57=pl526_0 308 | - perl-xml-simple=2.25=pl526_1 309 | - perl-xml-twig=3.52=pl526_2 310 | - perl-xml-writer=0.625=pl526_2 311 | - perl-xml-xpath=1.44=pl526_0 312 | - perl-xml-xpathengine=0.14=pl526_2 313 | - perl-xsloader=0.24=pl526_0 314 | - perl-yaml=1.29=pl526_0 315 | - pip=20.0.2=py36_1 316 | - pixman=0.40.0=h36c2ea0_0 317 | - pthread-stubs=0.4=h36c2ea0_1001 318 | - python=3.6.15=hb7a2778_0_cpython 319 | - python-dateutil=2.8.1=py_0 320 | - python_abi=3.6=2_cp36m 321 | - pytz=2020.1=py_0 322 | - r-assertthat=0.2.1=r41hc72bb7e_2 323 | - r-backports=1.4.1=r41hcfec24a_0 324 | - r-base=4.1.2=h2553ce4_1 325 | - r-bitops=1.0_7=r41hcfec24a_0 326 | - r-brio=1.1.3=r41hcfec24a_0 327 | - r-callr=3.7.0=r41hc72bb7e_0 328 | - r-catools=1.18.2=r41h03ef668_0 329 | - r-cli=3.2.0=r41h03ef668_0 330 | - r-colorspace=2.0_3=r41h06615bd_0 331 | - r-crayon=1.5.0=r41hc72bb7e_0 332 | - r-desc=1.4.0=r41hc72bb7e_0 333 | - r-diffobj=0.3.5=r41hcfec24a_0 334 | - r-digest=0.6.29=r41h03ef668_0 335 | - r-ellipsis=0.3.2=r41hcfec24a_0 336 | - r-evaluate=0.15=r41hc72bb7e_0 337 | - r-fansi=1.0.2=r41hcfec24a_0 338 | - r-farver=2.1.0=r41h03ef668_0 339 | - r-ggplot2=3.3.5=r41hc72bb7e_0 340 | - r-glue=1.6.2=r41h06615bd_0 341 | - r-gplots=3.1.1=r41hc72bb7e_0 342 | - r-gtable=0.3.0=r41hc72bb7e_3 343 | - r-gtools=3.9.2=r41hcfec24a_0 344 | - r-isoband=0.2.5=r41h03ef668_0 345 | - r-jsonlite=1.8.0=r41h06615bd_0 346 | - r-kernsmooth=2.23_20=r41h742201e_0 347 | - r-labeling=0.4.2=r41hc72bb7e_1 348 | - r-lattice=0.20_45=r41hcfec24a_0 349 | - r-lifecycle=1.0.1=r41hc72bb7e_0 350 | - r-magrittr=2.0.2=r41hcfec24a_0 351 | - r-mass=7.3_55=r41hcfec24a_0 352 | - r-matrix=1.4_0=r41he454529_0 353 | - r-mgcv=1.8_39=r41h0154571_0 354 | - r-munsell=0.5.0=r41hc72bb7e_1004 355 | - r-nlme=3.1_155=r41h859d828_0 356 | - r-pillar=1.7.0=r41hc72bb7e_0 357 | - r-pkgconfig=2.0.3=r41hc72bb7e_1 358 | - r-pkgload=1.2.4=r41h03ef668_0 359 | - r-praise=1.0.0=r41hc72bb7e_1005 360 | - r-processx=3.5.2=r41hcfec24a_0 361 | - r-ps=1.6.0=r41hcfec24a_0 362 | - r-r6=2.5.1=r41hc72bb7e_0 363 | - r-randomforest=4.6_14=r41h859d828_1004 364 | - r-rcolorbrewer=1.1_2=r41h785f33e_1003 365 | - r-rcpp=1.0.8=r41h03ef668_0 366 | - r-rematch2=2.1.2=r41hc72bb7e_1 367 | - r-rlang=0.4.12=r41hcfec24a_0 368 | 
- r-rocr=1.0_11=r41hc72bb7e_1 369 | - r-rprojroot=2.0.2=r41hc72bb7e_0 370 | - r-rstudioapi=0.13=r41hc72bb7e_0 371 | - r-scales=1.1.1=r41hc72bb7e_0 372 | - r-testthat=3.1.2=r41h03ef668_0 373 | - r-tibble=3.1.6=r41hcfec24a_0 374 | - r-utf8=1.2.2=r41hcfec24a_0 375 | - r-vctrs=0.3.8=r41hcfec24a_1 376 | - r-viridislite=0.4.0=r41hc72bb7e_0 377 | - r-waldo=0.3.1=r41hc72bb7e_0 378 | - r-withr=2.5.0=r41hc72bb7e_0 379 | - readline=8.1=h46c0cb4_0 380 | - seqkit=2.1.0=h9ee0642_0 381 | - setuptools=49.6.0=py36h5fab9bb_3 382 | - six=1.15.0=py_0 383 | - sqlite=3.37.0=h9cd32fc_0 384 | - stringtie=2.2.1=hecb563c_2 385 | - sysroot_linux-64=2.12=he073ed8_15 386 | - tidyp=1.04=hec16e2b_4 387 | - tk=8.6.12=h27826a3_0 388 | - tktable=2.10=hb7b940f_3 389 | - transdecoder=5.5.0=pl5262hdfd78af_4 390 | - wheel=0.37.1=pyhd8ed1ab_0 391 | - xorg-kbproto=1.0.7=h7f98852_1002 392 | - xorg-libice=1.0.10=h7f98852_0 393 | - xorg-libsm=1.2.3=hd9c2040_1000 394 | - xorg-libx11=1.7.2=h7f98852_0 395 | - xorg-libxau=1.0.9=h7f98852_0 396 | - xorg-libxdmcp=1.1.3=h7f98852_0 397 | - xorg-libxext=1.3.4=h7f98852_1 398 | - xorg-libxrender=0.9.10=h7f98852_1003 399 | - xorg-libxt=1.2.1=h7f98852_2 400 | - xorg-renderproto=0.11.1=h7f98852_1002 401 | - xorg-xextproto=7.3.0=h7f98852_1002 402 | - xorg-xproto=7.0.31=h7f98852_1007 403 | - xz=5.2.5=h516909a_1 404 | - zlib=1.2.11=h36c2ea0_1013 405 | - zstd=1.5.2=ha95c52a_0 406 | -------------------------------------------------------------------------------- /nextflow_scripts/genome-download.nf: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env nextflow 2 | 3 | /* 4 | * pipeline input parameters 5 | */ 6 | 7 | 8 | params.genome = 'galGal6' 9 | params.outdir = './' 10 | params.conda = '/home/wslab/test1_annotate/environment.yml' 11 | 12 | println """\ 13 | G E N O M E - D O W N L O A D P I P E L I N E 14 | =============================================== 15 | genome : ${params.genome} 16 | outdir : ${params.outdir} 17 | environment : ${params.conda} 18 | """ 19 | .stripIndent() 20 | 21 | process genome_download { 22 | echo true 23 | conda "${params.conda}" 24 | 25 | publishDir "${params.outdir}", mode: 'copy' 26 | 27 | output: 28 | file "${params.genome}*" 29 | 30 | shell: 31 | ''' 32 | #!/usr/bin/env bash 33 | 34 | genome="!{params.genome}" 35 | wget http://hgdownload.cse.ucsc.edu/goldenpath/${genome}/bigZips/${genome}.2bit 36 | wget http://hgdownload.soe.ucsc.edu/goldenPath/${genome}/database/refGene.txt.gz 37 | wget http://hgdownload.soe.ucsc.edu/goldenPath/${genome}/database/ncbiRefSeq.txt.gz 38 | if [ -f twoBitToFa ]; then 39 | echo "twoBitToFa script found. Continue:" 40 | echo "" 41 | : 42 | else 43 | echo "Downloading twoBitToFa script" 44 | wget http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/twoBitToFa 45 | fi 46 | chmod 755 twoBitToFa 47 | ./twoBitToFa ${genome}.2bit ${genome}.fa 48 | samtools faidx ${genome}.fa 49 | 50 | if [ -f genePredToGtf ]; then 51 | echo "genePredToGtf script found. Continue:" 52 | echo "" 53 | : 54 | else 55 | echo "Downloading genePredToGtf script" 56 | wget http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/genePredToGtf 57 | fi 58 | chmod 755 genePredToGtf 59 | gunzip refGene.txt.gz 60 | gunzip ncbiRefSeq.txt.gz 61 | cut -f 2- refGene.txt | ./genePredToGtf file stdin -source=${genome}_Ref ${genome}.gtf 62 | cut -f 2- ncbiRefSeq.txt | ./genePredToGtf file stdin -source=${genome}_Ref ${genome}_ncbiRefSeq.gtf 63 | echo "" 64 | echo "All done. 
${genome} FASTA and GTF files are located in the current working directory (or specified directory with --outdir)" 65 | ''' 66 | } 67 | 68 | -------------------------------------------------------------------------------- /nextflow_scripts/isoform-identification.nf: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env nextflow 2 | 3 | /* 4 | * pipeline input parameters 5 | */ 6 | 7 | params.NCBI_tmap = '/home/wslab/test1_annotate/nextflow_scripts/NCBI_compare.stringtie_chr33.gtf.tmap' 8 | params.NCBI_transcripts = '/home/wslab/test1_annotate/nextflow_scripts/NCBI_transcripts.fa' 9 | params.genome_name = 'galGal6' 10 | params.conda = '/home/wslab/test1_annotate/nextflow_scripts/environment.yml' 11 | params.outdir ='./' 12 | 13 | println """\ 14 | I S O F O R M - I D E N T I F I C A T I O N P I P E L I N E 15 | ============================================================= 16 | NCBI_tmap : ${params.NCBI_tmap} 17 | NCBI_transcripts : ${params.NCBI_transcripts} 18 | genome_name : ${params.genome_name} 19 | environment : ${params.conda} 20 | outdir : ${params.outdir} 21 | """ 22 | .stripIndent() 23 | 24 | 25 | process check_paths { 26 | echo true 27 | stageInMode 'copy' 28 | conda "${params.conda}" 29 | 30 | output: 31 | val '${m_DIR}' into records1 32 | val '${t_DIR}' into records2 33 | val '${c_DIR}' into records3 34 | val '${o_DIR}' into records4 35 | 36 | shell: 37 | ''' 38 | # mandatory arguments 39 | if [ ! "!{params.NCBI_tmap}" ] || [ ! "!{params.NCBI_transcripts}" ] || [ ! "!{params.genome_name}" ] || [ ! "!{params.conda}" ] || [ ! "!{params.outdir}" ]; then 40 | echo "" 41 | echo "arguments -m, -t, -g, -c and -o must be provided" 42 | echo "" 43 | echo "$usage" >&2; exit 1 44 | fi 45 | 46 | # Conditions : output folder 47 | if [ ! -d "!{params.outdir}" ]; then 48 | echo "" 49 | echo "Output directory: !{params.outdir} not found. Please create the output directory first, before running the pipeline." 50 | echo "" 51 | exit 9999 # die with error code 9999 52 | fi 53 | 54 | # Conditions : Input existance 55 | if [ ! -e "!{params.NCBI_tmap}" ]; then 56 | echo "" 57 | echo "!{params.NCBI_tmap} does not exist. Check your -m input" 58 | echo "" 59 | exit 9999 # die with error code 9999 60 | fi 61 | 62 | if [ ! -e "!{params.NCBI_transcripts}" ]; then 63 | echo "" 64 | echo "!{params.NCBI_transcripts} does not exist. Check your -t input" 65 | echo "" 66 | exit 9999 # die with error code 9999 67 | fi 68 | 69 | if [ ! -e "!{params.conda}" ]; then 70 | echo "" 71 | echo "!{params.conda} does not exist. 
Check your -c input" 72 | echo "" 73 | exit 9999 # die with error code 9999 74 | fi 75 | 76 | # Conditions : Getting absolute path of inputs 77 | echo "" 78 | m_DIR="$( cd "$( dirname "!{params.NCBI_tmap}" )" && pwd )" 79 | echo "" 80 | echo "::: The absolute path of -m is $m_DIR" 81 | echo "" 82 | t_DIR="$( cd "$( dirname "!{params.NCBI_transcripts}" )" && pwd )" 83 | echo "" 84 | echo "::: The absolute path of -t is $t_DIR" 85 | echo "" 86 | c_DIR="$( cd "$( dirname "!{params.conda}" )" && pwd )" 87 | echo "" 88 | echo "::: The absolute path of -c is $c_DIR" 89 | echo "" 90 | o_DIR="$( cd "$( dirname "!{params.outdir}" )" && pwd )" 91 | echo "" 92 | echo "::: The absolute path of -o is $o_DIR" 93 | echo "" 94 | ''' 95 | } 96 | 97 | 98 | process check_inputs { 99 | echo true 100 | stageInMode 'copy' 101 | conda "${params.conda}" 102 | 103 | output: 104 | val '${NCBI_tmap}' into records5 105 | val '${NCBI_transcripts}' into records6 106 | val '${genome_name}' into records7 107 | val '${anaconda_env}' into records8 108 | val '${outdir}' into records9 109 | file 'ncbiRefSeqLink.txt' into ncbiRefSeqLink 110 | 111 | shell: 112 | ''' 113 | printf "::: Defining Variables :::\n" 114 | echo"" 115 | FILE1="!{params.NCBI_tmap}" 116 | basename "$FILE1" 117 | NCBI_tmap="$(basename -- $FILE1)" 118 | echo "The NCBI tmap file used as input is the following: $NCBI_tmap" 119 | echo "" 120 | FILE2="!{params.NCBI_transcripts}" 121 | basename "$FILE2" 122 | NCBI_transcripts="$(basename -- $FILE2)" 123 | echo "The NCBI transcripts used as input is the following: $NCBI_transcripts" 124 | echo "" 125 | FILE3="!{params.conda}" 126 | basename "$FILE3" 127 | anaconda_env="$(basename -- $FILE3)" 128 | echo "The anaconda environment file is the following: $anaconda_env" 129 | echo "" 130 | FILE4="!{params.outdir}" 131 | basename "$FILE4" 132 | outdir="$(basename -- $FILE4)" 133 | echo "The outdir folder name is the following: $outdir" 134 | echo "" 135 | genome_name="!{params.genome_name}" 136 | 137 | if [ -f ncbiRefSeqLink.txt ]; then 138 | echo "::: ncbiRefSeqLink.txt file found. 
Continue:" 139 | echo "" 140 | : 141 | else 142 | echo "::: Downloading ncbiRefSeqLink.txt file" 143 | wget http://hgdownload.cse.ucsc.edu/goldenpath/${genome_name}/database/ncbiRefSeqLink.txt.gz 144 | gunzip ncbiRefSeqLink.txt.gz 145 | echo "" 146 | echo "Number of lines in ncbiRefSeqLink.txt:" 147 | cat ncbiRefSeqLink.txt | wc -l 148 | echo "Continue with python processing steps:" 149 | echo "" 150 | fi 151 | ''' 152 | } 153 | 154 | 155 | process python_inputs { 156 | echo true 157 | stageInMode 'copy' 158 | conda "${params.conda}" 159 | 160 | input: 161 | val '${m_DIR}' from records1 162 | val '${t_DIR}' from records2 163 | val '${NCBI_tmap}' from records5 164 | val '${NCBI_transcripts}' from records6 165 | 166 | output: 167 | file 'stringtie_for_script.tmap' into records10 168 | file 'transcripts_Isoform2.tab' into records11 169 | 170 | shell: 171 | ''' 172 | # Inputs for python 173 | cp "!{params.NCBI_tmap}" ./stringtie_for_script.tmap 174 | seqkit fx2tab "!{params.NCBI_transcripts}" > transcripts_Isoform.tab 175 | # Formatting transcripts_Isoform.tab if gene= is present in file 176 | sed -i 's/gene=/\t/'g transcripts_Isoform.tab 177 | awk '{print $1"\t"$NF}' transcripts_Isoform.tab > transcripts_Isoform2.tab 178 | ''' 179 | } 180 | 181 | 182 | process gffcompare_parser { 183 | echo true 184 | stageInMode 'copy' 185 | conda "${params.conda}" 186 | 187 | input: 188 | file 'ncbiRefSeqLink.txt' from ncbiRefSeqLink 189 | file 'stringtie_for_script.tmap' from records10 190 | file 'transcripts_Isoform2.tab' from records11 191 | 192 | output: 193 | file 'Ref_Transcript_Annotation.csv' into records12 194 | file 'Novel_Transcript_Annotation.csv' into records13 195 | 196 | shell: 197 | ''' 198 | python << END 199 | 200 | import sys 201 | import pandas as pd 202 | df = pd.read_csv('stringtie_for_script.tmap', sep = '\t') 203 | print(df.sample(10)) 204 | print("Total number of transcripts:", df.shape[0]) 205 | print("") 206 | df2 = df[~df.ref_id.astype(str).str.contains('-')] 207 | novel_transcripts = df[df.ref_id.astype(str).str.contains('-')] 208 | df3 = df2[["ref_gene_id", "ref_id", "class_code", "qry_gene_id", "qry_id", "num_exons", "FPKM", "TPM"]] 209 | df_novel_transcripts = novel_transcripts[["ref_gene_id", "ref_id", "class_code", "qry_gene_id", "qry_id", "num_exons", "FPKM", "TPM"]] 210 | print("Reference transcripts:") 211 | print(df3.sample(10)) 212 | print("") 213 | print("Novel transcripts:") 214 | print(df_novel_transcripts.sample(10)) 215 | print("") 216 | colnames=['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18'] 217 | dfA1 = pd.read_csv('ncbiRefSeqLink.txt', sep = '\t', low_memory=False, names=colnames, header=None) 218 | print(dfA1.head(10)) 219 | dfA2 = dfA1[['0', '1', '2', '3', '5', '14', '16']] 220 | dfA2 = dfA2.rename(columns={'0': 'ref_id', '1': 'Annotation Status', '2' : 'NCBI RefSeq Gene ID', '3' : 'Transcript Description', '5' : 'NCBI RefSeq Protein ID', '14' : 'Alternative Gene Name', '16' : 'RefSeq Transcript Info'}) 221 | print("ncbiRefSeqLink annotation:") 222 | print(dfA2.sample(10)) 223 | print("") 224 | colnames = ['qry_id', 'cds_seq', 'none'] 225 | cds = pd.read_csv('transcripts_Isoform2.tab', sep = '\t', names=colnames) 226 | cds2 = cds[["qry_id", "cds_seq"]] 227 | print("transcripts file:") 228 | print(cds2.sample(10)) 229 | print("") 230 | result1 = pd.merge(df3, dfA2, on='ref_id', how='inner') 231 | result1.sample(10) 232 | result2 = pd.merge(result1, cds2, on='qry_id', how='inner') 233 | 
result2.sample(10) 234 | result3 = pd.merge(df_novel_transcripts, cds2, on='qry_id', how='inner') 235 | result3.sample(10) 236 | print("Number of Joined Transcripts (reference):", result2.shape[0]) 237 | print("") 238 | print("Number of Joined Transcripts (novel):", result3.shape[0]) 239 | print("") 240 | result2.to_csv('Ref_Transcript_Annotation.csv', index=False) 241 | result3.to_csv('Novel_Transcript_Annotation.csv', index=False) 242 | print("::: Done. Ref_Transcript_Annotation.csv and Novel_Transcript_Annotation.csv were succesfully produced") 243 | print("") 244 | END 245 | ''' 246 | } 247 | 248 | 249 | process output_pipeline { 250 | echo true 251 | stageInMode 'copy' 252 | conda "${params.conda}" 253 | 254 | input: 255 | file 'Ref_Transcript_Annotation.csv' from records12 256 | file 'Novel_Transcript_Annotation.csv' from records13 257 | 258 | shell: 259 | ''' 260 | echo "" 261 | printf "::: Moving results to the output directory :::\n" 262 | mv Ref_Transcript_Annotation.csv "!{params.outdir}" 263 | mv Novel_Transcript_Annotation.csv "!{params.outdir}" 264 | echo "" 265 | echo "------------------------------------------------------------" 266 | echo "------------------------------------------------------------" 267 | echo "::: INFO: all done" ::: 268 | echo "" 269 | echo "The following files are available in the output directory : " 270 | echo "" 271 | echo "Ref_Transcript_Annotation.csv contains annotation and coordinates of known transcripts" 272 | echo "" 273 | echo "Novel_Transcript_Annotation.csv contains collection of novel transcripts" 274 | echo "" 275 | echo "------------------------------------------------------------" 276 | echo "------------------------------------------------------------" 277 | echo "" 278 | ''' 279 | } 280 | -------------------------------------------------------------------------------- /nextflow_scripts/old/add-ncbi-annotation.nf: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env nextflow 2 | 3 | /* 4 | * pipeline input parameters 5 | */ 6 | 7 | params.stringtie = '/home/wslab/test1_annotate/nextflow_scripts/stringtie_chr33.gtf' 8 | params.NCBI_annotation = '/home/wslab/test1_annotate/nextflow_scripts/galGal6_ncbiRefSeq.gtf' 9 | params.ref_annotation = '/home/wslab/test1_annotate/nextflow_scripts/galGal6.gtf' 10 | params.genome = '/home/wslab/test1_annotate/nextflow_scripts/galGal6.fa' 11 | params.config = '/home/wslab/test1_annotate/nextflow_scripts/gawn_config.sh' 12 | params.threads = '10' 13 | params.outdir = '/home/wslab/test1_annotate/nextflow_scripts/' 14 | params.conda = '/home/wslab/test1_annotate/nextflow_scripts/environment.yml' 15 | 16 | println """\ 17 | A D D - N C B I - A N N O T A T I O N P I P E L I N E 18 | ======================================================= 19 | stringtie : ${params.stringtie} 20 | NCBI_annotation : ${params.NCBI_annotation} 21 | ref_annotation : ${params.ref_annotation} 22 | genome : ${params.genome} 23 | config_file : ${params.config} 24 | threads : ${params.threads} 25 | outdir : ${params.outdir} 26 | environment : ${params.conda} 27 | """ 28 | .stripIndent() 29 | 30 | process add_ncbi_annotation { 31 | echo true 32 | stageInMode 'copy' 33 | conda "${params.conda}" 34 | 35 | shell: 36 | ''' 37 | add-ncbi-annotation -a "!{params.stringtie}" -n "!{params.NCBI_annotation}" -r "!{params.ref_annotation}" -g "!{params.genome}" -c "!{params.config}" -t "!{params.threads}" -o "!{params.outdir}" 38 | ''' 39 | } 40 | 
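Note on the legacy wrapper above: nextflow_scripts/old/add-ncbi-annotation.nf assumes the add-ncbi-annotation binary (compiled from bash_scripts/add_ncbi_annotation.sh by makefile.sh or makefile.nf) is already on the PATH, e.g. after sudo cp ./bin/* /usr/local/bin/. The hard-coded params.* defaults are machine-specific, so in practice they would be overridden on the command line. A minimal sketch, with hypothetical input paths:

    nextflow run nextflow_scripts/old/add-ncbi-annotation.nf \
        --stringtie /data/stringtie_chr33.gtf \
        --NCBI_annotation /data/galGal6_ncbiRefSeq.gtf \
        --ref_annotation /data/galGal6.gtf \
        --genome /data/galGal6.fa \
        --config ./gawn_config.sh \
        --threads 10 \
        --outdir ./results/ \
        --conda ./nextflow_scripts/environment.yml

Each --name value on the command line replaces the corresponding params.name default before the add_ncbi_annotation process runs.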
-------------------------------------------------------------------------------- /nextflow_scripts/old/annotate-my-genomes.nf: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env nextflow 2 | 3 | /* 4 | * pipeline input parameters 5 | */ 6 | 7 | params.stringtie = '/home/wslab/test1_annotate/nextflow_scripts/stringtie_chr33.gtf' 8 | params.NCBI_annotation = '/home/wslab/test1_annotate/nextflow_scripts/galGal6_ncbiRefSeq.gtf' 9 | params.ref_annotation = '/home/wslab/test1_annotate/nextflow_scripts/galGal6.gtf' 10 | params.genome = '/home/wslab/test1_annotate/nextflow_scripts/galGal6.fa' 11 | params.config = '/home/wslab/test1_annotate/nextflow_scripts/gawn_config.sh' 12 | params.threads = '10' 13 | params.outdir = '/home/wslab/test1_annotate/nextflow_scripts/' 14 | params.conda = '/home/wslab/test1_annotate/nextflow_scripts/environment.yml' 15 | 16 | println """\ 17 | A N N O T A T E - M Y - G E N O M E S P I P E L I N E 18 | ======================================================= 19 | stringtie : ${params.stringtie} 20 | ref_annotation : ${params.ref_annotation} 21 | genome : ${params.genome} 22 | config_file : ${params.config} 23 | threads : ${params.threads} 24 | outdir : ${params.outdir} 25 | environment : ${params.conda} 26 | """ 27 | .stripIndent() 28 | 29 | process annotate_my_genomes { 30 | echo true 31 | stageInMode 'copy' 32 | conda "${params.conda}" 33 | 34 | shell: 35 | ''' 36 | annotate-my-genomes -a "!{params.stringtie}" -r "!{params.ref_annotation}" -g "!{params.genome}" -c "!{params.config}" -t "!{params.threads}" -o "!{params.outdir}" 37 | ''' 38 | } 39 | 40 | -------------------------------------------------------------------------------- /nextflow_scripts/old/isoform-identification.nf: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env nextflow 2 | 3 | /* 4 | * pipeline input parameters 5 | */ 6 | 7 | params.NCBI_tmap = '/home/wslab/test1_annotate/nextflow_scripts/NCBI_compare.stringtie_chr33.gtf.tmap' 8 | params.NCBI_transcripts = '/home/wslab/test1_annotate/nextflow_scripts/NCBI_transcripts.fa' 9 | params.genome_name = 'galGal6' 10 | params.conda = '/home/wslab/test1_annotate/nextflow_scripts/environment.yml' 11 | params.outdir ='./' 12 | 13 | println """\ 14 | I S O F O R M - I D E N T I F I C A T I O N P I P E L I N E 15 | ============================================================= 16 | NCBI_tmap : ${params.NCBI_tmap} 17 | NCBI_transcripts : ${params.NCBI_transcripts} 18 | genome_name : ${params.genome_name} 19 | environment : ${params.conda} 20 | outdir : ${params.outdir} 21 | """ 22 | .stripIndent() 23 | 24 | process isoform_identification { 25 | echo true 26 | stageInMode 'copy' 27 | conda "${params.conda}" 28 | 29 | publishDir "${params.outdir}", mode: 'copy' 30 | 31 | shell: 32 | ''' 33 | isoform-identification -m "!{params.NCBI_tmap}" -t "!{params.NCBI_transcripts}" -g "!{params.genome_name}" 34 | cp *csv "!{params.outdir}" 35 | ''' 36 | } 37 | 38 | -------------------------------------------------------------------------------- /test/gawn_config.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Modify the following parameter values according to your experiment 4 | # Do not modify the parameter names or remove parameters 5 | # Do not add spaces around the equal (=) sign 6 | 7 | # Global parameters 8 | NCPUS=10 # Number of CPUs to use for analyses (int, 1+) 9 | 10 | # Genome indexing 11 | 
SKIP_GENOME_INDEXING=1 # 1 to skip genome indexing, 0 to index it 12 | 13 | # Genome annotation with transcriptome 14 | # NOTE: do not use compressed fasta files 15 | GENOME_NAME="genome.fasta" # Name of genome fasta file found in 03_data 16 | TRANSCRIPTOME_NAME="transcriptome.fasta" # Name of transcriptome fasta file found in 03_data 17 | 18 | # Path to swissprot database 19 | --------------------------------------------------------------------------------
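Note on test/gawn_config.sh above: the file ships without a SWISSPROT_DB entry; makefile.sh (and the make_and_install process in makefile.nf) append it after downloading and unpacking the NCBI swissprot BLAST database into ./swissprot/. A sketch of how the installed copy is expected to end, assuming a hypothetical checkout at /home/user/annotate_my_genomes:

    # Path to swissprot database
    SWISSPROT_DB="/home/user/annotate_my_genomes/swissprot/swissprot"
    #

If the database is later moved, this appended line should be the only part of gawn_config.sh that needs editing before re-running the annotation pipelines.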