├── 22.04_environment.yml ├── Dockerfile ├── LICENSE ├── README.md ├── additional_scripts ├── annotate_gtf.py ├── blast_parser.py ├── dexseq_count.py ├── dexseq_prepare_annotation_fixed.py ├── download_proteome_uniprot.pl ├── homolog_parser.py └── transcriptome_metrics.sh ├── bash_scripts ├── add_ncbi_annotation.sh ├── annotate_my_genomes.sh ├── genome_download.sh ├── genome_download_macOSX.sh ├── get_transcripts.sh └── isoform_identification.sh ├── data_examples ├── braker_chr33.gtf ├── gene_counts ├── gene_counts_GSE114129 └── transcripts.gtf.gz ├── environment.yml ├── makefile.nf ├── makefile.sh ├── nextflow_scripts ├── 22.04_environment.yml ├── add-ncbi-annotation.nf ├── annotate-my-genomes.nf ├── environment.yml ├── genome-download.nf ├── isoform-identification.nf └── old │ ├── add-ncbi-annotation.nf │ ├── annotate-my-genomes.nf │ └── isoform-identification.nf └── test ├── gawn_config.sh └── stringtie_chr33.gtf /22.04_environment.yml: -------------------------------------------------------------------------------- 1 | name: annotate_my_genomes 2 | channels: 3 | - anaconda 4 | - conda-forge 5 | - bioconda 6 | - defaults 7 | - r 8 | dependencies: 9 | - _libgcc_mutex=0.1 10 | - _openmp_mutex=4.5 11 | - _r-mutex=1.0.1 12 | - argtable2=2.13 13 | - atk-1.0=2.36.0 14 | - bedtools=2.30.0 15 | - binutils_impl_linux-64=2.39 16 | - binutils_linux-64=2.39 17 | - bottleneck=1.3.5 18 | - bwidget=1.9.14 19 | - bzip2=1.0.8 20 | - c-ares=1.18.1 21 | - ca-certificates=2022.07.19 22 | - cairo=1.16.0 23 | - clustalo=1.2.4 24 | - clustalw=2.1 25 | - coreutils=9.1 26 | - curl=7.83.1 27 | - emboss=6.6.0 28 | - expat=2.5.0 29 | - fasta_ushuffle=0.2 30 | - feelnc=0.2 31 | - font-ttf-dejavu-sans-mono=2.37 32 | - font-ttf-inconsolata=3.000 33 | - font-ttf-source-code-pro=2.038 34 | - font-ttf-ubuntu=0.83 35 | - fontconfig=2.14.0 36 | - fonts-conda-ecosystem=1 37 | - fonts-conda-forge=1 38 | - freetype=2.10.4 39 | - fribidi=1.0.10 40 | - gawk=5.1.0 41 | - gcc_impl_linux-64=10.4.0 42 | - gcc_linux-64=10.4.0 43 | - gdk-pixbuf=2.42.8 44 | - gettext=0.21.1 45 | - gffcompare=0.11.2 46 | - gffread=0.12.7 47 | - gfortran_impl_linux-64=10.4.0 48 | - gfortran_linux-64=10.4.0 49 | - giflib=5.2.1 50 | - gmap=2021.08.25 51 | - graphite2=1.3.13 52 | - graphviz=2.50.0 53 | - gsl=2.7 54 | - gtk2=2.24.33 55 | - gts=0.7.6 56 | - gxx_impl_linux-64=10.4.0 57 | - gxx_linux-64=10.4.0 58 | - harfbuzz=4.4.1 59 | - htslib=1.14 60 | - icu=70.1 61 | - jpeg=9e 62 | - k8=0.2.5 63 | - kernel-headers_linux-64=2.6.32 64 | - keyutils=1.6.1 65 | - kmerinshort=1.0.1 66 | - krb5=1.19.3 67 | - ld_impl_linux-64=2.39 68 | - lerc=3.0 69 | - libblas=3.9.0 70 | - libcblas=3.9.0 71 | - libcurl=7.83.1 72 | - libdb=6.2.32 73 | - libdeflate=1.10 74 | - libedit=3.1.20191231 75 | - libev=4.33 76 | - libffi=3.4.2 77 | - libgcc=7.2.0 78 | - libgcc-devel_linux-64=10.4.0 79 | - libgcc-ng=12.2.0 80 | - libgd=2.3.3 81 | - libgfortran-ng=12.2.0 82 | - libgfortran5=12.2.0 83 | - libglib=2.70.2 84 | - libgomp=12.2.0 85 | - libiconv=1.17 86 | - liblapack=3.9.0 87 | - libnghttp2=1.47.0 88 | - libnsl=2.0.0 89 | - libopenblas=0.3.21 90 | - libpng=1.6.37 91 | - librsvg=2.54.4 92 | - libsanitizer=10.4.0 93 | - libssh2=1.10.0 94 | - libstdcxx-devel_linux-64=10.4.0 95 | - libstdcxx-ng=12.2.0 96 | - libtiff=4.3.0 97 | - libtool=2.4.6 98 | - libuuid=2.32.1 99 | - libwebp=1.2.2 100 | - libwebp-base=1.2.2 101 | - libxcb=1.13 102 | - libxml2=2.9.14 103 | - libxslt=1.1.33 104 | - libzlib=1.2.11 105 | - lz4-c=1.9.3 106 | - make=4.3 107 | - minimap2=2.24 108 | - ncurses=6.3 109 | - 
nomkl=1.0 110 | - numexpr=2.8.3 111 | - numpy=1.21.6 112 | - openssl=1.1.1q 113 | - packaging=21.3 114 | - paml=4.9 115 | - pandas=1.3.5 116 | - pango=1.50.8 117 | - parallel=20220922 118 | - pcre=8.45 119 | - pcre2=10.37 120 | - perl=5.26.2 121 | - perl-aceperl=1.92 122 | - perl-algorithm-diff=1.1903 123 | - perl-algorithm-munkres=0.08 124 | - perl-apache-test=1.40 125 | - perl-app-cpanminus=1.7044 126 | - perl-appconfig=1.71 127 | - perl-array-compare=3.0.1 128 | - perl-autoloader=5.74 129 | - perl-base=2.23 130 | - perl-bio-asn1-entrezgene=1.73 131 | - perl-bio-coordinate=1.007001 132 | - perl-bio-featureio=1.6.905 133 | - perl-bio-phylo=0.58 134 | - perl-bio-samtools=1.43 135 | - perl-bio-tools-phylo-paml=1.7.3 136 | - perl-bio-tools-run-alignment-clustalw=1.7.4 137 | - perl-bio-tools-run-alignment-tcoffee=1.7.4 138 | - perl-bioperl=1.7.2 139 | - perl-bioperl-core=1.007002 140 | - perl-bioperl-run=1.007002 141 | - perl-business-isbn=3.004 142 | - perl-business-isbn-data=20140910.003 143 | - perl-cache-cache=1.08 144 | - perl-capture-tiny=0.48 145 | - perl-carp=1.38 146 | - perl-cgi=4.44 147 | - perl-class-data-inheritable=0.08 148 | - perl-class-inspector=1.34 149 | - perl-class-load=0.25 150 | - perl-class-load-xs=0.10 151 | - perl-class-method-modifiers=2.12 152 | - perl-clone=0.42 153 | - perl-common-sense=3.74 154 | - perl-compress-raw-zlib=2.087 155 | - perl-constant=1.33 156 | - perl-convert-binary-c=0.78 157 | - perl-convert-binhex=1.125 158 | - perl-crypt-rc4=2.02 159 | - perl-data-dumper=2.173 160 | - perl-data-optlist=0.110 161 | - perl-data-stag=0.14 162 | - perl-date-format=2.30 163 | - perl-db-file=1.855 164 | - perl-dbd-sqlite=1.64 165 | - perl-dbi=1.642 166 | - perl-devel-globaldestruction=0.14 167 | - perl-devel-overloadinfo=0.005 168 | - perl-devel-stacktrace=2.04 169 | - perl-digest-hmac=1.03 170 | - perl-digest-md5=2.55 171 | - perl-digest-perl-md5=1.9 172 | - perl-digest-sha1=2.13 173 | - perl-dist-checkconflicts=0.11 174 | - perl-dynaloader=1.25 175 | - perl-email-date-format=1.005 176 | - perl-encode=2.88 177 | - perl-encode-locale=1.05 178 | - perl-error=0.17027 179 | - perl-eval-closure=0.14 180 | - perl-exception-class=1.44 181 | - perl-exporter=5.72 182 | - perl-exporter-tiny=1.002001 183 | - perl-extutils-makemaker=7.36 184 | - perl-file-listing=6.04 185 | - perl-file-path=2.16 186 | - perl-file-slurp-tiny=0.004 187 | - perl-file-sort=1.01 188 | - perl-file-temp=0.2304 189 | - perl-file-which=1.23 190 | - perl-font-afm=1.20 191 | - perl-font-ttf=1.06 192 | - perl-gd=2.68 193 | - perl-getopt-long=2.50 194 | - perl-graph=0.9704 195 | - perl-graphviz=2.24 196 | - perl-html-element-extended=1.18 197 | - perl-html-entities-numbered=0.04 198 | - perl-html-formatter=2.16 199 | - perl-html-parser=3.72 200 | - perl-html-tableextract=2.13 201 | - perl-html-tagset=3.20 202 | - perl-html-tidy=1.60 203 | - perl-html-tree=5.07 204 | - perl-html-treebuilder-xpath=0.14 205 | - perl-http-cookies=6.04 206 | - perl-http-daemon=6.01 207 | - perl-http-date=6.02 208 | - perl-http-message=6.18 209 | - perl-http-negotiate=6.01 210 | - perl-image-info=1.38 211 | - perl-image-size=3.300 212 | - perl-io-html=1.001 213 | - perl-io-sessiondata=1.03 214 | - perl-io-socket-ssl=2.066 215 | - perl-io-string=1.08 216 | - perl-io-stringy=2.111 217 | - perl-io-tty=1.12 218 | - perl-ipc-run=20180523.0 219 | - perl-ipc-sharelite=0.17 220 | - perl-jcode=2.07 221 | - perl-json=4.02 222 | - perl-json-xs=2.34 223 | - perl-lib=0.63 224 | - perl-libwww-perl=6.39 225 | - perl-libxml-perl=0.08 226 | - 
perl-list-moreutils=0.428 227 | - perl-list-moreutils-xs=0.428 228 | - perl-local-lib=2.000024 229 | - perl-lwp-mediatypes=6.04 230 | - perl-lwp-protocol-https=6.07 231 | - perl-lwp-simple=6.15 232 | - perl-mailtools=2.21 233 | - perl-math-cdf=0.1 234 | - perl-math-derivative=1.01 235 | - perl-math-random=0.72 236 | - perl-math-spline=0.02 237 | - perl-mime-base64=3.15 238 | - perl-mime-lite=3.030 239 | - perl-mime-tools=5.508 240 | - perl-mime-types=2.17 241 | - perl-mldbm=2.05 242 | - perl-module-build=0.4224 243 | - perl-module-implementation=0.09 244 | - perl-module-runtime=0.016 245 | - perl-module-runtime-conflicts=0.003 246 | - perl-moo=2.003004 247 | - perl-moose=2.2011 248 | - perl-mozilla-ca=20180117 249 | - perl-mro-compat=0.13 250 | - perl-net-http=6.19 251 | - perl-net-ssleay=1.88 252 | - perl-ntlm=1.09 253 | - perl-ole-storage_lite=0.19 254 | - perl-package-deprecationmanager=0.17 255 | - perl-package-stash=0.38 256 | - perl-package-stash-xs=0.28 257 | - perl-parallel-forkmanager=2.02 258 | - perl-params-util=1.07 259 | - perl-parent=0.236 260 | - perl-parse-recdescent=1.967015 261 | - perl-pathtools=3.75 262 | - perl-pdf-api2=2.035 263 | - perl-pod-escapes=1.07 264 | - perl-pod-usage=1.69 265 | - perl-postscript=0.06 266 | - perl-role-tiny=2.000008 267 | - perl-scalar-list-utils=1.52 268 | - perl-set-scalar=1.29 269 | - perl-soap-lite=1.19 270 | - perl-socket=2.027 271 | - perl-sort-naturally=1.03 272 | - perl-spreadsheet-parseexcel=0.65 273 | - perl-spreadsheet-writeexcel=2.40 274 | - perl-statistics-descriptive=3.0702 275 | - perl-storable=3.15 276 | - perl-sub-exporter=0.987 277 | - perl-sub-exporter-progressive=0.001013 278 | - perl-sub-identify=0.14 279 | - perl-sub-install=0.928 280 | - perl-sub-name=0.21 281 | - perl-sub-quote=2.006003 282 | - perl-sub-uplevel=0.2800 283 | - perl-svg=2.84 284 | - perl-svg-graph=0.02 285 | - perl-task-weaken=1.06 286 | - perl-template-toolkit=2.26 287 | - perl-test=1.26 288 | - perl-test-deep=1.128 289 | - perl-test-differences=0.67 290 | - perl-test-exception=0.43 291 | - perl-test-harness=3.42 292 | - perl-test-leaktrace=0.16 293 | - perl-test-most=0.35 294 | - perl-test-requiresinternet=0.05 295 | - perl-test-warn=0.36 296 | - perl-text-diff=1.45 297 | - perl-tie-ixhash=1.23 298 | - perl-time-hires=1.9760 299 | - perl-time-local=1.28 300 | - perl-timedate=2.30 301 | - perl-tree-dag_node=1.31 302 | - perl-try-tiny=0.30 303 | - perl-type-tiny=1.004004 304 | - perl-types-serialiser=1.0 305 | - perl-unicode-map=0.112 306 | - perl-uri=1.76 307 | - perl-www-robotrules=6.02 308 | - perl-xml-dom=1.46 309 | - perl-xml-dom-xpath=0.14 310 | - perl-xml-filter-buffertext=1.01 311 | - perl-xml-libxml=2.0132 312 | - perl-xml-libxslt=1.94 313 | - perl-xml-namespacesupport=1.12 314 | - perl-xml-parser=2.44_01 315 | - perl-xml-regexp=0.04 316 | - perl-xml-sax=1.02 317 | - perl-xml-sax-base=1.09 318 | - perl-xml-sax-expat=0.51 319 | - perl-xml-sax-writer=0.57 320 | - perl-xml-simple=2.25 321 | - perl-xml-twig=3.52 322 | - perl-xml-writer=0.625 323 | - perl-xml-xpath=1.44 324 | - perl-xml-xpathengine=0.14 325 | - perl-xsloader=0.24 326 | - perl-yaml=1.29 327 | - pip=22.3.1 328 | - pixman=0.40.0 329 | - pthread-stubs=0.4 330 | - pyparsing=3.0.4 331 | - python=3.7.12 332 | - python-dateutil=2.8.2 333 | - python_abi=3.7 334 | - pytz=2022.1 335 | - r-base=4.1.3 336 | - r-bitops=1.0_7 337 | - r-catools=1.18.2 338 | - r-gplots=3.1.3 339 | - r-gtools=3.9.3 340 | - r-kernsmooth=2.23_20 341 | - r-randomforest=4.7_1.1 342 | - r-rocr=1.0_11 343 | - readline=8.1.2 
344 | - sed=4.8 345 | - seqkit=2.3.1 346 | - setuptools=65.5.1 347 | - six=1.16.0 348 | - sqlite=3.38.5 349 | - stringtie=2.2.1 350 | - sysroot_linux-64=2.12 351 | - t_coffee=11.0.8 352 | - tidyp=1.04 353 | - tk=8.6.12 354 | - tktable=2.10 355 | - transdecoder=5.5.0 356 | - wheel=0.38.4 357 | - xorg-kbproto=1.0.7 358 | - xorg-libice=1.0.10 359 | - xorg-libsm=1.2.3 360 | - xorg-libx11=1.7.2 361 | - xorg-libxau=1.0.9 362 | - xorg-libxdmcp=1.1.3 363 | - xorg-libxext=1.3.4 364 | - xorg-libxrender=0.9.10 365 | - xorg-libxt=1.2.1 366 | - xorg-renderproto=0.11.1 367 | - xorg-xextproto=7.3.0 368 | - xorg-xproto=7.0.31 369 | - xz=5.2.6 370 | - zlib=1.2.11 371 | - zstd=1.5.2 372 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | ARG UBUNTU_VER=20.04 2 | ARG CONDA_VER=latest 3 | ARG OS_TYPE=x86_64 4 | ARG PY_VER=3.8.11 5 | ARG TF_VER=2.5.0 6 | 7 | FROM ubuntu:${UBUNTU_VER} 8 | 9 | # System packages 10 | ARG DEBIAN_FRONTEND=noninteractive 11 | ENV TZ=Etc 12 | RUN apt-get update && apt-get install -yq build-essential g++ python-dev autotools-dev libicu-dev libbz2-dev libboost-all-dev zlib1g-dev curl wget unzip sed jq vim nano libidn11 libnet-perl perl-doc liblmdb-dev && apt-get install -y git && apt install -y make && apt install -y autoconf 13 | RUN apt install -y parallel 14 | 15 | 16 | # Install make 17 | RUN apt update && apt install -y make && apt install -y autoconf 18 | 19 | # cmake 20 | RUN wget https://github.com/Kitware/CMake/releases/download/v3.17.3/cmake-3.17.3.tar.gz --no-check-certificate && tar -zxvf cmake-3.17.3.tar.gz && cd cmake-3.17.3 && apt-get install libssl-dev && ./bootstrap && make && make install && cd / 21 | 22 | # R and dependences 23 | RUN apt install -y dirmngr gnupg apt-transport-https ca-certificates software-properties-common && apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 && add-apt-repository 'deb https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/' && apt install -y r-base 24 | RUN R -e "install.packages('ROCR',dependencies=TRUE, repos='http://cran.rstudio.com/')" 25 | RUN R -e "install.packages('randomForest',dependencies=TRUE, repos='http://cran.rstudio.com/')" 26 | 27 | # stringtie 28 | RUN git clone https://github.com/gpertea/stringtie && cd stringtie && make release && make test && ./run_tests.sh && cp stringtie /usr/local/bin/ && cp stringtie /usr/bin/ && cd SuperReads_RNA && ./install.sh && cd / 29 | 30 | # FEELnc 31 | RUN apt-get install -y libcurl4 libcurl4-openssl-dev && apt-get install -y libxml-dom-xpath-perl && apt-get install -y cpanminus 32 | RUN cpanm Parallel::ForkManager Bio::DB::SeqFeature 33 | 34 | # Installing KmerInShort 35 | RUN git clone --recursive https://github.com/rizkg/KmerInShort && cd KmerInShort && mkdir build; cd build; cmake ..; make -j 8 && cp KmerInShort /usr/local/bin/ && cp KmerInShort /usr/bin/ && cd / 36 | 37 | # Installing fasta_ushuffle 38 | RUN wget -O fasta_ushuffle.zip https://github.com/agordon/fasta_ushuffle/archive/refs/heads/master.zip --no-check-certificate && unzip fasta_ushuffle.zip && cd fasta_ushuffle-master/ && make && cp fasta_ushuffle ushuffle /usr/local/bin/ && cp fasta_ushuffle ushuffle /usr/bin/ && cd / 39 | 40 | # Installing FEELnc 41 | RUN git clone https://github.com/tderrien/FEELnc.git && cd /FEELnc && export FEELNCPATH=$(pwd) && export PERL5LIB=$PERL5LIB:${FEELNCPATH}/lib/ && export PATH=$PATH:${FEELNCPATH}/scripts/ 
&& export PATH=$PATH:${FEELNCPATH}/utils/ && export PATH=$PATH:${FEELNCPATH}/bin/LINUX/ && cp -r ${FEELNCPATH}/bin/LINUX/ ~/bin/ 42 | ENV PATH=/FEELnc/bin/LINUX:${PATH} 43 | ENV FEELNCPATH=/FEELnc 44 | ENV PERL5LIB=:/FEELnc/lib/ 45 | ENV PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/FEELnc/bin/LINUX:/FEELnc/utils:/FEELnc/scripts/ 46 | 47 | # FEELnc Test 48 | RUN cd /FEELnc/test/ && FEELnc_filter.pl -i transcript_chr38.gtf -a annotation_chr38.gtf -b transcript_biotype=protein_coding > candidate_lncRNA.gtf && FEELnc_codpot.pl -i candidate_lncRNA.gtf -a annotation_chr38.gtf -b transcript_biotype=protein_coding -g genome_chr38.fa --mode=shuffle && FEELnc_classifier.pl -i feelnc_codpot_out/candidate_lncRNA.gtf.lncRNA.gtf -a annotation_chr38.gtf > candidate_lncRNA_classes.txt && cd / 49 | 50 | # Installing gffcompare and gclib 51 | RUN git clone https://github.com/gpertea/gclib && git clone https://github.com/gpertea/gffcompare && git clone https://github.com/gpertea/gffread 52 | RUN cd /gffcompare && make release && cp gffcompare trmap /usr/local/bin/ && cp gffcompare trmap /usr/bin/ && cd / 53 | RUN cd /gffread && make release && cp gffread /usr/local/bin/ && cd / 54 | 55 | # Installing ncbi-blast+ 56 | RUN apt-get remove -y ncbi-blast+ 57 | RUN apt-get install -y ncbi-blast+ 58 | 59 | # gmap 60 | RUN apt-get install -y gmap 61 | 62 | # bedtools 63 | RUN apt-get install -y bedtools 64 | 65 | # samtools 66 | RUN apt-get install -y samtools && apt-get install -y bcftools 67 | 68 | # transdecoder (TransDecoder.LongOrfs 5.5.0) 69 | RUN wget https://github.com/TransDecoder/TransDecoder/archive/refs/tags/TransDecoder-v5.5.0.zip --no-check-certificate && unzip TransDecoder-v5.5.0.zip && mv TransDecoder-TransDecoder-v5.5.0 TransDecoder-v5.5.0 && apt-get install -y hmmer 70 | RUN cd /TransDecoder-v5.5.0 && ln -s /TransDecoder-v5.5.0/TransDecoder.LongOrfs /usr/local/bin/ && ln -s /TransDecoder-v5.5.0/TransDecoder.Predict /usr/local/bin/ && ln -s /TransDecoder-v5.5.0/TransDecoder.LongOrfs /usr/bin/ && ln -s /TransDecoder-v5.5.0/TransDecoder.Predict /usr/bin/ 71 | 72 | # seqkit 73 | RUN wget https://github.com/shenwei356/seqkit/releases/download/v0.12.1/seqkit_linux_386.tar.gz --no-check-certificate && gunzip seqkit_linux_386.tar.gz && tar -xvf seqkit_linux_386.tar && cp seqkit /usr/local/bin/ && cp seqkit /usr/bin/ && cd / 74 | 75 | # emboss 76 | RUN apt-get install -y emboss 77 | 78 | # Clustalo 79 | RUN apt-get install -y clustalo 80 | 81 | # Cufflinks 82 | RUN apt-get install -y cufflinks 83 | 84 | # gawk 85 | RUN apt install -y gawk 86 | 87 | # minimap2 88 | RUN apt-get -y install minimap2 89 | 90 | # pandas 91 | RUN apt-get -y install python3-pip 92 | RUN pip install pandas 93 | RUN pip install numpy 94 | 95 | # annotate_my_genomes 96 | RUN git clone https://github.com/cfarkas/annotate_my_genomes.git && cd annotate_my_genomes && chmod 755 ./makefile.sh && ./makefile.sh 97 | RUN ln -s /annotate_my_genomes/bash_scripts/ /usr/local/bin/add_ncbi_annotation.sh 98 | RUN ln -s /annotate_my_genomes/bash_scripts/ /usr/local/bin/annotate_my_genomes.sh 99 | RUN ln -s /annotate_my_genomes/bash_scripts/ /usr/local/bin/genome_download.sh 100 | RUN ln -s /annotate_my_genomes/bash_scripts/ /usr/local/bin/get_transcripts.sh 101 | RUN ln -s /annotate_my_genomes/bash_scripts/ /usr/bin/add_ncbi_annotation.sh 102 | RUN ln -s /annotate_my_genomes/bash_scripts/ /usr/bin/annotate_my_genomes.sh 103 | RUN ln -s /annotate_my_genomes/bash_scripts/ /usr/bin/genome_download.sh 104 | RUN ln -s 
/annotate_my_genomes/bash_scripts/ /usr/bin/get_transcripts.sh 105 | ENV PATH=/annotate_my_genomes/bin:${PATH} 106 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 cfarkas 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # annotate_my_genomes 2 | 3 | Transcriptome annotation pipeline using short and long sequencing reads from non-model (and model) animal organisms. 4 | 5 | ![image](https://user-images.githubusercontent.com/7016350/108611599-a6319f00-73a5-11eb-89b7-3cfd44b00cc5.png) 6 | 7 | #### See publication here: https://doi.org/10.1093/gigascience/giac099 8 | 9 | ## I) Pipeline Outline 10 | ```annotate_my_genomes``` is a pipeline that aims to annotate genome-guided transcriptome assemblies from StringTie, built from long-read RNA-Seq alignments to vertebrate genomes (e.g. PacBio technology). Transcripts are classified by their coding potential and probable gene function, and are identified as novel or reconciled with the current reference annotation from RefSeq/NCBI, without losing isoform and exon information. Also, known/novel coding sequences (nucleotides) and the corresponding proteins are resolved. 11 | 12 | This pipeline requires: 13 | 14 | 1) StringTie assembled transcripts (in GTF format). Check here: https://github.com/cfarkas/annotate_my_genomes/wiki#ii-obtaining-stringtie-gtf-file-for-annotation 15 | 16 | 2) At minimum, coding UCSC/NCBI reference genome annotations (in GTF format) and a genome assembly (non-masked fasta from UCSC). All these requirements can be downloaded at once by using the ```genome-download``` program provided in this repository and inputting a genome prefix as follows: 17 | ``` 18 | ./genome-download [genome] # mm10 for mouse, hg38 for human, galGal6 for chicken, etc. Use genome-download-macOSX instead on macOSX 19 | ``` 20 | - For example, ```./genome-download mm10``` will output: ```mm10.fa```, ```mm10.gtf``` and ```mm10_ncbiRefSeq.gtf``` files. 21 | - ```mm10.gtf``` contains coding genes and ```mm10_ncbiRefSeq.gtf``` contains all NCBI annotations.
22 | 23 | - For genomes, check UCSC genome prefixes here: http://hgdownload.soe.ucsc.edu/downloads.html 24 | 25 | 3) Finally, the basic pipeline can be run using a mouse transcriptome as an example (stringtie.gtf) and 20 threads, as follows: 26 | ``` 27 | mkdir output1 28 | ./annotate-my-genomes -a /path/to/stringtie.gtf -r /path/to/mm10.gtf -g /path/to/mm10.fa -c /path/to/annotate_my_genomes/gawn_config.sh -t 20 -o /path/to/output1 29 | ``` 30 | The latter will output, inside the output1 folder: 31 | ``` 32 | - final_annotated.gtf: a GTF file annotated in the "gene_id" field, containing novel genes and lncRNA classification (second field in the GTF file). 33 | - transcripts.fa: transcripts associated with final_annotated.gtf 34 | - cds.fa: coding sequences associated with final_annotated.gtf 35 | - prot.fa: protein sequences associated with final_annotated.gtf 36 | - coding_transcripts.gtf: GTF file containing cds sequences. 37 | - novel coding sequences (novel-cds.fa) and corresponding novel protein sequences (novel-prot.fa). 38 | ``` 39 | * Users can also employ ```mm10_ncbiRefSeq.gtf``` by using the ```add-ncbi-annotation``` binary instead of ```annotate-my-genomes```. See an example here: https://github.com/cfarkas/annotate_my_genomes/blob/master/README.md#v-adding-ncbi-annotations-to-increase-annotation-of-transcripts 40 | 41 | ## II) Installation: 42 | 43 | ### Option 1: Via Nextflow (recommended) 44 | 45 | - Nextflow (https://www.nextflow.io/) is a great workflow framework and a programming DSL that eases the writing of data-intensive computational pipelines. We encourage and support the usage of this framework across different platforms for reproducibility. 46 | 47 | ### Requirements: 48 | 49 | - Nextflow can be installed as depicted here (https://www.nextflow.io/) or via anaconda as follows: 50 | 51 | ``` 52 | conda install -c bioconda nextflow 53 | ``` 54 | Also install (not through conda): 55 | 56 | - ```wget```. Comes by default with Linux/Ubuntu distros 57 | - ```sed``` editor. Comes by default with Linux/Ubuntu distros 58 | - ```ncbi-blast+``` version equal to or higher than v2.7.1. To install it, see here: https://github.com/cfarkas/annotate_my_genomes/wiki#5-installing-up-to-date-ncbi-blast-version-v271 59 | - ```SAMtools```. To install it, see here: https://github.com/cfarkas/annotate_my_genomes/wiki#9-obtaining-and-installing-up-to-date-samtools-with-htslib-version--19 60 | 61 | 62 | ### Installation: 63 | 64 | In a given directory: 65 | ``` 66 | git clone https://github.com/cfarkas/annotate_my_genomes.git # clone repository 67 | cd annotate_my_genomes # enter repository 68 | current_dir=$(pwd) # set working directory 69 | echo $current_dir # check working directory 70 | nextflow run makefile.nf --workdir $current_dir --conda ./22.04_environment.yml # make & install; use environment.yml for Ubuntu < 22.04 71 | ``` 72 | 73 | ### Option 2: Installing dependencies via anaconda (tested in Ubuntu 16.04, 18.04, 20.04 and 22.04 LTS) 74 | 75 | ### Requirements: 76 | - Requires miniconda, python2.7 and/or python>=3. To install miniconda, see: https://docs.conda.io/en/latest/miniconda.html 77 | 78 | Also install (not through conda): 79 | 80 | - ```wget```. Comes by default with Linux/Ubuntu distros 81 | - ```sed``` editor. Comes by default with Linux/Ubuntu distros 82 | - ```ncbi-blast+``` version equal to or higher than v2.7.1. To install it, see here: https://github.com/cfarkas/annotate_my_genomes/wiki#5-installing-up-to-date-ncbi-blast-version-v271 83 | - ```SAMtools```. 
To install it, see here: https://github.com/cfarkas/annotate_my_genomes/wiki#9-obtaining-and-installing-up-to-date-samtools-with-htslib-version--19 84 | 85 | ### Installation: 86 | 87 | In a given directory: 88 | ``` 89 | git clone https://github.com/cfarkas/annotate_my_genomes.git # clone repository 90 | cd annotate_my_genomes # enter repository 91 | conda config --add channels bioconda # add bioconda channel (if you haven't already done so) 92 | conda config --add channels conda-forge # add conda-forge channel (if you haven't already done so) 93 | conda env create -f 22.04_environment.yml # create and install environment; use environment.yml for Ubuntu < 22.04 94 | conda activate annotate_my_genomes # activate environment 95 | bash makefile.sh # make & install 96 | ``` 97 | - Copy binaries to ```/usr/local/bin``` 98 | ``` 99 | sudo cp ./bin/* /usr/local/bin/ 100 | ``` 101 | 102 | After these steps, a conda environment called ```annotate_my_genomes``` can be managed as follows: 103 | ``` 104 | # To activate this environment, use 105 | # 106 | # $ conda activate annotate_my_genomes 107 | # 108 | # To deactivate an active environment, use 109 | # 110 | # $ conda deactivate 111 | ``` 112 | 113 | #### Notes: 114 | 115 | - By activating the annotate_my_genomes environment, all binaries in the annotate_my_genomes repository can be executed. 116 | - To install optional programs for downstream analysis, please see here: https://github.com/cfarkas/annotate_my_genomes/wiki#optional-dependences-to-run-all-the-downstream-analysis 117 | 118 | - Uninstall the environment as follows: 119 | ``` 120 | conda remove --name annotate_my_genomes --all 121 | ``` 122 | 123 | - Inside the repository, there is a file called ```gawn_config.sh```. Optionally, edit it to increase/decrease the number of CPUs for blast processing: 124 | ``` 125 | NCPUS=10 126 | ``` 127 | Set this to a value according to the computational capacity of your machine. 128 | 129 | ### Option 3: Run through docker: 130 | - See installation and pipeline run here: https://hub.docker.com/r/carlosfarkas/annotate_my_genomes 131 | ``` 132 | # Run docker without sudo privileges as follows: 133 | sudo chmod 666 /var/run/docker.sock 134 | 135 | # Downloading the docker image 136 | docker pull carlosfarkas/annotate_my_genomes:latest 137 | 138 | # Downloading repository 139 | git clone https://github.com/cfarkas/annotate_my_genomes.git && cd annotate_my_genomes 140 | 141 | # make & install using workdir 142 | chmod 755 makefile.sh 143 | docker run --volume $HOME:$HOME --workdir $(pwd) carlosfarkas/annotate_my_genomes ./makefile.sh # make & install 144 | 145 | OR 146 | 147 | # make & install using -it (interactively) 148 | docker run -v $(pwd):/annotate_my_genomes -it carlosfarkas/annotate_my_genomes:latest 149 | cd annotate_my_genomes/ 150 | bash makefile.sh 151 | ``` 152 | ### Option 4: Without using conda, program by program: 153 | 154 | - See detailed installation steps in our wiki here: https://github.com/cfarkas/annotate_my_genomes/wiki 155 | 156 | ## III) Running the whole pipeline via nextflow (recommended) 157 | 158 | - Inside the ```annotate_my_genomes``` folder, enter the ```nextflow_scripts``` subdirectory and run the full pipeline using ```--flags``` parameters. 159 | - NOTE 1: Users **must provide full paths to inputs in the command line**. 
We recommend splitting the flags with backslashes and running the pipeline exactly as follows: 160 | - NOTE 2: Use environment.yml for Ubuntu < 22.04 161 | ``` 162 | cd nextflow_scripts/ 163 | ``` 164 | 2.1) Run ```genome-download.nf``` (e.g. output the galGal6 genome) 165 | ``` 166 | nextflow run genome-download.nf \ 167 | --genome galGal6 \ 168 | --conda /path/to/22.04_environment.yml --outdir /path/to/output_folder/ 169 | ``` 170 | 2.2) Run ```annotate-my-genomes.nf```. Details here: https://github.com/cfarkas/annotate_my_genomes/blob/master/README.md#b-simplest-usage 171 | ``` 172 | nextflow run annotate-my-genomes.nf \ 173 | --stringtie /path/to/stringtie.gtf \ 174 | --ref_annotation /path/to/galGal6.gtf \ 175 | --genome /path/to/galGal6.fa \ 176 | --config /path/to/annotate_my_genomes/gawn_config.sh \ 177 | --threads 20 \ 178 | --conda /path/to/22.04_environment.yml --outdir /path/to/output_folder/ 179 | ``` 180 | 2.3) Run ```add-ncbi-annotation.nf```. Details here: https://github.com/cfarkas/annotate_my_genomes/blob/master/README.md#c-adding-ncbi-annotations-to-increase-annotation-of-transcripts 181 | ``` 182 | nextflow run add-ncbi-annotation.nf \ 183 | --stringtie /path/to/stringtie.gtf \ 184 | --NCBI_annotation /path/to/galGal6_ncbiRefSeq.gtf \ 185 | --ref_annotation /path/to/galGal6.gtf \ 186 | --genome /path/to/galGal6.fa \ 187 | --config /path/to/annotate_my_genomes/gawn_config.sh \ 188 | --threads 20 \ 189 | --conda /path/to/22.04_environment.yml --outdir /path/to/output_folder/ 190 | ``` 191 | 2.4) Run ```isoform-identification.nf```. Details here: https://github.com/cfarkas/annotate_my_genomes/blob/master/README.md#d-post-processing-add-ncbi-annotation-outputs 192 | ``` 193 | nextflow run isoform-identification.nf \ 194 | --NCBI_tmap /path/to/gffcompare.tmap \ 195 | --NCBI_transcripts /path/to/NCBI_transcripts.fa \ 196 | --genome_name galGal6 \ 197 | --conda /path/to/22.04_environment.yml --outdir /path/to/output_folder/ 198 | ``` 199 | 200 | #### Notes: 201 | 202 | - Users must provide full paths to files when running nextflow scripts. 203 | 204 | - Inside the repository, there is a file called gawn_config.sh. Optionally, edit it to increase/decrease the number of CPUs for blast processing: 205 | ``` 206 | NCPUS=10 207 | ``` 208 | Set this to a value according to the computational capacity of your machine. 209 | 210 | 211 | ## IV) Running the whole pipeline via anaconda + binaries: 212 | 213 | ### A) Quickstart (Running the test) 214 | 215 | - Inside the ```test``` folder, run the pipeline with a provided set of transcripts from chromosome 33, Gallus gallus genome version "6", in GTF format. 216 | - Users need to specify the stringtie output (GTF format), the UCSC reference genome (GTF annotation and fasta file), the gawn_config.sh file (check NCPUS for blast, default = 10), the number of threads for text processing (20 for this example) and the output folder. 
217 | 218 | Go to the ```annotate_my_genomes/test``` directory and execute the following: 219 | 220 | ``` 221 | # Download Gallus gallus v6 fasta assembly (non masked) with matched GTF files (UCSC/Ensembl) 222 | ./genome-download galGal6 223 | 224 | # Execute pipeline on stringtie_chr33.gtf (provided file) with 20 threads: 225 | mkdir output1 226 | ./annotate-my-genomes -a stringtie_chr33.gtf -r galGal6.gtf -g galGal6.fa -c gawn_config.sh -t 20 -o output1 227 | 228 | # Include NCBI annotations on stringtie_chr33.gtf (provided file) with 20 threads: 229 | mkdir output2 230 | ./add-ncbi-annotation -a stringtie_chr33.gtf -n galGal6_ncbiRefSeq.gtf -r galGal6.gtf -g galGal6.fa -c gawn_config.sh -t 20 -o output2 231 | ``` 232 | 233 | ### B) Simplest usage 234 | (Optional) Edit the NCPUS value in the ```gawn_config.sh``` file inside the repository. Default is 10. 235 | - For example, to annotate a chicken GTF file (e.g. "target.gtf") using 20 threads for CPU processing: 236 | 237 | ``` 238 | mkdir output1 239 | ./genome-download galGal6 240 | ./annotate-my-genomes -a /path/to/target.gtf -r /path/to/galGal6.gtf -g /path/to/galGal6.fa -c /path/to/gawn_config.sh -t 20 -o /path/to/output1 241 | ``` 242 | - ```final_annotated.gtf``` (located in output1/) will contain the merged NCBI-updated annotation (in UCSC coordinates) 243 | - To produce the ```target.gtf``` assembly, check the stringtie parameters here: https://github.com/cfarkas/annotate_my_genomes/wiki#ii-obtaining-stringtie-gtf-file-for-annotation 244 | 245 | ### C) Adding NCBI annotations to increase annotation of transcripts 246 | Users can add annotations from NCBI by using the three outputs from the ./genome-download program as inputs to ./add-ncbi-annotation. 247 | - Resuming the previous example, using add-ncbi-annotation instead of annotate-my-genomes: 248 | ``` 249 | mkdir output2 250 | ./genome-download galGal6 251 | ./add-ncbi-annotation -a /path/to/target.gtf -n /path/to/galGal6_ncbiRefSeq.gtf -r /path/to/galGal6.gtf -g /path/to/galGal6.fa -c /path/to/gawn_config.sh -t 20 -o /path/to/output2 252 | ``` 253 | - ```final_annotated.gtf``` (located in output2/) will contain the merged NCBI-updated annotation (in UCSC coordinates). 254 | 255 | As an example for the mouse genome, change the galGal6 prefix to mm10. Using 30 threads to process the "mouse.gtf" assembly: 256 | ``` 257 | mkdir output3 258 | ./genome-download mm10 259 | ./add-ncbi-annotation -a /path/to/mouse.gtf -n /path/to/mm10_ncbiRefSeq.gtf -r /path/to/mm10.gtf -g /path/to/mm10.fa -c /path/to/gawn_config.sh -t 30 -o /path/to/output3 260 | ``` 261 | ### D) Post-processing add-ncbi-annotation outputs 262 | 263 | If ```stringtie.gtf``` (as an example of input GTF) was annotated with ```add-ncbi-annotation```, users can produce transcript annotation tables (csv format) using two outputs from the add-ncbi-annotation pipeline: 264 | 265 | - gffcompare.tmap (inside the ```output_files``` subdirectory) 266 | - NCBI_transcripts.fa (inside the ```gffcompare_outputs_NCBI``` subdirectory) 267 | 268 | by using the isoform-identification pipeline, as follows: 269 | 270 | ``` 271 | isoform-identification -m /path/to/gffcompare.tmap -t /path/to/NCBI_transcripts.fa -g galGal6 272 | ``` 273 | In this example: 274 | - ```gffcompare.tmap``` corresponds to the transcript map output from gffcompare 275 | - ```NCBI_transcripts.fa``` corresponds to the transcript sequences from ```stringtie.gtf```, in fasta format 276 | - ```galGal6``` corresponds to the NCBI genome name (in this example, Gallus gallus 6 genome, galGal6). 
277 | 278 | The outputs ```Ref_Transcript_Annotation.csv``` and ```Novel_Transcript_Annotation.csv``` files will contain detailed annotation of transcripts. Ref_Transcript_Annotation.csv should look like this: 279 | 280 | ``` 281 | ref_gene_id ref_id class_code qry_gene_id qry_id num_exons FPKM TPM Annotation Status NCBI RefSeq Gene ID Transcript Description NCBI RefSeq Protein ID Alternative Gene Name RefSeq Transcript Info cds_seq 282 | OR14J1L40 XM_025145345.1 x STRG.16902 STRG.16902.1 3 0.089321 0.347251 Model OR14J1L40 olfactory receptor 14J1-like 40 XP_025001113.1 AATTTCATTGGAATTAAATTTATTATACGTATGACAAACTGatatgaagaagaaacagaaacaccacATAAAATCTATCAGGCTTTTCCTAAATTTTCTGTAGTCTTGAGAGCATGATGAACATCTTTCTGATAGTGAAACCGGGTATGTTGGAGTATCTTCCTGAGGGAacccttgagctcctggttcctcatgctgtagatgagggggttcaaAGCTGGAGGCACCACTGTGTATAGAAATGACACCACCAGGTCCagagatggggaggagatggagggaggcttcaggtaggcaaacatggcagtgctgacaaacagggagagcacagccaggtgagggaggcacgtggagaaggttttgtgctgtccctgctcagagggcatcctcagcacggccctgaagatctgcacataggagaagagaatgaaagcaaagcaccCAGATGCTAAAGAGGCACTGACAATAAGAAGCCAAATGTCTTTGAGATAGGAGTGTGAGCaagagagcttgaggatctgggggatttcacagaagaactgatccacagcattgccttggcacagaggcagggaaaatgtattggcagtgtgcagcagggaattaaggacccccgtgccccaggcagctgctgccatggtggcacacgctctgctgcccagcagggtccggtagtgcaggggcttgcagatggcaac 283 | LOC100857209 XM_015272533.2 x STRG.16904 STRG.16904.1 3 0.099526 0.386921 Model LOC100857209 olfactory receptor 14A16-like XP_015128019.2 catctgcagttcctgggcatggagtcctgttcagacTGCAGGAGATAATGATGAGTCGATACCATTCTCAGAGACACTCCTCCTGCAcactttgaaaatgcatttaactCCATAGCAtgagtttattttcatgagcttcAGAATCATGTAAGAAGTAGAAACTTAAGGAGCATTTAGTTTCCTATCATTTCCTAATCATATCCCAGGCTCCTGGattttttcctcataggagCTGTTTCCACATCTCTTTTCTttacccctaaccctaacttcTATGTTCTTCAACTTCTGTTAGAGAAATCTGTTTGATTGGAGGCTAAGTACATTATTCATGACTGCAGAGAATGACAATAAtttcagctggtgctgtcctttgggggaggagaggctgaaagcacatgAGGAGATTGTTCATATAACAGCAGACTGAGAAAGGTACAATTCAGGGTACTCAGAGATGTGTTCATATTTTCTGGCTCCcttcagatttctgcctccaatccttttcccttctcttagggtataaaagaaaaatccctgccctgtctctcctcttgcaaagAGGAGCAAACACCTTTGGAAACACCCTATGGTGCAGCtgtagctgtgatACCCCTGGCTCAGGCAgaagctgtggcagcagaaggccccttCCCTGCCGGGGGGCttcttccccccacacgtctccctgcagcgccctgggcagctccccgggcaggctgagtgctgagcctggcaggcggcagagtccctgccccggcacacagcccctggggcacagcagggaccctgctctgcactacagccctgggcacccggctgcacccaaacagcacagcctgcagccgtcctgggacacgcagccctcagggctgtgctctgatgctgcagcacagaagcccTCATCTGGAACAGTAGTCTTTTTCCATAGCAAGGAAACATGAAGTACTTTCAGCCAGATCTGCTATGGGATATCCCTGATTCAGTGATCCCTCCTGGAAAAACAGCTTCATTGCCTACTGCAAGAGACTTACCCTGTCAAGCGCTGTGAGCAAtgctcctccagtgagctcacatCCTACTCACACTGTACACATCCTGtaatctctttctcttttctcttctatcTTCATGTCACCTGCAGATCATGTCTatagccctgctgtgctgtacagaagagctgctcctgtgcaCAGCTGTCTCTCCGCAGCGCTGCCTGCTTTTatgagctccctgtgtcccaggagcctggcccagctcagcagc 284 | LOC112530844 XM_025145380.1 p STRG.16906 STRG.16906.1 1 0.192245 0.747381 Model LOC112530844 olfactory receptor 14A16-like XP_025001148.1 
aaatcagcgggagacaagtctcatgctttcatgatcaacaagtctcagctttattgAAGCACACGCAGGCATTTATACGATAGTTAATGAGCTACTACATATGCCAAATTGGGTTCTCTTATTGGTTAGTTCTTTACGTGAGAAAGTAACCTTCAACGCTAGATACCGTGACAGTCCCGTGATGAATGCCCGATTGTTTACCGCATACCACTCAATTTTCTTAACTGCAGCATGTTcttatcacttccttgctcctgagtGAGGGCAGCACGACCTTGCCTGGTTTAATGAGCAGGGCCCTATctccttaccagctgcatcccatCATGGCCCCTCTCCCGGAGCCAGTGCTCCGGGTCCCAAAAGCTCTCCACACTTCCCCCGTTTTCTTTTGGTACGAGCCAGGTTGTATGAATCGCATCTTGAACCACCTTTTGCTAGCATTACAGTAAACAAAGCATGATTATCAGCATACCAATCACTATCTATAAGAATACACTAGATTTATgttacacacttctacaaagcattccttgtcagtaaactaacagtaaagactacacagcacaccagtattaactacagtttcaatatcccgatgaataaaataccacagtccCCACTCTGGATCAACCACTGTACCTGACCCCCACAATTAGTGCGCTTCTGAGTCTCATAACCGccaattgctcctggcagttcccagtgtCCAAGAGACCTTtctgatgagatgttttctgcaatCTGCTAAGGGAATACCAGTCGCAGCTCAGGAGTCACGGCACTGTATATGATGTCTTGCACACCATGCGGCTATCGCTCGCCGGAGTCGCCGTTGTTGTCATCGGGTTGAGATGGGTTGTTGATGTTCGGGGCTGGCTTAgtccatttactgggaacccataatgggccagatcctgtggAAACACAGCTCTCTCCTGGaagcctcccatgatgtttacaaaattccTATTGATTCCTAATTCactcaaagtttccacaaacccTTAACACCGTACagtgatattgttcagttataaacacttgggaacagatctcacagaagcttgTCCATGTTCCCTTACACGCTTCCATgcaatcagaacacagtactagATAAACAGGTtgacactcattccctgaaaggaacacatctcactcacaccacactcactctgacatttagaacaaaaaacatAGTTTATACATAACccacaatgctgacgacgtcttttAGCTTGTATCTTAATAACACTAGTGCATTAGTCAATTAGTTGCAATtcctaccccagccggcaatctaacctgtgagctcacgtatctcggggggggggggggaagcaggcacgctccttcataccctgcgtaggacgtctcctcacgccttacgggcacccccttttctatacacatacctgaTACACcaatggatggtccttgtctgtccctgcagtgatcgggtgaggaagggagaccttccaagaaatcttggggcgcgccaaaggtgtcccctctctcaatCGATCCCGCAGCCGAACAGAGCGGATCTATTCTCGTTGCAAAATTGAGTTGtagaaatcagaccctatatccggtaaggatatagagcaggcatgcGTCTATTGATGTCTATTGAtagtgcaagggggatcactccacctaacttgcacaccgtcaggagaaattgtactatagatataggtcaaactaatacataaccaatagttgacaggaattcagatacattttcattacgtccctgaaagacacattttcatgcagtataatgagacagaagaacagagggtAGTGCTGGCGCAGTTCTCATaatttgcagttgcttgcagcttgactcacagcacctggcacagcggtctctatcacagctctgcattcctttcgcctactcccatcattgttctgtgtgagacagtgatccatagcagctgttttacttgcactgacccagggggagaaaaacatgacctcgCTGGGTCAGCCGTCCATCCACAATTTCCCTGTTCTACTATTGCCTGGCCTGTGGGTGAGTTTGGGATACCCGTACTGTGTTTTACTCCCCATGTTTGCAGAAACTCCCCAAGCCTACGACTAGTGTAGGCTGGGccattgtctgtttttattcGTAGTGATATACCCATAACTGCAAAGCAACAACTGAGATGCTTTTCTACATACAtagccttttctccaggttgagcGGTGGCCCACATAAGATGACTATATGTATCTATAGACACGTGTACATATTTCAGCTGCCCGAACTCACCCACATGCATCACATCCATCTGCCTATTTTCGTTAGCTCTAAGTCCCCTGGGGTTAACTCCTAGCCCGAGACCCATACTGCCATTATGGTGGCTGCACACTGGGCACGATCTAACAATTACCTTAGCATCCTCATATGTTATCTGATATTCCCTTCTTAGCCCCTTGGCATTCTGGTGAAACATAGAGTACGCCTCTCGGGCCAGGACATGCCGGGAGACTAAAGGTCTCTGCGCCAGTGACACCAAGCGATCAGCTCTCGCATTTCCCTCTCCCAAGTCTATCTCCCATTTATGACCTCGAACATGTATTACTGCATATGAGTGCTCCCTAATtctgattgctctctgcaactgcacgAACAGCTTGTACAGCCGCcgattctgcacttcctttatgTAGGCTTCCTCTATTTGGTGGCATACTCCAGCTACATAAAGGGAGTCGGTGACCACATTAAGGGGGCCGATTAAGTTCATCATGGCCCATACAACGGCCACCAGCTCCAATGTTTGCAATAAGTCCTTATCATCGTCTGCAATGAGGTGATGTCTCCAGGAGCCgccctgctgccaggtcactgctgctgttctagacTTCTGTCCCGCATCCGTGTAAGCCGTGATTGTGTTCTGCAAGGGCGTCTCATGCTGCTTTGGTATCCGGAGCCAACTCCATTGACCAATCCAATGTAGCGGCACGTTCGGAATCTTTTCCACTGAAACCGTACTTCCAGCTCCTAAGAGAGCATCCTGTAACTCTGGACTATGCTGCACATACCATGTCAGAGTGTCCTTCTGCATTGGCAGCTGTACACACACAGGCTCCATACCTATGATCTGCAGGGTACGTTCTCGCCCTTTcttaatcacttctgccaggagttcagttttttgaagaagtgtttttgattgctgcagtgagggacagATCCACTCTAGTACCCATACctcccccgttttctttttagattgtgCCAACGCTCCTAAAAGGTACTTTGGTCCATACCATACCATAACCTGTATGGGGAGGTCAGGGTCACGTCTCCGAACACTGCCGTGTATAATGCAGTCCATAATCTGTTGTAGTAGACGTTTGTGCTGCGTTGTCACCGTTACAGGCTGGGCCGGGTCAGTGCCCTGTAACAAAGGTCGCAACGACTCTAAGAGTTCGTTTGGGATGCCCACCACAGGGCTCAACCACTTTAAGTCCCCCAGTAACCTTTGGGCATCATGTAGAGTCTCTAGTTTAGTATCcagttgcagtttc
tgtggggTTACTATCGTGTTAGTCAGTGTCCATCCTAAGTACTTCCGGGGCGCGGAGAGTTGTACCTTTTCAGGGGCAAACATAAGTTCTTCCCTATTTAGGGTCTTTTCTATTTGCCaaatttgttcctgtgtgaaggcCTCTGGCTGGGCAAAAAGGATGTCCTCCATGTAATGATAAAtgaccatttgtttccattctcgCCGGAGTGGTTGTAGAGCATGATCGACATATAGTTGACATCGCGTGGGGCTATTTTTCATCCTTTGAGGTAATACTGTCCATTCAAAACGTTGATCAGGGTGTTCTCGATTCAATGCAGGCAATGTGAAGGCAAATCGTTTAGTGTCCTGAGGGTGCAGGGTAATAGTaaagaaacagtcctttaaGTCACTAATTAGTAATGGCCAATTGTAAGGTAGCATGGCAGGATTAGGCAGGGCGGGTTGAAGTGCCCCCAGTTGAGAGAGCACATTGTGGCCAATTAAGCATTGAACAGTGGGGGGTAGAGGTGCCACCGAGACAGAGGTATGGACTACTTGTTCATCAAGGTGGATTTGCAGGGGAGGTGACTTTTTCGCTAAGGATAGTCCACCTGTACCCGTCACTGTGGCTATGGCCGCTTGCAGTGGCCATTGAGGCGGCCAAATTTCTGGGCTCAATATGCTGTTGTCGGCCCCTGTATCTAATAGACCttgaagtttgatttcttcctctctgtgtttAAGTGTCACTGGTTTTTTAGGTCGATCATGCAAATTTAGTGATAGCAATGCTAAGTCCCCTGAGGAGCCAAACCCTTGCTCCCCTCGGGGAGACGATTGACACGGTGTTAAGGCTTTGGTCAATTGCTCTAGGGGTACTAACTGCGCTATCCGTTGccctttctcaatttttattggAGGAAACGGGGTGTATACCATAATCTGGATCTCACCCTGAAAGTCCGCATCTATTACCCCAGGGAGGACAAAAAGTCCGAGCATCGATGCTGAAGAACGCCCCAATAAAAGGGCCCCAACAGCGGTTCCATTTATCATTACTGGTCCCCTGATCCCTGTAGACACCCGCTCAGGTTTTGTGGTCATTAAGGTCGTGGTCACTGCGGCTGCCAAGTCCAAGCCGAGGCTTCCTGGTGTGGCTgattgcagggctgctgctggctggaaacGGCTACTTGTGTCTGTGCGTGGCCGTCGTTTCTTTCTCGCGCTGGGCTGGGGGTTTCCTGACCGGCGTCGACAGGCATTGGTATTGTGGTTGTCCATACGACATGTGTGACACCATGAACCGGTGGTTTGACACTGACGACGCATATGTCCCATGCCGCCACAGCGATAGCATTTGATGCGACCAGCAACAGGCGATCTCGGGCCTAAATTTGTTATCGCAGACGCTTGTAAGGATGCAAGAGCTGCTAGCACTTGATTGTGAGAGGCCTCAGCTTGCGCCTTTAAACTTGCCCCTAACTCCTTAATAGCCTCAATCAGAAATGCTTGGGGCCCGACTGGCACGCTTGATagcttttccagtgcctcttcAATAGTCCAATTACTCCTCAAAGTACTCAGAGTACTACGTGCTGTTGAATTACAATTTTGGAGCGCGCATTGTTTTAACATTACTCCTCTCATATACTCTGGCACCCCTGCTTTTTCAATAGCCCCGGCTACCTTATCTATGAATGCCCCAAAGTCCTCATCTCTACCTTGTCGGATCCCCATATAAAATGGCAATCCATCAGGCACCTTAATCTTGTCCATGGCCTGTCTAGCTAAATACATCGTTTCTCGACATTTATCTGGCCCTAATAATGCTTGGGCTTGTGTTCTGAAAAAAGGCCCTAGCCCTAAGAGTTCTTCGATAGTTACACCATGTAGTGGGTCTCCCGGCTGCCTAGCCTTTGAGACACTCTGATGGCACAGTTCTTGCCAATATGCattaaacaacagctgttgATGTTGTGAAGAGATCAATTTTGCTATTGCCCGACAATCGGATGGCAGCAATATCTGCGTACTCCAAATATAATCCAATATCTGCTTAGCTGGCTCGCTTTTTACCCCAAACTGACTAACTGTAGATCGTAGCTGCGATAATAATTTCCAATCTAAAGCTGTGATGGTGGCCTGCATCCCTCCCGCAGGATTAGAGGCATATATCACTGGAAACGCCATGTGCCGCACGGCCTCC 285 | ``` 286 | 287 | ## V) Annotate and identify homologs in novel proteins from transcriptome 288 | 289 | - See this example: https://github.com/cfarkas/annotate_my_genomes/wiki#5-annotate-and-identify-homologs-in-novel-proteins-from-transcriptome 290 | 291 | ## VI Annotation of BRAKER2 / TSEBRA gtf output 292 | 293 | - The output ```braker.gtf``` from BRAKER2 pipeline (https://github.com/Gaius-Augustus/BRAKER) or ```tsebra.gtf``` from TSEBRA pipeline (https://github.com/Gaius-Augustus/TSEBRA) can be annotated using a few tools before running the pipeline. 294 | 295 | As a requirement, the AGAT toolkit (https://github.com/NBISweden/AGAT) must be installed: 296 | ``` 297 | conda activate annotate_my_genomes 298 | conda install -c bioconda agat 299 | ``` 300 | - Suppose you recently annotated the Gallus gallus genome (galGal6) using BRAKER2 or TSEBRA. 
The ```braker.gtf / tsebra.gtf``` output can be pre-processed as follows: 301 | 302 | #### BRAKER2 run 303 | ``` 304 | agat_convert_sp_gff2gtf.pl --gff braker.gtf -o braker_fixed.gtf # clean and fix braker.gtf with AGAT 305 | stringtie --merge -G galGal6_ncbiRefSeq.gtf braker_fixed.gtf -o braker_merged.gtf # merge braker.gtf with reference genome GTF (i.e.: galGal6_ncbiRefSeq.gtf) 306 | sed 's/ gene_name.*//'g braker_merged.gtf > braker_fixed.gtf # fix additional entries 307 | grep "StringTie" braker_fixed.gtf > braker_stringtie.gtf # Exclude reference transcripts not found in the braker annotation 308 | ``` 309 | - Now, ```braker_stringtie.gtf``` can be annotated as follows (e.g. using 30 threads for processing): 310 | ``` 311 | mkdir braker_annotated 312 | add-ncbi-annotation -a braker_stringtie.gtf -n galGal6_ncbiRefSeq.gtf -r galGal6.gtf -g galGal6.fa -c gawn_config.sh -t 30 -o braker_annotated/ 313 | ``` 314 | 315 | #### TSEBRA run 316 | ``` 317 | agat_convert_sp_gff2gtf.pl --gff tsebra.gtf -o tsebra_fixed.gtf # clean and fix tsebra.gtf with AGAT 318 | stringtie --merge -G galGal6_ncbiRefSeq.gtf tsebra_fixed.gtf -o tsebra_merged.gtf # merge tsebra.gtf with reference genome GTF (i.e.: galGal6_ncbiRefSeq.gtf) 319 | sed 's/ gene_name.*//'g tsebra_merged.gtf > tsebra_fixed.gtf # fix additional entries 320 | grep "StringTie" tsebra_fixed.gtf > tsebra_stringtie.gtf # Exclude reference transcripts not found in the tsebra annotation 321 | ``` 322 | - Now, ```tsebra_stringtie.gtf``` can be annotated as follows (e.g. using 30 threads for processing): 323 | ``` 324 | mkdir tsebra_annotated 325 | add-ncbi-annotation -a tsebra_stringtie.gtf -n galGal6_ncbiRefSeq.gtf -r galGal6.gtf -g galGal6.fa -c gawn_config.sh -t 30 -o tsebra_annotated/ 326 | ``` 327 | ### More Scenarios? 328 | 329 | - For downstream analysis and examples, please visit our wiki page: https://github.com/cfarkas/annotate_my_genomes/wiki 330 | 331 | ### Notes 332 | Compiling automatically uses the shell script compiler shc to make binaries; please check: https://github.com/neurobin/shc. 
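- For reference, a minimal sketch of what that compilation step looks like (illustrative paths only; ```makefile.sh``` performs the actual build and may use different options):
```
# sketch: shc compiles a bash script into a binary, producing <script>.x and <script>.x.c
shc -f bash_scripts/annotate_my_genomes.sh                          # illustrative input script
mv bash_scripts/annotate_my_genomes.sh.x bin/annotate-my-genomes    # rename the compiled binary
```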
333 | -------------------------------------------------------------------------------- /additional_scripts/annotate_gtf.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from tqdm import tqdm 3 | 4 | # Define file names from command line arguments 5 | gtf_file = sys.argv[1] # GTF file 6 | hits_file = sys.argv[2] # Hits file 7 | annotation_table_file = sys.argv[3] # Transcriptome annotation table file 8 | output_file = sys.argv[4] # Output GTF file with annotations 9 | 10 | # Function to append blastx information and gawn_name to GTF entries 11 | def append_annotations_to_gtf(gtf_line, blastx_info, gawn_names): 12 | try: 13 | if 'transcript' in gtf_line: 14 | # Ensure there's a semicolon at the end of the original GTF line 15 | if not gtf_line.endswith(';'): 16 | gtf_line += ';' 17 | 18 | transcript_id = gtf_line.split('transcript_id "')[1].split('"')[0] 19 | annotations = [] 20 | if transcript_id in blastx_info: 21 | annotations.append(f'blastx "{blastx_info[transcript_id]}";') 22 | if transcript_id in gawn_names: 23 | annotations.append(f'gawn_name "{gawn_names[transcript_id]}";') 24 | if annotations: 25 | gtf_line += ' ' + ' '.join(annotations) 26 | return gtf_line 27 | except IndexError as e: 28 | print(f"Warning: Malformed line skipped: {gtf_line}") 29 | return None 30 | 31 | # Read the hits file and store the blastx info in a dictionary 32 | blastx_info = {} 33 | with open(hits_file, 'r') as hits: 34 | for line in hits: 35 | transcript_id, blastx_id = line.strip().split(' ') 36 | blastx_info[transcript_id] = blastx_id 37 | 38 | # Read the annotation table and store the gawn_names in a dictionary 39 | gawn_names = {} 40 | with open(annotation_table_file, 'r') as table: 41 | for line in table: 42 | parts = line.strip().split('\t') 43 | if len(parts) > 2: 44 | transcript_id = parts[0] 45 | gawn_name = parts[2] 46 | gawn_names[transcript_id] = gawn_name 47 | 48 | # Read the GTF file, modify entries with annotations, and write to the output file 49 | with open(gtf_file, 'r') as gtf, open(output_file, 'w') as out_gtf: 50 | for line in tqdm(gtf, desc="Annotating GTF"): 51 | modified_line = append_annotations_to_gtf(line.strip(), blastx_info, gawn_names) 52 | if modified_line: 53 | out_gtf.write(modified_line + '\n') 54 | 55 | print("Annotation completed. Output is in", output_file) 56 | -------------------------------------------------------------------------------- /additional_scripts/blast_parser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Takes a .xml formatted blast results file as input and prints the query and hit ids 4 | # for sequences passing the thresholds passed via the command line arguments. For sequences 5 | # with no hits below the thresholds, the program returns "no hits below threshold" rather 6 | # than the hit id. 7 | 8 | import getopt, sys 9 | from Bio import SeqIO 10 | from Bio.Blast import NCBIXML 11 | 12 | ## Function to parse an XML format BLAST results file. 13 | 14 | def parse_results(result_file, e_val_thresh, ident_thresh, align_thresh): 15 | result_handle = open(result_file, 'r') ## The XML file to parse. 16 | blast_records = NCBIXML.parse(result_handle) 17 | print('query_id\thit_id\tpercentage_identity\tquery_length\talignment_length\te_value') 18 | 19 | for record in blast_records: ## Loop through each query. 20 | query_id = record.query 21 | if len(record.alignments) > 0: ## Check whether there are hits. 
22 | e_val = record.alignments[0].hsps[0].expect 23 | if e_val < e_val_thresh: ## Is hit below E-value? 24 | tot_ident = sum([hsp.identities for hsp in record.alignments[0].hsps]) ## Sum of all identities for all hsps. 25 | query_len = record.query_length ## Length of query 26 | align_len = sum([hsp.align_length for hsp in record.alignments[0].hsps]) ## Length of query alignment to hit. 27 | pct_ident = tot_ident/float(align_len)*100 ## Calculates percentage identity. 28 | top_hit = record.alignments[0].hit_id + record.alignments[0].hit_def 29 | if pct_ident > ident_thresh: ## Checks whether above percentage identity cutoff. 30 | if align_len > align_thresh: 31 | print('%s\t%s\t%f\t%i\t%i\t%s' % (query_id, top_hit, pct_ident, query_len, align_len, str(e_val))) 32 | else: 33 | print('%s\t%s\t%s\t%s\t%s\t%s' % (query_id, '', '', '', '', '')) 34 | else: 35 | print('%s\t%s\t%s\t%s\t%s\t%s' % (query_id, '', '', '', '', '')) 36 | else: 37 | print('%s\t%s\t%s\t%s\t%s\t%s' % (query_id, '', '', '', '', '')) 38 | else: 39 | print('%s\t%s\t%s\t%s\t%s\t%s' % (query_id, '', '', '', '', '')) 40 | 41 | result_handle.close() 42 | 43 | ## How to use this. 44 | 45 | def usage(): 46 | print(""" 47 | \nblast_parser.py.\n 48 | Takes a .xml formatted blast results file as input and prints the query and hit ids 49 | for sequences passing the thresholds passed via the command line arguments. For sequences 50 | with no hits below the thresholds, the program returns "no hits below threshold" rather 51 | than the hit id.\n 52 | Basic usage: 53 | \tpython blast_parser.py -i <blast_results.xml> -e 1e-20 -p 97 -a 100 > parsed_results.txt\n 54 | Arguments: 55 | \t-h, --help\t\t\tPrint this information. 56 | \t-i, --in \t\tXML format BLAST results file. 57 | \t-e, --evalue \t\tExpect value. 58 | \t-p, --pct_ident \t\tPercentage identity cutoff. 59 | \t-a, --align_len \t\t Minimum alignment length. 60 | """) 61 | 62 | ## The main program. 63 | 64 | def main(): 65 | try: ## Parses the command line arguments. 66 | opts, args = getopt.getopt(sys.argv[1:], 'e:i:p:a:h', ['evalue=', 'in=', 'pct_ident=', 'align_len=', 'help']) 67 | except getopt.GetoptError: 68 | usage() 69 | sys.exit(2) 70 | 71 | ## Creates variables from the arguments. 72 | 73 | for opt, arg in opts: 74 | if opt in ('-e', '--evalue'): 75 | e_val_thresh = float(arg) 76 | elif opt in ('-p', '--pct_ident'): 77 | ident_thresh = float(arg) 78 | elif opt in ('-a', '--align_len'): 79 | align_thresh = float(arg) 80 | elif opt in ('-i', '--in'): 81 | result_file = arg 82 | elif opt in ('-h', '--help'): 83 | usage() 84 | sys.exit(0) 85 | else: 86 | usage() 87 | sys.exit(2) 88 | 89 | try: ## Tries to parse the results file. 90 | parse_results(result_file, e_val_thresh, ident_thresh, align_thresh) 91 | except: ## Otherwise, shows usage. 92 | sys.exit(1) 93 | 94 | if __name__ == "__main__": 95 | main() 96 | -------------------------------------------------------------------------------- /additional_scripts/dexseq_count.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import sys, itertools, optparse, warnings 3 | 4 | optParser = optparse.OptionParser( 5 | 6 | usage = "python %prog [options] <flattened_gff_file> <alignment_file> <output_file>", 7 | 8 | description= 9 | "This script counts how many reads in <alignment_file> fall onto each exonic " + 10 | "part given in <flattened_gff_file> and outputs a list of counts in " + 11 | "<output_file>, for further analysis with the DEXSeq Bioconductor package. " + 12 | "Notes: Use dexseq_prepare_annotation.py to produce <flattened_gff_file>. 
" + 13 | " may be '-' to indicate standard input.", 14 | 15 | epilog = 16 | "Written by Simon Anders (sanders@fs.tum.de) and Alejandro Reyes (reyes@embl.de), " + 17 | "European Molecular Biology Laboratory (EMBL). (c) 2010-2013. Released under the " + 18 | " terms of the GNU General Public License v3. Part of the 'DEXSeq' package." ) 19 | 20 | optParser.add_option( "-p", "--paired", type="choice", dest="paired", 21 | choices = ( "no", "yes" ), default = "no", 22 | help = "'yes' or 'no'. Indicates whether the data is paired-end (default: no)" ) 23 | 24 | optParser.add_option( "-s", "--stranded", type="choice", dest="stranded", 25 | choices = ( "yes", "no", "reverse" ), default = "yes", 26 | help = "'yes', 'no', or 'reverse'. Indicates whether the data is " + 27 | "from a strand-specific assay (default: yes ). " + 28 | "Be sure to switch to 'no' if you use a non strand-specific RNA-Seq library " + 29 | "preparation protocol. 'reverse' inverts strands and is needed for certain " + 30 | "protocols, e.g. paired-end with circularization." ) 31 | 32 | optParser.add_option( "-a", "--minaqual", type="int", dest="minaqual", 33 | default = 10, 34 | help = "skip all reads with alignment quality lower than the given " + 35 | "minimum value (default: 10)" ) 36 | 37 | optParser.add_option( "-f", "--format", type="choice", dest="alignment", 38 | choices=("sam", "bam"), default="sam", 39 | help = "'sam' or 'bam'. Format of (default: sam)" ) 40 | 41 | optParser.add_option( "-r", "--order", type="choice", dest="order", 42 | choices=("pos", "name"), default="name", 43 | help = "'pos' or 'name'. Sorting order of (default: name). Paired-end sequencing " + 44 | "data must be sorted either by position or by read name, and the sorting order " + 45 | "must be specified. Ignored for single-end data." ) 46 | 47 | 48 | if len( sys.argv ) == 1: 49 | optParser.print_help() 50 | sys.exit(1) 51 | 52 | (opts, args) = optParser.parse_args() 53 | 54 | if len( args ) != 3: 55 | sys.stderr.write( sys.argv[0] + ": Error: Please provide three arguments.\n" ) 56 | sys.stderr.write( " Call with '-h' to get usage information.\n" ) 57 | sys.exit( 1 ) 58 | 59 | try: 60 | import HTSeq 61 | except ImportError: 62 | sys.stderr.write( "Could not import HTSeq. Please install the HTSeq Python framework\n" ) 63 | sys.stderr.write( "available from http://www-huber.embl.de/users/anders/HTSeq\n" ) 64 | sys.exit(1) 65 | 66 | gff_file = args[0] 67 | sam_file = args[1] 68 | out_file = args[2] 69 | stranded = opts.stranded == "yes" or opts.stranded == "reverse" 70 | reverse = opts.stranded == "reverse" 71 | is_PE = opts.paired == "yes" 72 | alignment = opts.alignment 73 | minaqual = opts.minaqual 74 | order = opts.order 75 | 76 | if alignment == "bam": 77 | try: 78 | import pysam 79 | except ImportError: 80 | sys.stderr.write( "Could not import pysam, which is needed to process BAM file (though\n" ) 81 | sys.stderr.write( "not to process text SAM files). 
Please install the 'pysam' library from\n" ) 82 | sys.stderr.write( "https://code.google.com/p/pysam/\n" ) 83 | sys.exit(1) 84 | 85 | 86 | 87 | if sam_file == "-": 88 | sam_file = sys.stdin 89 | 90 | 91 | # Step 1: Read in the GFF file as generated by aggregate_genes.py 92 | # and put everything into a GenomicArrayOfSets 93 | 94 | features = HTSeq.GenomicArrayOfSets( "auto", stranded=stranded ) 95 | for f in HTSeq.GFF_Reader( gff_file ): 96 | if f.type == "exonic_part": 97 | f.name = f.attr['gene_id'] + ":" + f.attr['exonic_part_number'] 98 | features[f.iv] += f.name 99 | 100 | # initialise counters 101 | num_reads = 0 102 | counts = {} 103 | counts[ '_empty' ] = 0 104 | counts[ '_ambiguous' ] = 0 105 | counts[ '_lowaqual' ] = 0 106 | counts[ '_notaligned' ] = 0 107 | counts['_ambiguous_readpair_position'] = 0 108 | 109 | # put a zero for each feature ID 110 | for iv, s in features.steps(): 111 | for f in s: 112 | counts[ f ] = 0 113 | 114 | #We need this little helper below: 115 | def reverse_strand( s ): 116 | if s == "+": 117 | return "-" 118 | elif s == "-": 119 | return "+" 120 | else: 121 | raise SystemError("illegal strand") 122 | 123 | def update_count_vector( counts, rs ): 124 | if( type(rs) == str): 125 | counts[ rs ] += 1 126 | else: 127 | for f in rs: 128 | counts[f] += 1 129 | return counts 130 | 131 | 132 | def map_read_pair(af, ar): 133 | rs = set() 134 | if af and ar and not af.aligned and not ar.aligned: 135 | return '_notaligned' 136 | if af and ar and not af.aQual < minaqual and ar.aQual < minaqual: 137 | return '_lowaqual' 138 | if af and af.aligned and af.aQual >= minaqual and af.iv.chrom in list(features.chrom_vectors.keys()): 139 | for cigop in af.cigar: 140 | if cigop.type != "M": 141 | continue 142 | if reverse: 143 | cigop.ref_iv.strand = reverse_strand( cigop.ref_iv.strand ) 144 | for iv, s in features[cigop.ref_iv].steps(): 145 | rs = rs.union( s ) 146 | if ar and ar.aligned and ar.aQual >= minaqual and ar.iv.chrom in list(features.chrom_vectors.keys()): 147 | for cigop in ar.cigar: 148 | if cigop.type != "M": 149 | continue 150 | if not reverse: 151 | cigop.ref_iv.strand = reverse_strand( cigop.ref_iv.strand ) 152 | for iv, s in features[cigop.ref_iv].steps(): 153 | rs = rs.union( s ) 154 | set_of_gene_names = set( [ f.split(":")[0] for f in rs ] ) 155 | if len( set_of_gene_names ) == 0: 156 | return '_empty' 157 | elif len( set_of_gene_names ) > 1: 158 | return '_ambiguous' 159 | else: 160 | return rs 161 | 162 | 163 | def clean_read_queue( queue, current_position ): 164 | clean_queue = dict( queue ) 165 | for i in queue: 166 | if queue[i].mate_start.pos < current_position: 167 | warnings.warn( "Read "+ i + " claims to have an aligned mate that could not be found in the same chromosome." 
) 168 | del clean_queue[i] 169 | return clean_queue 170 | 171 | 172 | if alignment == "sam": 173 | reader = HTSeq.SAM_Reader 174 | else: 175 | reader = HTSeq.BAM_Reader 176 | 177 | 178 | # Now go through the aligned reads 179 | num_reads = 0 180 | 181 | if not is_PE: 182 | for a in reader( sam_file ): 183 | if not a.aligned: 184 | counts[ '_notaligned' ] += 1 185 | continue 186 | if "NH" in a.optional_fields and a.optional_field("NH") > 1: 187 | continue 188 | if a.aQual < minaqual: 189 | counts[ '_lowaqual' ] += 1 190 | continue 191 | rs = set() 192 | for cigop in a.cigar: 193 | if cigop.type != "M": 194 | continue 195 | if reverse: 196 | cigop.ref_iv.strand = reverse_strand( cigop.ref_iv.strand ) 197 | for iv, s in features[cigop.ref_iv].steps( ): 198 | rs = rs.union( s ) 199 | set_of_gene_names = set( [ f.split(":")[0] for f in rs ] ) 200 | if len( set_of_gene_names ) == 0: 201 | counts[ '_empty' ] += 1 202 | elif len( set_of_gene_names ) > 1: 203 | counts[ '_ambiguous' ] +=1 204 | else: 205 | for f in rs: 206 | counts[ f ] += 1 207 | num_reads += 1 208 | if num_reads % 100000 == 0: 209 | sys.stderr.write( "%d reads processed.\n" % num_reads ) 210 | 211 | else: # paired-end 212 | alignments = dict() 213 | if order == "name": 214 | for af, ar in HTSeq.pair_SAM_alignments( reader( sam_file ) ): 215 | if af == None or ar == None: 216 | continue 217 | if not ar.aligned: 218 | continue 219 | if not af.aligned: 220 | continue 221 | elif ar.optional_field("NH") > 1 or af.optional_field("NH") > 1: 222 | continue 223 | elif af.iv.chrom != ar.iv.chrom: 224 | counts['_ambiguous_readpair_position'] += 1 225 | continue 226 | else: 227 | rs = map_read_pair( af, ar ) 228 | counts = update_count_vector( counts, rs ) 229 | num_reads += 1 230 | if num_reads % 100000 == 0: 231 | sys.stderr.write( "%d reads processed.\n" % num_reads ) 232 | 233 | else: 234 | processed_chromosomes = dict() 235 | num_reads = 0 236 | current_chromosome='' 237 | current_position='' 238 | for a in reader( sam_file ): 239 | if not a.aligned: 240 | continue 241 | if a.optional_field("NH") > 1: 242 | continue 243 | if current_chromosome != a.iv.chrom: 244 | if current_chromosome in processed_chromosomes: 245 | raise SystemError("A chromosome that had finished to be processed before was found again in the alignment file, is your alignment file properly sorted by position?") 246 | processed_chromosomes[current_chromosome] = 1 247 | alignments = clean_read_queue( alignments, current_position ) 248 | del alignments 249 | alignments = dict() 250 | if current_chromosome == a.iv.chrom and a.iv.start < current_position: 251 | raise SystemError("Current read position is smaller than previous reads, is your alignment file properly sorted by position?") 252 | current_chromosome = a.iv.chrom 253 | current_position = a.iv.start 254 | if a.read.name and a.mate_aligned: 255 | if a.read.name in alignments: 256 | b = alignments[ a.read.name ] 257 | if a.pe_which == "first" and b.pe_which == "second": 258 | af=a 259 | ar=b 260 | else: 261 | af=b 262 | ar=a 263 | rs = map_read_pair(af, ar) 264 | del alignments[ a.read.name ] 265 | counts = update_count_vector(counts, rs) 266 | else: 267 | if a.mate_start.chrom != a.iv.chrom: 268 | counts['_ambiguous_readpair_position'] += 1 269 | continue 270 | else: 271 | alignments[ a.read.name ] = a 272 | else: 273 | continue 274 | num_reads += 1 275 | if num_reads % 200000 == 0: 276 | alignments = clean_read_queue( alignments, current_position ) 277 | sys.stderr.write( "%d reads processed.\n" % (num_reads / 2) ) 
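# Note added for clarity (not part of the original DEXSeq script): in either branch
# above, the three positional arguments parsed earlier are the flattened GFF from the
# preparation script, the SAM/BAM alignment file and the output counts file. A typical
# paired-end, position-sorted run might look like this (file names are illustrative only):
#
#   python dexseq_count.py -p yes -s no -f bam -r pos \
#       flattened.gff sample1.sorted.bam sample1_exon_counts.txt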
278 | 279 | 280 | 281 | # Step 3: Write out the results 282 | 283 | fout = open( out_file, "w" ) 284 | for fn in sorted( counts.keys() ): 285 | fout.write( "%s\t%d\n" % ( fn, counts[fn] ) ) 286 | fout.close() 287 | -------------------------------------------------------------------------------- /additional_scripts/dexseq_prepare_annotation_fixed.py: -------------------------------------------------------------------------------- 1 | import sys, collections, itertools, os.path, optparse 2 | 3 | optParser = optparse.OptionParser( 4 | 5 | usage = "python %prog [options] ", 6 | 7 | description= 8 | "Script to prepare annotation for DEXSeq." + 9 | "This script takes an annotation file in Ensembl GTF format" + 10 | "and outputs a 'flattened' annotation file suitable for use " + 11 | "with the count_in_exons.py script ", 12 | 13 | epilog = 14 | "Written by Simon Anders (sanders@fs.tum.de), European Molecular Biology " + 15 | "Laboratory (EMBL). (c) 2010. Released under the terms of the GNU General " + 16 | "Public License v3. Part of the 'DEXSeq' package." ) 17 | 18 | optParser.add_option( "-r", "--aggregate", type="choice", dest="aggregate", 19 | choices = ( "no", "yes" ), default = "yes", 20 | help = "'yes' or 'no'. Indicates whether two or more genes sharing an exon should be merged into an 'aggregate gene'. If 'no', the exons that can not be assiged to a single gene are ignored." ) 21 | 22 | (opts, args) = optParser.parse_args() 23 | 24 | if len( args ) != 2: 25 | sys.stderr.write( "Script to prepare annotation for DEXSeq.\n\n" ) 26 | sys.stderr.write( "Usage: python %s \n\n" % os.path.basename(sys.argv[0]) ) 27 | sys.stderr.write( "This script takes an annotation file in Ensembl GTF format\n" ) 28 | sys.stderr.write( "and outputs a 'flattened' annotation file suitable for use\n" ) 29 | sys.stderr.write( "with the count_in_exons.py script.\n" ) 30 | sys.exit(1) 31 | 32 | try: 33 | import HTSeq 34 | except ImportError: 35 | sys.stderr.write( "Could not import HTSeq. Please install the HTSeq Python framework\n" ) 36 | sys.stderr.write( "available from http://www-huber.embl.de/users/anders/HTSeq\n" ) 37 | sys.exit(1) 38 | 39 | 40 | 41 | 42 | gtf_file = args[0] 43 | out_file = args[1] 44 | 45 | aggregateGenes = opts.aggregate == "yes" 46 | 47 | # Step 1: Store all exons with their gene and transcript ID 48 | # in a GenomicArrayOfSets 49 | 50 | exons = HTSeq.GenomicArrayOfSets( "auto", stranded=True ) 51 | for f in HTSeq.GFF_Reader( gtf_file ): 52 | if f.type != "exon": 53 | continue 54 | f.attr['gene_id'] = f.iv.chrom + '_' + f.attr['gene_id'].replace( ":", "_" ) + f.iv.strand 55 | exons[f.iv] += ( f.attr['gene_id'], f.attr['transcript_id'] ) 56 | 57 | 58 | # Step 2: Form sets of overlapping genes 59 | 60 | # We produce the dict 'gene_sets', whose values are sets of gene IDs. Each set 61 | # contains IDs of genes that overlap, i.e., share bases (on the same strand). 62 | # The keys of 'gene_sets' are the IDs of all genes, and each key refers to 63 | # the set that contains the gene. 64 | # Each gene set forms an 'aggregate gene'. 65 | 66 | if aggregateGenes == True: 67 | gene_sets = collections.defaultdict( lambda: set() ) 68 | for iv, s in exons.steps(): 69 | # For each step, make a set, 'full_set' of all the gene IDs occuring 70 | # in the present step, and also add all those gene IDs, whch have been 71 | # seen earlier to co-occur with each of the currently present gene IDs. 
72 | full_set = set() 73 | for gene_id, transcript_id in s: 74 | full_set.add( gene_id ) 75 | full_set |= gene_sets[ gene_id ] 76 | # Make sure that all genes that are now in full_set get associated 77 | # with full_set, i.e., get to know about their new partners 78 | for gene_id in full_set: 79 | assert gene_sets[ gene_id ] <= full_set 80 | gene_sets[ gene_id ] = full_set 81 | 82 | 83 | # Step 3: Go through the steps again to get the exonic sections. Each step 84 | # becomes an 'exonic part'. The exonic part is associated with an 85 | # aggregate gene, i.e., a gene set as determined in the previous step, 86 | # and a transcript set, containing all transcripts that occur in the step. 87 | # The results are stored in the dict 'aggregates', which contains, for each 88 | # aggregate ID, a list of all its exonic_part features. 89 | 90 | aggregates = collections.defaultdict( lambda: list() ) 91 | for iv, s in exons.steps( ): 92 | # Skip empty steps 93 | if len(s) == 0: 94 | continue 95 | gene_id = list(s)[0][0] 96 | ## if aggregateGenes=FALSE, ignore the exons associated to more than one gene ID 97 | if aggregateGenes == False: 98 | check_set = set() 99 | for geneID, transcript_id in s: 100 | check_set.add( geneID ) 101 | if( len( check_set ) > 1 ): 102 | continue 103 | else: 104 | aggregate_id = gene_id 105 | # Take one of the gene IDs, find the others via gene sets, and 106 | # form the aggregate ID from all of them 107 | else: 108 | assert set( gene_id for gene_id, transcript_id in s ) <= gene_sets[ gene_id ] 109 | aggregate_id = '+'.join( gene_sets[ gene_id ] ) 110 | # Make the feature and store it in 'aggregates' 111 | f = HTSeq.GenomicFeature( aggregate_id, "exonic_part", iv ) 112 | f.source = os.path.basename( sys.argv[0] ) 113 | # f.source = "camara" 114 | f.attr = {} 115 | f.attr[ 'gene_id' ] = aggregate_id 116 | transcript_set = set( ( transcript_id for gene_id, transcript_id in s ) ) 117 | f.attr[ 'transcripts' ] = '+'.join( transcript_set ) 118 | aggregates[ aggregate_id ].append( f ) 119 | 120 | 121 | # Step 4: For each aggregate, number the exonic parts 122 | 123 | aggregate_features = [] 124 | for l in list(aggregates.values()): 125 | for i in range( len(l)-1 ): 126 | assert l[i].name == l[i+1].name, str(l[i+1]) + " has wrong name" 127 | assert l[i].iv.end <= l[i+1].iv.start, str(l[i+1]) + " starts too early" 128 | if l[i].iv.chrom != l[i+1].iv.chrom: 129 | raise ValueError("Same name found on two chromosomes: %s, %s" % ( str(l[i]), str(l[i+1]) )) 130 | if l[i].iv.strand != l[i+1].iv.strand: 131 | raise ValueError("Same name found on two strands: %s, %s" % ( str(l[i]), str(l[i+1]) )) 132 | aggr_feat = HTSeq.GenomicFeature( l[0].name, "aggregate_gene", 133 | HTSeq.GenomicInterval( l[0].iv.chrom, l[0].iv.start, 134 | l[-1].iv.end, l[0].iv.strand ) ) 135 | aggr_feat.source = os.path.basename( sys.argv[0] ) 136 | aggr_feat.attr = { 'gene_id': aggr_feat.name } 137 | for i in range( len(l) ): 138 | l[i].attr['exonic_part_number'] = "%03d" % ( i+1 ) 139 | aggregate_features.append( aggr_feat ) 140 | 141 | 142 | # Step 5: Sort the aggregates, then write everything out 143 | 144 | aggregate_features.sort( key = lambda f: ( f.iv.chrom, f.iv.start ) ) 145 | 146 | fout = open( out_file, "w" ) 147 | for aggr_feat in aggregate_features: 148 | fout.write( aggr_feat.get_gff_line() ) 149 | for f in aggregates[ aggr_feat.name ]: 150 | fout.write( f.get_gff_line() ) 151 | 152 | fout.close() 153 | -------------------------------------------------------------------------------- 
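The two DEXSeq helper scripts above are meant to be run in sequence: the flattened annotation written by dexseq_prepare_annotation_fixed.py is the first positional argument expected by dexseq_count.py. A minimal sketch of that workflow, assuming an Ensembl-style GTF and a coordinate-sorted BAM (file names are illustrative, not shipped with this repository):

    python additional_scripts/dexseq_prepare_annotation_fixed.py -r yes genes.gtf flattened.gff
    python additional_scripts/dexseq_count.py -p yes -s no -f bam -r pos flattened.gff sample1.sorted.bam sample1_exon_counts.txt

Both scripts require the HTSeq package, and BAM input additionally requires pysam, as the import guards above check.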
/additional_scripts/download_proteome_uniprot.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use warnings; 4 | use strict; 5 | use LWP::UserAgent; 6 | use HTTP::Date; 7 | 8 | # Check that a taxonomy identifier was passed as a command line argument 9 | if (!$ARGV[0]) { 10 | die "Error: No taxonomy identifier specified.\nUsage: perl download_proteome_uniprot.pl \n"; 11 | } 12 | 13 | # Taxonomy identifier of top node for query, e.g. 2 for Bacteria, 2157 for Archaea, etc. 14 | # (see https://www.uniprot.org/taxonomy) 15 | my $top_node = $ARGV[0]; 16 | 17 | # Create a user agent for making HTTP requests 18 | my $agent = LWP::UserAgent->new; 19 | 20 | # Get a list of all reference proteomes of organisms below the given taxonomy node. 21 | my $query_list = "https://rest.uniprot.org/proteomes/stream?query=reference:true+taxonomy_id:$top_node&format=list"; 22 | 23 | my $response_list = $agent->get($query_list); 24 | 25 | # Check for HTTP errors 26 | if (!$response_list->is_success) { 27 | die 'Failed to get proteome list: ' . $response_list->status_line . 28 | ' for ' . $response_list->request->uri . "\n"; 29 | } 30 | 31 | # For each proteome, mirror its set of UniProt entries in compressed FASTA format. 32 | for my $proteome (split(/\n/, $response_list->content)) { 33 | my $file = $proteome . '.fasta.gz'; 34 | my $query_proteome = "https://rest.uniprot.org/uniprotkb/stream?query=proteome:$proteome&format=fasta&compressed=true"; 35 | my $response_proteome = $agent->mirror($query_proteome, $file); 36 | 37 | # Check for HTTP errors 38 | if ($response_proteome->is_success) { 39 | my $release = $response_proteome->header('x-uniprot-release'); 40 | my $date = $response_proteome->header('x-uniprot-release-date'); 41 | print "File $file: downloaded entries of UniProt release $release ($date)\n"; 42 | } 43 | elsif ($response_proteome->code == HTTP::Status::RC_NOT_MODIFIED) { 44 | print "File $file: up-to-date\n"; 45 | } 46 | else { 47 | die 'Failed to download proteome: ' . $response_proteome->status_line . 48 | ' for ' . $response_proteome->request->uri . 
"\n"; 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /additional_scripts/homolog_parser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | ### Libraries 5 | 6 | # conda install -c anaconda pandas 7 | # conda install --channel conda-forge --channel bioconda pybedtools 8 | # pip install xlrd==1.2.0 9 | 10 | import sys 11 | import csv 12 | import argparse 13 | import pandas as pd 14 | import numpy as np 15 | import pybedtools 16 | 17 | np.random.seed(5) 18 | 19 | parser = argparse.ArgumentParser(description="This script will annotate and add genomic coordinates to Novel proteins not included in reference GTF annotations.") 20 | parser.add_argument('--Ref', help="Ref_Transcript_Annotation.csv file, obtained with isoform-identification pipeline", type=str) 21 | parser.add_argument('--blastp', help="parsed_results.tab file, blastp result from Novel proteins against UniProt database", type=str) 22 | parser.add_argument('--bed', help="final_annotated.format.bed, which correspond to final_annotation.gtf file as BED format", type=str) 23 | parser.add_argument('--eggnog', help="out.emapper.annotations.xlsx file, which correspond to eggNOG-mapper annotations", type=str) 24 | args = parser.parse_args() 25 | 26 | name = sys.argv[0] 27 | REFERENCE = str(sys.argv[2]) 28 | BLASTP = str(sys.argv[4]) 29 | BED = str(sys.argv[6]) 30 | EGGNOG = str(sys.argv[8]) 31 | 32 | class bcolors: 33 | HEADER = '\033[95m' 34 | OKBLUE = '\033[94m' 35 | OKCYAN = '\033[96m' 36 | OKGREEN = '\033[92m' 37 | OKRED = '\033[91m' 38 | FAIL = '\033[91m' 39 | ENDC = '\033[0m' 40 | BOLD = '\033[1m' 41 | UNDERLINE = '\033[4m' 42 | 43 | 44 | print(bcolors.OKGREEN + "1 ::: Reading Ref_Transcript_Annotation.csv ::: " + bcolors.ENDC) 45 | print("") 46 | 47 | 48 | df1 = pd.read_csv(REFERENCE, sep = ',') 49 | df1 = df1.rename(columns={'qry_id': 'transcript'}) 50 | print("Number of transcripts matching reference:", df1.shape[0]) 51 | print("") 52 | 53 | 54 | df1.head(5) 55 | 56 | print(bcolors.OKGREEN + "2 ::: Reading parsed_results.tab ::: " + bcolors.ENDC) 57 | print("") 58 | 59 | 60 | df2 = pd.read_csv(BLASTP, sep = '\t') 61 | print("Number of novel proteins sharing >=90% identity with uniprot:", df2.shape[0]) 62 | print("") 63 | 64 | 65 | df2.head(5) 66 | 67 | 68 | print(bcolors.OKGREEN + " ::: formatting columns of blastp_results :::" + bcolors.ENDC) 69 | print("") 70 | 71 | df2[['hit_id','Transcript Description']] = df2['hit_id'].str.split(" ", 1 ,expand=True) 72 | df2 73 | 74 | 75 | df2[['Transcript Description', 'organism']] = df2['Transcript Description'].str.split('OS=',expand=True) 76 | df2 77 | 78 | print(df2.head(5)) 79 | 80 | blastp_results = df2.iloc[:, 0:7] 81 | blastp_results.columns = ['transcript', 'hit_id', 'percentage_identity', 'query_length', 'alignment_length', 'e_value', 'Transcript Description'] 82 | 83 | 84 | print(blastp_results.head(5)) 85 | 86 | 87 | print(bcolors.OKGREEN + "3 ::: Reading final_annotated.format.bed ::: " + bcolors.ENDC) 88 | print("") 89 | 90 | 91 | df3 = pd.read_csv(BED, sep = '\t') 92 | print("Number of trascripts in bed format:", df3.shape[0]) 93 | print("") 94 | df3.columns = ['chr', 'start', 'end', 'transcript', 'gene'] 95 | 96 | 97 | print(df3.head(5)) 98 | 99 | 100 | print(bcolors.OKGREEN + "4 ::: Adding genomic coordinates to blastp_results ::: " + bcolors.ENDC) 101 | print("") 102 | 103 | 104 | result1 = pd.merge(blastp_results, df3, on='transcript', 
how='inner') 105 | result1 106 | 107 | 108 | print(bcolors.OKGREEN + "5 ::: Adding genomic coordinates to Novel proteins and Ref_Transcript_Annotation ::: " + bcolors.ENDC) 109 | print("") 110 | 111 | 112 | result2 = pd.merge(df1, df3, on='transcript', how='inner') 113 | result2 114 | 115 | 116 | Novel_protein_hits = result1.loc[:, ['chr', 'start', 'end', 'transcript', 'Transcript Description', 'percentage_identity']] 117 | print("Novel protein hits:") 118 | print(Novel_protein_hits.head(5)) 119 | print("") 120 | 121 | Reference_annotation = result2.loc[:, ['chr', 'start', 'end', 'transcript', 'Transcript Description', 'NCBI RefSeq Gene ID']] 122 | print("Reference annotations:") 123 | print(Reference_annotation.head(5)) 124 | print("") 125 | 126 | print(bcolors.OKGREEN + " 6 ::: Creating bed files with annotations ::: " + bcolors.ENDC) 127 | print("") 128 | a = pybedtools.BedTool.from_dataframe(Novel_protein_hits) 129 | b = pybedtools.BedTool.from_dataframe(Reference_annotation) 130 | a.saveas('Novel_protein_with_coordinates.bed') 131 | b.saveas('Reference_annotation_with_coordinates.bed') 132 | 133 | print(bcolors.OKGREEN + " 7 ::: Parsing eggNOG-mapper annotations and intersect with blastp results::: " + bcolors.ENDC) 134 | print("") 135 | 136 | df1 = pd.read_excel(EGGNOG) 137 | 138 | df1.head(5) 139 | 140 | df1 = df1.rename(columns={'query': 'transcript'}) 141 | 142 | intersect1 = pd.merge(df1, blastp_results, on='transcript', how='inner') 143 | Novel_protein_hits_coords = result1.loc[:, ['chr', 'start', 'end', 'transcript']] 144 | intersect2 = pd.merge(Novel_protein_hits_coords, intersect1, on='transcript', how='inner') 145 | intersect2 = intersect2.rename(columns={'hit_id': 'blastp_hit_id', 'percentage_identity': 'blastp_percentage_identity', 'e_value': 'blastp_e_value', 'Transcript Description': 'blastp_transcript_description'}) 146 | intersect2.to_csv('eggNOG-mapper-blastp-intersections.csv') 147 | 148 | print(intersect2.head(5)) 149 | print("") 150 | 151 | print("filtering table with intersections with assigned Gene IDs: ") 152 | intersect3 = intersect2.replace('-', np.nan) 153 | intersect3 = intersect3.dropna(subset=['Preferred_name']) 154 | c = pybedtools.BedTool.from_dataframe(intersect3) 155 | c.saveas('eggNOG-mapper-blastp-with_coordinates.bed') 156 | 157 | print("") 158 | print(bcolors.OKGREEN + "Novel_protein_with_coordinates.bed ==> corresponds to a BED file containing novel proteins with mapped coordinates" + bcolors.ENDC) 159 | print("") 160 | print(bcolors.OKGREEN + "Reference_annotation_with_coordinates.bed ==> corresponds to a BED file containing annotated proteins with mapped coordinates" + bcolors.ENDC) 161 | print("") 162 | print(bcolors.OKGREEN + "eggNOG-mapper-blastp-intersections.csv ==> corresponds to a csv file containing eggNOG-mapper + blastp intersections" + bcolors.ENDC) 163 | print("") 164 | print(bcolors.OKGREEN + "eggNOG-mapper-blastp-with_coordinates.bed ==> corresponds to a BED file containing eggNOG-mapper + blastp intersections" + bcolors.ENDC) 165 | print("") 166 | print("All Done") 167 | print("") 168 | -------------------------------------------------------------------------------- /additional_scripts/transcriptome_metrics.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | usage="$(basename "$0") [-h] [-f ] [-g ] 6 | This script will obtain metrics from the annotated StringTie transcripts (final_annotated.gtf) and output them into -transcriptome_metrics- sudirectory. 
7 | Arguments: 8 | -h show this help text 9 | -f Name of the StringTie annotated GTF from the pipeline 10 | -g Reference genome (in fasta format)" 11 | options=':hf:g:' 12 | while getopts $options option; do 13 | case "$option" in 14 | h) echo "$usage"; exit;; 15 | f) f=$OPTARG;; 16 | g) g=$OPTARG;; 17 | :) printf "missing argument for -%s\n" "$OPTARG" >&2; echo "$usage" >&2; exit 1;; 18 | \?) printf "illegal option: -%s\n" "$OPTARG" >&2; echo "$usage" >&2; exit 1;; 19 | esac 20 | done 21 | 22 | # mandatory arguments 23 | if [ ! "$f" ] || [ ! "$g" ]; then 24 | echo "arguments -f and -g must be provided" 25 | echo "$usage" >&2; exit 1 26 | fi 27 | 28 | begin=`date +%s` 29 | # .---------- constant part! 30 | # vvvv vvvv-- the code from above 31 | YELLOW='\033[1;33m' 32 | PURPLE='\033[0;35m' 33 | CYAN='\033[0;36m' 34 | NC='\033[0m' # No Color 35 | echo "Cleaning directory..." 36 | rm -r -f transcriptome_metrics 37 | echo "" 38 | echo "done" 39 | echo "" 40 | echo "===> Working on transcriptome metrics" 41 | echo "" 42 | perl -lne 'print "@m" if @m=(/((?:transcript_id|gene_id)\s+\S+)/g);' ${f} > final_annotated.tab 43 | sed -i 's/transcript_id //g' final_annotated.tab 44 | sed -i 's/;/\t/g' final_annotated.tab 45 | sed -i 's/gene_id//g' final_annotated.tab 46 | sed -i 's/"//g' final_annotated.tab 47 | awk '!a[$0]++' final_annotated.tab > genes_and_transcripts.tab && rm final_annotated.tab 48 | awk '{print $1"\t"$2}' genes_and_transcripts.tab > genes-and-transcripts.tab && rm genes_and_transcripts.tab 49 | awk '{print $1}' genes-and-transcripts.tab > genes.tab 50 | # Novel genes list 51 | grep "STRG." genes.tab > novel-genes.tab 52 | # Known genes list 53 | grep -v "STRG." genes.tab > known-genes.tab 54 | echo "::: Parsing final_annotated.gtf file to obtain novel/known and coding/lncRNA transcripts, respectively." 
55 | echo "" 56 | grep -w -F -f novel-genes.tab final_annotated.gtf > novel-genes.gtf 57 | grep -w -F -f known-genes.tab final_annotated.gtf > known-genes.gtf 58 | grep "coding" known-genes.gtf > known-genes-coding.gtf 59 | grep "lncRNA" known-genes.gtf > known-genes-lncRNA.gtf 60 | grep "StringTie" known-genes.gtf > known-genes-other.gtf # other = no lncRNA and no protein-coding 61 | grep "coding" novel-genes.gtf > novel-genes-coding.gtf 62 | grep "lncRNA" novel-genes.gtf > novel-genes-lncRNA.gtf 63 | grep "StringTie" novel-genes.gtf > novel-genes-other.gtf # other = no lncRNA and no protein-coding 64 | echo "::: We will use gffread to obtain reconciled and novel transcripts in the parsed GTF file" 65 | echo "" 66 | gffread -w known-transcripts-coding.fa -g ${g} known-genes-coding.gtf 67 | gffread -w known-transcripts-lncRNA.fa -g ${g} known-genes-lncRNA.gtf 68 | gffread -w known-transcripts-other.fa -g ${g} known-genes-other.gtf 69 | gffread -w novel-transcripts-coding.fa -g ${g} novel-genes-coding.gtf 70 | gffread -w novel-transcripts-lncRNA.fa -g ${g} novel-genes-lncRNA.gtf 71 | gffread -w novel-transcripts-other.fa -g ${g} novel-genes-other.gtf 72 | exec 3<> transcriptome_metrics.txt 73 | echo "Number of reconciled coding transcripts:" >> transcriptome_metrics.txt 74 | grep ">" known-transcripts-coding.fa -c >> transcriptome_metrics.txt 75 | echo "" >> transcriptome_metrics.txt 76 | echo "Number of reconciled non-coding transcripts:" >> transcriptome_metrics.txt 77 | grep ">" known-transcripts-lncRNA.fa -c >> transcriptome_metrics.txt 78 | echo "" >> transcriptome_metrics.txt 79 | echo "Number of other expressed features, annotated:" >> transcriptome_metrics.txt 80 | grep ">" known-transcripts-other.fa -c >> transcriptome_metrics.txt 81 | echo "" >> transcriptome_metrics.txt 82 | echo "Number of non-annotated coding transcripts:" >> transcriptome_metrics.txt 83 | grep ">" novel-transcripts-coding.fa -c >> transcriptome_metrics.txt 84 | echo "" >> transcriptome_metrics.txt 85 | echo "Number of non-annotated non-coding novel transcripts:" >> transcriptome_metrics.txt 86 | grep ">" novel-transcripts-lncRNA.fa -c >> transcriptome_metrics.txt 87 | echo "" >> transcriptome_metrics.txt 88 | echo "Number of novel expressed features:" >> transcriptome_metrics.txt 89 | grep ">" novel-transcripts-other.fa -c >> transcriptome_metrics.txt 90 | exec 3>&- 91 | echo "::: Done. transcriptome_metrics.txt contains metrics of classified transcripts. Continue with gene metrics.." 
92 | echo "" 93 | echo "" 94 | echo "===> Working on gene metrics" 95 | echo "" 96 | exec 3<> gene_metrics.txt 97 | # known coding genes counts 98 | perl -lne 'print "@m" if @m=(/((?:transcript_id|gene_id)\s+\S+)/g);' known-genes-coding.gtf > known-genes-coding.tab 99 | sed -i 's/transcript_id //g' known-genes-coding.tab 100 | sed -i 's/;/\t/g' known-genes-coding.tab 101 | sed -i 's/gene_id//g' known-genes-coding.tab 102 | sed -i 's/"//g' known-genes-coding.tab 103 | awk '{print $1}' known-genes-coding.tab > known-genes-coding.tabular && rm known-genes-coding.tab 104 | awk '!a[$0]++' known-genes-coding.tabular > known-genes-coding.tab && rm known-genes-coding.tabular 105 | echo "Number of reconciled coding genes:" >> gene_metrics.txt 106 | cat known-genes-coding.tab | wc -l >> gene_metrics.txt 107 | echo "" >> gene_metrics.txt 108 | echo "Number of reconciled non-coding genes:" >> gene_metrics.txt 109 | perl -lne 'print "@m" if @m=(/((?:transcript_id|gene_id)\s+\S+)/g);' known-genes-lncRNA.gtf > known-genes-lncRNA.tab 110 | sed -i 's/transcript_id //g' known-genes-lncRNA.tab 111 | sed -i 's/;/\t/g' known-genes-lncRNA.tab 112 | sed -i 's/gene_id//g' known-genes-lncRNA.tab 113 | sed -i 's/"//g' known-genes-lncRNA.tab 114 | awk '{print $1}' known-genes-lncRNA.tab > known-genes-lncRNA.tabular && rm known-genes-lncRNA.tab 115 | awk '!a[$0]++' known-genes-lncRNA.tabular > known-genes-lncRNA.tab && rm known-genes-lncRNA.tabular 116 | cat known-genes-lncRNA.tab | wc -l >> gene_metrics.txt 117 | echo "" >> gene_metrics.txt 118 | echo "Number of other reconciled genes:" >> gene_metrics.txt 119 | perl -lne 'print "@m" if @m=(/((?:transcript_id|gene_id)\s+\S+)/g);' known-genes-other.gtf > known-genes-other.tab 120 | sed -i 's/transcript_id //g' known-genes-other.tab 121 | sed -i 's/;/\t/g' known-genes-other.tab 122 | sed -i 's/gene_id//g' known-genes-other.tab 123 | sed -i 's/"//g' known-genes-other.tab 124 | awk '{print $1}' known-genes-other.tab > known-genes-other.tabular && rm known-genes-other.tab 125 | awk '!a[$0]++' known-genes-other.tabular > known-genes-other.tab && rm known-genes-other.tabular 126 | cat known-genes-other.tab | wc -l >> gene_metrics.txt 127 | echo "" >> gene_metrics.txt 128 | echo "Number of non-annotated coding genes:" >> gene_metrics.txt 129 | perl -lne 'print "@m" if @m=(/((?:transcript_id|gene_id)\s+\S+)/g);' novel-genes-coding.gtf > novel-genes-coding.tab 130 | sed -i 's/transcript_id //g' novel-genes-coding.tab 131 | sed -i 's/;/\t/g' novel-genes-coding.tab 132 | sed -i 's/gene_id//g' novel-genes-coding.tab 133 | sed -i 's/"//g' novel-genes-coding.tab 134 | awk '{print $1}' novel-genes-coding.tab > novel-genes-coding.tabular && rm novel-genes-coding.tab 135 | awk '!a[$0]++' novel-genes-coding.tabular > novel-genes-coding.tab && rm novel-genes-coding.tabular 136 | cat novel-genes-coding.tab | wc -l >> gene_metrics.txt 137 | echo "" >> gene_metrics.txt 138 | echo "Number of non-annotated non-coding genes:" >> gene_metrics.txt 139 | perl -lne 'print "@m" if @m=(/((?:transcript_id|gene_id)\s+\S+)/g);' novel-genes-lncRNA.gtf > novel-genes-lncRNA.tab 140 | sed -i 's/transcript_id //g' novel-genes-lncRNA.tab 141 | sed -i 's/;/\t/g' novel-genes-lncRNA.tab 142 | sed -i 's/gene_id//g' novel-genes-lncRNA.tab 143 | sed -i 's/"//g' novel-genes-lncRNA.tab 144 | awk '{print $1}' novel-genes-lncRNA.tab > novel-genes-lncRNA.tabular && rm novel-genes-lncRNA.tab 145 | awk '!a[$0]++' novel-genes-lncRNA.tabular > novel-genes-lncRNA.tab && rm novel-genes-lncRNA.tabular 146 | cat 
novel-genes-lncRNA.tab | wc -l >> gene_metrics.txt 147 | echo "" >> gene_metrics.txt 148 | echo "Number of non-annotated other genes:" >> gene_metrics.txt 149 | perl -lne 'print "@m" if @m=(/((?:transcript_id|gene_id)\s+\S+)/g);' novel-genes-other.gtf > novel-genes-other.tab 150 | sed -i 's/transcript_id //g' novel-genes-other.tab 151 | sed -i 's/;/\t/g' novel-genes-other.tab 152 | sed -i 's/gene_id//g' novel-genes-other.tab 153 | sed -i 's/"//g' novel-genes-other.tab 154 | awk '{print $1}' novel-genes-other.tab > novel-genes-other.tabular && rm novel-genes-other.tab 155 | awk '!a[$0]++' novel-genes-other.tabular > novel-genes-other.tab && rm novel-genes-other.tabular 156 | cat novel-genes-other.tab | wc -l >> gene_metrics.txt 157 | exec 3>&- 158 | echo "::: gene_metrics.txt were succesfully generated." 159 | echo "" 160 | mkdir transcriptome_metrics 161 | mv genes-and-transcripts.tab genes.tab novel-genes.tab known-genes.tab novel-genes.gtf known-genes.gtf known-genes-coding.gtf known-genes-lncRNA.gtf known-genes-other.gtf novel-genes-coding.gtf novel-genes-lncRNA.gtf novel-genes-other.gtf known-transcripts-coding.fa known-transcripts-lncRNA.fa known-transcripts-other.fa novel-transcripts-coding.fa novel-transcripts-lncRNA.fa novel-transcripts-other.fa known-genes-coding.tab known-genes-lncRNA.tab known-genes-other.tab novel-genes-coding.tab novel-genes-lncRNA.tab novel-genes-other.tab ./transcriptome_metrics/ 162 | mv transcriptome_metrics.txt gene_metrics.txt ./transcriptome_metrics/ 163 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::${NC}\n" 164 | echo "All Done. The transcripts were classified in the ./transcriptome_metrics subdirectory." 165 | echo "" 166 | echo "Transcript discoveries are summarized in transcriptome_metrics.txt file." 167 | echo "" 168 | echo "Gene discoveries are summarized in gene_metrics.txt file." 169 | echo "" 170 | echo "known-genes-coding.gtf, known-genes-lncRNA.gtf and known-genes-other.gtf contains reconciled annotation with reference, in GTF format" 171 | echo "" 172 | echo "novel-genes-coding.gtf, novel-genes-lncRNA.gtf and novel-genes-other.gtf contains novel annotation with reference, in GTF format" 173 | echo "" 174 | echo "known-transcripts-coding.fa, known-transcripts-lncRNA.fa and known-transcripts-other.fa contains reconciled classified transcripts, in FASTA format" 175 | echo "" 176 | echo "novel-transcripts-coding.fa, novel-transcripts-lncRNA.fa and novel-transcripts-other.fa contains novel classified transcripts, in FASTA format" 177 | echo "" 178 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::${NC}\n" 179 | -------------------------------------------------------------------------------- /bash_scripts/add_ncbi_annotation.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | { 6 | 7 | usage="$(basename "$0") [-h] [-a ] [-n ] [-r ] [-g ] [-c ] [-t ] [-o ] 8 | This pipeline will Overlap StringTie transcripts (GTF format) with current NCBI annotation and will annotate novel transcripts. 
9 | Arguments: 10 | -h show this help text 11 | -a StringTie GTF 12 | -n NCBI gene annotation (in GTF format) 13 | -r UCSC gene annotation (in GTF format) 14 | -g Reference genome (in fasta format) 15 | -c GAWN config file (path to gawn_config.sh in annotate_my_genomes folder) 16 | -t Number of threads for processing (integer) 17 | -o output folder (must exist)" 18 | options=':ha:n:r:g:c:t:o:' 19 | while getopts $options option; do 20 | case "$option" in 21 | h) echo "$usage"; exit;; 22 | a) a=$OPTARG;; 23 | n) n=$OPTARG;; 24 | r) r=$OPTARG;; 25 | g) g=$OPTARG;; 26 | c) c=$OPTARG;; 27 | t) t=$OPTARG;; 28 | o) o=$OPTARG;; 29 | :) printf "missing argument for -%s\n" "$OPTARG" >&2; echo "$usage" >&2; exit 1;; 30 | \?) printf "illegal option: -%s\n" "$OPTARG" >&2; echo "$usage" >&2; exit 1;; 31 | esac 32 | done 33 | 34 | # mandatory arguments 35 | if [ ! "$a" ] || [ ! "$n" ] || [ ! "$r" ] || [ ! "$g" ] || [ ! "$c" ] || [ ! "$t" ] || [ ! "$o" ]; then 36 | echo "arguments -a, -n, -r, -g, -c, -t and -o must be provided" 37 | echo "$usage" >&2; exit 1 38 | fi 39 | 40 | # Conditions : output folder 41 | if [ ! -d "$o" ]; then 42 | echo "Output directory: $o not found. Please create the output directory first, before running the pipeline." 43 | exit 9999 # die with error code 9999 44 | fi 45 | 46 | # Conditions : Input existance if [ ! "$a" ] || [ ! "$n" ] || [ ! "$r" ] || [ ! "$g" ] || [ ! "$c" ] || [ ! "$t" ] || [ ! "$o" ]; then 47 | 48 | if [ ! -e "$a" ]; then 49 | echo "$a does not exist. Check your -a input" 50 | exit 9999 # die with error code 9999 51 | fi 52 | 53 | if [ ! -e "$n" ]; then 54 | echo "$n does not exist. Check your -n input" 55 | exit 9999 # die with error code 9999 56 | fi 57 | 58 | if [ ! -e "$r" ]; then 59 | echo "$r does not exist. Check your -r input" 60 | exit 9999 # die with error code 9999 61 | fi 62 | 63 | if [ ! -e "$g" ]; then 64 | echo "$g does not exist. Check your -g input" 65 | exit 9999 # die with error code 9999 66 | fi 67 | 68 | if [ ! -e "$c" ]; then 69 | echo "$c does not exist. Check your -c input" 70 | exit 9999 # die with error code 9999 71 | fi 72 | 73 | # Conditions : Getting absolute path of inputs 74 | echo "" 75 | a_DIR="$( cd "$( dirname "$a" )" && pwd )" 76 | echo "" 77 | echo "::: The absolute path of -a is $a_DIR" 78 | echo "" 79 | n_DIR="$( cd "$( dirname "$n" )" && pwd )" 80 | echo "" 81 | echo "::: The absolute path of -n is $n_DIR" 82 | echo "" 83 | r_DIR="$( cd "$( dirname "$r" )" && pwd )" 84 | echo "" 85 | echo "::: The absolute path of -r is $r_DIR" 86 | echo "" 87 | g_DIR="$( cd "$( dirname "$g" )" && pwd )" 88 | echo "" 89 | echo "::: The absolute path of -g is $g_DIR" 90 | echo "" 91 | c_DIR="$( cd "$( dirname "$c" )" && pwd )" 92 | echo "" 93 | echo "::: The absolute path of -c is $c_DIR" 94 | echo "" 95 | o_DIR="$( cd "$( dirname "$o" )" && pwd )" 96 | echo "" 97 | echo "::: The absolute path of -o is $o_DIR" 98 | echo "" 99 | 100 | 101 | begin=`date +%s` 102 | # .---------- constant part! 
103 | # vvvv vvvv-- the code from above 104 | YELLOW='\033[1;33m' 105 | PURPLE='\033[0;35m' 106 | CYAN='\033[0;36m' 107 | NC='\033[0m' # No Color 108 | 109 | 110 | printf "${YELLOW}::: Defining Variables :::\n" 111 | echo "" 112 | echo "Defining variables:" 113 | echo"" 114 | FILE1="$a" 115 | basename "$FILE1" 116 | stringtie_input="$(basename -- $FILE1)" 117 | echo "The stringtie file used as input is the following: $stringtie_input" 118 | echo "" 119 | FILE2="$n" 120 | basename "$FILE2" 121 | ncbi_reference_gtf="$(basename -- $FILE2)" 122 | echo "The NCBI reference GTF used as input is the following: $ncbi_reference_gtf" 123 | echo "" 124 | FILE3="$r" 125 | basename "$FILE3" 126 | reference_gtf="$(basename -- $FILE3)" 127 | echo "The reference GTF used as input is the following: $reference_gtf" 128 | echo "" 129 | FILE4="$g" 130 | basename "$FILE4" 131 | reference_genome="$(basename -- $FILE4)" 132 | echo "The reference genome used as input is the following: $reference_genome" 133 | echo "" 134 | FILE5="$c" 135 | basename "$FILE5" 136 | gawn_config="$(basename -- $FILE5)" 137 | echo "The gawn_config file used as input is the following: $gawn_config" 138 | echo "" 139 | FILE6="$t" 140 | basename "$FILE6" 141 | threads="$(basename -- $FILE6)" 142 | echo "The number of threads for calculation are the following: $threads" 143 | echo "" 144 | FILE7="$o" 145 | basename "$FILE7" 146 | output_folder="$(basename -- $FILE7)" 147 | echo "The output folder is the following: $output_folder" 148 | echo "" 149 | 150 | printf "${YELLOW}:::::::::::::::::::::::::::::::\n" 151 | printf "${YELLOW}::: 0. Defining directories :::\n" 152 | printf "${YELLOW}:::::::::::::::::::::::::::::::${CYAN}\n" 153 | echo "" 154 | 155 | dir0=$(cd -P -- "$(dirname -- "$0")" && pwd -P) 156 | 157 | sec=$(date "+%Y%m%d_%H%M%S") 158 | # mkdir add_ncbi_annotation_$sec 159 | 160 | if [ -z "$(ls -A ${o_DIR}/${output_folder})" ]; then 161 | echo "" 162 | echo "Output folder is empty. We will work inside the provided output folder: " 163 | cd ${o_DIR}/${output_folder} 164 | echo "" 165 | else 166 | echo "" 167 | echo "Output folder is not empty. Creating temporary folder:" 168 | sec=$(date "+%Y%m%d_%H%M%S") 169 | cd ${o_DIR}/${output_folder} 170 | mkdir add_ncbi_annotation_$sec && cd add_ncbi_annotation_$sec 171 | fi 172 | 173 | # cd ${o_DIR}/${output_folder} 174 | 175 | if [ -f $stringtie_input ]; then 176 | echo "" 177 | echo "$stringtie_input file found in output directory. Continue." 178 | echo "" 179 | : 180 | else 181 | echo "" 182 | echo "Copying $stringtie_input file into the output directory:" 183 | cp ${a_DIR}/${stringtie_input} ./ 184 | echo "" 185 | fi 186 | 187 | # cp ${a_DIR}/${stringtie_input} ${o_DIR}/${output_folder} 188 | # cd add_ncbi_annotation_$sec 189 | 190 | dir1=$(cd -P -- "$(dirname -- "$0")" && pwd -P) 191 | echo "" 192 | echo "Current Working Directory:" 193 | echo "" 194 | echo $dir1 195 | echo "" 196 | printf "${YELLOW}::: Done :::\n" 197 | echo "" 198 | 199 | printf "${YELLOW}:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n" 200 | printf "${YELLOW}::: 1. Overlapping StringTie transcripts with NCBI annotation :::\n" 201 | printf "${YELLOW}:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::${CYAN}\n" 202 | echo "" 203 | 204 | gffcompare -R -r ${n_DIR}/${ncbi_reference_gtf} -s ${g_DIR}/${reference_genome} -o NCBI_compare ${stringtie_input} 205 | echo "Done." 
206 | printf "${PURPLE}::: Done :::\n" 207 | echo "" 208 | 209 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::\n" 210 | printf "${YELLOW}::: 2. Writting novel discoveries to Stats.txt :::\n" 211 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::${CYAN}\n" 212 | 213 | echo "" 214 | # Stats 215 | exec 3<> Stats.txt 216 | echo "Number of assembled genes:" >> Stats.txt 217 | cat NCBI_compare.${stringtie_input}.tmap | sed "1d" | cut -f4 | sort | uniq | wc -l >> Stats.txt 218 | echo "" >> Stats.txt 219 | echo "Number of novel genes:" >> Stats.txt 220 | cat NCBI_compare.${stringtie_input}.tmap | awk '$3=="u"{print $0}' | cut -f4 | sort | uniq | wc -l >> Stats.txt 221 | echo "" >> Stats.txt 222 | echo "Number of novel transcripts:" >> Stats.txt 223 | cat NCBI_compare.${stringtie_input}.tmap | awk '$3=="u"{print $0}' | cut -f5 | sort | uniq | wc -l >> Stats.txt 224 | echo "" >> Stats.txt 225 | echo "Number of transcripts matching annotation:" >> Stats.txt 226 | cat NCBI_compare.${stringtie_input}.tmap | awk '$3=="="{print $0}' | cut -f5 | sort | uniq | wc -l >> Stats.txt 227 | exec 3>&- 228 | printf "${PURPLE}Done\n" 229 | echo "" 230 | 231 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n" 232 | printf "${YELLOW}::: 3. Replacing gene_id field in final_annotated.gtf file with NCBI gene_id's :::\n" 233 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::${CYAN}\n" 234 | 235 | echo "" 236 | ####################################### 237 | # Merging novel transcripts with ref. # 238 | ####################################### 239 | awk '{print $4"\t"$1}' NCBI_compare.${stringtie_input}.tmap > NCBI_compare.${stringtie_input}.tmap.1 240 | tail -n +2 NCBI_compare.${stringtie_input}.tmap.1 > NCBI_compare.${stringtie_input}.tmap.2 241 | awk '$2 != "-"' NCBI_compare.${stringtie_input}.tmap.2 > namelist 242 | awk '!a[$0]++' namelist > namelist_unique 243 | tac namelist_unique > namelist_unique_sorted 244 | rm namelist namelist_unique 245 | awk '{print $1}' namelist_unique_sorted > A 246 | awk '{print $2}' namelist_unique_sorted > B 247 | sed 's/^/"/' A > A.1 248 | sed 's/$/"/' A.1 > A.2 249 | sed 's/^/"/' B > B.1 250 | sed 's/$/"/' B.1 > B.2 251 | paste -d'\t' A.2 B.2 > namelist 252 | rm A A.1 A.2 B B.1 B.2 253 | ############################### 254 | # Getting gene names replaced # 255 | ############################### 256 | awk '{print $1}' namelist > fileA 257 | awk '{print $2}' namelist > fileB 258 | paste -d % fileA fileB > sed.script 259 | sed -i -e 's/^/s%/' sed.script 260 | sed -i -e 's/$/%/' sed.script 261 | cat ${a_DIR}/${stringtie_input} | parallel --pipe -j ${t} sed -f sed.script > final_annotated.gtf 262 | rm -f fileA fileB *tmap.1 *tmap.2 263 | # sorting GTF file 264 | rm -r -f gff3sort 265 | git clone https://github.com/cfarkas/gff3sort.git 266 | perl ./gff3sort/gff3sort.pl final_annotated.gtf > final_annotated.sorted.gtf 267 | rm final_annotated.gtf 268 | mv final_annotated.sorted.gtf final_annotated.gtf 269 | printf "${PURPLE}::: Done. 
Gene_id field was replaced in the StringTie.gtf file and final_annotated.gtf was generated with these changes\n" 270 | echo "" 271 | printf "${PURPLE}::: Moving gffcompare results to gffcompare_outputs folder ...\n" 272 | echo "" 273 | rm -r -f gffcompare_outputs_NCBI 274 | mkdir gffcompare_outputs_NCBI 275 | mv *.loci *.stats *.refmap *.tmap *.tracking ./gffcompare_outputs_NCBI 276 | echo "" 277 | printf "${PURPLE}::: Done\n" 278 | echo "" 279 | 280 | printf "${YELLOW}:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n" 281 | printf "${YELLOW}::: 4. Obtaining Transcripts in FASTA format with gffread :::\n" 282 | printf "${YELLOW}:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::${CYAN}\n" 283 | 284 | echo "" 285 | gffread -w NCBI_transcripts.fa -g ${g_DIR}/${reference_genome} final_annotated.gtf 286 | echo "" 287 | printf "${PURPLE}::: Done. NCBI_transcripts.fa are located in current directory\n" 288 | echo "" 289 | 290 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n" 291 | printf "${YELLOW}::: 5. Performing gene annotation by using GAWN pipeline :::\n" 292 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::${CYAN}\n" 293 | 294 | ################################################################ 295 | # Configuring Gawn Inputs, config file and running GAWN pipeline 296 | ################################################################ 297 | 298 | echo "" 299 | printf "${PURPLE}::: Downloading GAWN annotation folder. See https://github.com/enormandeau/gawn.git${CYAN}\n" 300 | echo "" 301 | rm -r -f gawn 302 | git clone https://github.com/cfarkas/gawn.git 303 | cd gawn/02_infos/ 304 | dir2=$(cd -P -- "$(dirname -- "$0")" && pwd -P) 305 | echo "Done" 306 | echo "" 307 | cd ${dir1} 308 | cp ${g_DIR}/${reference_genome} ${dir1}/gawn/03_data/genome.fasta 309 | cp NCBI_transcripts.fa ${dir1}/gawn/03_data/transcriptome.fasta 310 | rm ${dir2}/gawn_config.sh 311 | cp ${c_DIR}/${gawn_config} ${dir2}/gawn_config.sh 312 | echo "" 313 | printf "${PURPLE}::: Starting GAWN transcript annotation${CYAN}\n" 314 | echo "" 315 | cd ${dir1}/gawn/ 316 | ./gawn 02_infos/gawn_config.sh 317 | echo "" 318 | printf "${PURPLE}::: Done. The novel transcripts were annotated in ./gawn/04_annotation/ :::${CYAN}\n" 319 | echo "" 320 | 321 | ################################# 322 | # Extracting transcriptome hits # 323 | ################################# 324 | 325 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::\n" 326 | printf "${YELLOW}::: 6. Extracting transcriptome hits :::\n" 327 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::${CYAN}\n" 328 | 329 | echo "" 330 | cd ${dir1}/ 331 | cp ${dir1}/gawn/04_annotation/transcriptome.swissprot ${dir1} 332 | cp ${dir1}/gawn/04_annotation/transcriptome.hits ${dir1} 333 | printf "${PURPLE}::: Done. transcriptome hits were succesfully extracted :::${CYAN}\n" 334 | echo "" 335 | 336 | ############################################ 337 | # FEELnc long noncoding RNA identification # 338 | ############################################ 339 | 340 | cd ${dir1} 341 | 342 | printf "${YELLOW}:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n" 343 | printf "${YELLOW}::: 7. 
Classifying protein-coding and long non-coding transcripts with FEELnc :::\n" 344 | printf "${YELLOW}:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::${CYAN}\n" 345 | 346 | grep "NM_" ${r_DIR}/${reference_gtf} > NM_coding.gtf 347 | echo "" 348 | printf "${PURPLE}::: 1/3) Filtering transcripts :::${CYAN}\n" 349 | # Filter 350 | FEELnc_filter.pl -i final_annotated.gtf -a NM_coding.gtf -b transcript_biotype=protein_coding > candidate_lncRNA.gtf 351 | rm -r -f ${g_DIR}/${reference_genome}.index 352 | printf "${PURPLE}::: 2/3) Evaluating coding potential :::${CYAN}\n" 353 | # Coding_Potential 354 | FEELnc_codpot.pl -i candidate_lncRNA.gtf -a NM_coding.gtf -b transcript_biotype=protein_coding -g ${g_DIR}/${reference_genome} --mode=shuffle 355 | printf "${PURPLE}::: 3/3) Classifiyng lncRNA transcripts :::${CYAN}\n" 356 | # Classifier 357 | FEELnc_classifier.pl -i feelnc_codpot_out/candidate_lncRNA.gtf.lncRNA.gtf -a NM_coding.gtf > candidate_lncRNA_classes.txt 358 | echo "" 359 | printf "${PURPLE}::: FEELnc calculations were done. The output is called candidate_lncRNA_classes.txt :::\n" 360 | echo "" 361 | 362 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::\n" 363 | printf "${YELLOW}::: 8. Parsing GAWN and FEELnc outputs :::\n" 364 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::${CYAN}\n" 365 | 366 | echo "" 367 | cd ${dir1} 368 | awk '{print $3}' candidate_lncRNA_classes.txt > lncRNA_genes 369 | tail -n +2 lncRNA_genes > lncRNA_transcripts 370 | rm lncRNA_genes 371 | grep -w -F -f lncRNA_transcripts final_annotated.gtf > merged.fixed.lncRNAs.gtf 372 | grep --invert-match -F -f lncRNA_transcripts final_annotated.gtf > merged.fixed.coding.gtf 373 | rm final_annotated.gtf 374 | sed -i 's/StringTie/lncRNA/' merged.fixed.lncRNAs.gtf 375 | awk '{print $1"\t"$2}' transcriptome.hits > coding_list 376 | awk -F'\t' '$2!=""' coding_list > coding_transcripts 377 | awk '{print $1}' coding_transcripts > coding_transcripts.tab 378 | rm coding_lis* coding_transcripts lncRNA_transcripts 379 | grep -w -F -f coding_transcripts.tab merged.fixed.coding.gtf > coding-genes.gtf 380 | grep --invert-match -F -f coding_transcripts.tab merged.fixed.coding.gtf > other-genes.gtf 381 | cat coding-genes.gtf merged.fixed.lncRNAs.gtf other-genes.gtf > final_annotated.gtf 382 | rm coding_transcripts.tab 383 | # sorting GTF file 384 | perl ./gff3sort/gff3sort.pl coding-genes.gtf > coding-genes.sorted.gtf 385 | rm coding-genes.gtf 386 | mv coding-genes.sorted.gtf coding-genes.gtf 387 | echo "All done" 388 | echo "" 389 | ########################################## 390 | # Gene Prediction Step with TransDecoder # 391 | ########################################## 392 | cd ${dir1} 393 | echo "" 394 | 395 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n" 396 | printf "${YELLOW}::: 9. 
Predicting coding regions from transcripts with coding potential using TransDecoder :::\n" 397 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::${CYAN}\n" 398 | 399 | echo "" 400 | gffread -w coding-transcripts.fa -g ${g_DIR}/${reference_genome} coding-genes.gtf 401 | TransDecoder.LongOrfs -m 60 -t coding-transcripts.fa 402 | TransDecoder.Predict -t coding-transcripts.fa --single_best_only 403 | awk '{print $1}' coding-transcripts.fa.transdecoder.bed > coding.sequences 404 | tail -n +2 coding.sequences > coding.hits && rm coding.sequences 405 | echo "" 406 | printf "${PURPLE}::: Done. coding-transcripts.fa.transdecoder.gff3 file is present in current directory...${CYAN}\n" 407 | echo "" 408 | 409 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n" 410 | printf "${YELLOW}::: 10. Converting gff3 to GTF format and formatting coding sequences and proteins :::\n" 411 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::${CYAN}\n" 412 | 413 | echo "" 414 | sed 's/Name=.*$//' coding-transcripts.fa.transdecoder.gff3 > coding-transcripts.fa.test.gff3 415 | sed -i 's/ID=GENE[.]/ID=/'g coding-transcripts.fa.test.gff3 416 | sed -i 's/Parent=GENE[.]/Parent=/'g coding-transcripts.fa.test.gff3 417 | sed -i 's/~~/;protein_id=/'g coding-transcripts.fa.test.gff3 418 | gffread coding-transcripts.fa.test.gff3 -T -P -g NCBI_transcripts.fa -o coding_transcripts.gtf 419 | rm coding-transcripts.fa.test.gff3 420 | # removing protein id by expansion 421 | sed -i 's/[.]p[0-9]//'g coding_transcripts.gtf 422 | sed -i 's/[.]p[0-9][0-9]//'g coding_transcripts.gtf 423 | sed -i 's/[.]p[0-9][0-9][0-9]//'g coding_transcripts.gtf 424 | sed -i 's/[.]p[0-9][0-9][0-9][0-9]//'g coding_transcripts.gtf 425 | sed -i 's/[.]p[0-9][0-9][0-9][0-9][0-9]//'g coding_transcripts.gtf 426 | # 427 | # obtaining cds.fa and prot.fa from coding_transcripts.gtf 428 | echo "" 429 | echo "::: Obtaining cds.fa and prot.fa from coding_transcripts.gtf" 430 | echo "" 431 | gffread -x cds.fa -g NCBI_transcripts.fa coding_transcripts.gtf 432 | gffread -y prot.fa -g NCBI_transcripts.fa coding_transcripts.gtf 433 | echo "done" 434 | rm coding-transcripts.fa coding-genes.gtf merged.fixed.lncRNAs.gtf other-genes.gtf 435 | grep "StringTie" final_annotated.gtf > genes.gtf 436 | grep "lncRNA" final_annotated.gtf > lncRNAs.gtf 437 | grep -w -F -f coding.hits genes.gtf > coding-genes.gtf 438 | grep --invert-match -F -f coding.hits genes.gtf > other-genes.gtf 439 | sed -i 's/StringTie/coding/' coding-genes.gtf 440 | cat coding-genes.gtf lncRNAs.gtf other-genes.gtf > final_annotated.gtf 441 | echo "" 442 | echo "::: Parsing transcriptome hits" 443 | echo "" 444 | grep -w -F -f coding.hits transcriptome.swissprot > coding.annotation 445 | rm transcriptome.swissprot 446 | mv coding.annotation transcriptome.swissprot 447 | echo "done" 448 | # sorting GTF file 449 | echo "" 450 | echo "::: Sorting final_annotated.gtf" 451 | echo "" 452 | perl ./gff3sort/gff3sort.pl final_annotated.gtf > final_annotated.sorted.gtf 453 | echo "done" 454 | rm final_annotated.gtf 455 | mv final_annotated.sorted.gtf final_annotated.gtf 456 | rm coding-genes.gtf lncRNAs.gtf other-genes.gtf transcriptome.hits 457 | ### Novel coding genes and correspondent proteins 458 | echo "" 459 | echo "::: Obtaining novel coding transcripts (cds) and correspondent proteins" 460 | echo "" 461 | # 462 | wget 
https://raw.githubusercontent.com/cfarkas/annotate_my_genomes/master/additional_scripts/transcriptome_metrics.sh 463 | bash transcriptome_metrics.sh -f final_annotated.gtf -g ${g_DIR}/${reference_genome} 464 | cp ./transcriptome_metrics/known-genes-coding.gtf ./ 465 | cp ./transcriptome_metrics/novel-genes-coding.gtf ./ 466 | cp ./transcriptome_metrics/novel-transcripts-lncRNA.fa ./ 467 | cp ./transcriptome_metrics/known-transcripts-lncRNA.fa ./ 468 | # 469 | perl -lne 'print "@m" if @m=(/((?:transcript_id|gene_id)\s+\S+)/g);' novel-genes-coding.gtf > novel_annotated.tab 470 | awk '{print $(NF)}' novel_annotated.tab > novel-coding-transcripts.matches 471 | sed -i 's/;//g' novel-coding-transcripts.matches 472 | sed -i 's/"//g' novel-coding-transcripts.matches 473 | awk '!a[$0]++' novel-coding-transcripts.matches > novel-coding-transcripts.tab && rm novel-coding-transcripts.matches 474 | mv novel-coding-transcripts.tab novel-coding-transcripts.matches 475 | # 476 | seqkit fx2tab cds.fa > cds.tab 477 | seqkit fx2tab prot.fa > prot.tab 478 | grep -w -F -f novel-coding-transcripts.matches cds.tab > novel-coding-cds.tab 479 | grep -w -F -f novel-coding-transcripts.matches prot.tab > novel-coding-prot.tab 480 | seqkit tab2fx novel-coding-cds.tab > novel-cds.fa && seqkit tab2fx novel-coding-prot.tab > novel-prot.fa 481 | rm -r -f novel-coding-cds.tab novel-coding-prot.tab novel-coding-transcripts.matches cds.tab prot.tab 482 | # obtaining final gff file 483 | echo "" 484 | echo "::: Obtaining final gff file" 485 | echo "" 486 | gffread -E -F --merge final_annotated.gtf -o final_annotated.gff 487 | rm -r -f gff3sort 488 | echo "done" 489 | echo "" 490 | rm merged.fixed.coding.gtf namelist namelist_unique_sorted coding.hits 491 | 492 | ############################### 493 | # Configuring Summary Results # 494 | ############################### 495 | 496 | printf "${YELLOW}:::::::::::::::::::::::::::::::::::::::::::::::::::::\n" 497 | printf "${YELLOW}::: 11. 
Moving results to the specified directory :::\n" 498 | printf "${YELLOW}:::::::::::::::::::::::::::::::::::::::::::::::::::::${CYAN}\n" 499 | 500 | echo "" 501 | printf "${PURPLE}::: Moving results to the specified directory :::${CYAN}\n" 502 | rm -r -f output_files 503 | mkdir output_files 504 | mv candidate_lncRNA_classes.txt final_annotated.gtf final_annotated.gff NCBI_transcripts.fa cds.fa prot.fa Stats.txt coding_transcripts.gtf transcriptome.swissprot novel-cds.fa novel-prot.fa sed.script novel-transcripts-lncRNA.fa known-transcripts-lncRNA.fa known-genes-coding.gtf novel-genes-coding.gtf ./output_files 505 | rm -r -f *feelncfilter.log genes.gtf pipeliner* NM_coding.gtf candidate_lncRNA.gtf* coding-transcripts.fa.transdecoder_dir.__* NCBI_transcripts.fa.fai 506 | rm -r -f transdecoder 507 | mkdir transdecoder 508 | mv coding-transcripts.fa.transdecoder.* ./transdecoder 509 | mv NCBI_compare.annotated.gtf ./gffcompare_outputs_NCBI 510 | cp ${dir1}/gffcompare_outputs_NCBI/NCBI_compare.${stringtie_input}.tmap ./ 511 | mv NCBI_compare.${stringtie_input}.tmap gffcompare.tmap 512 | mv gffcompare.tmap ./output_files/ 513 | 514 | # cd ${dir0} 515 | # mv add_ncbi_annotation_$sec ${o_DIR}/${output_folder} 516 | 517 | cd ${dir0} 518 | 519 | echo "Done" 520 | echo "" 521 | printf "${YELLOW}::: Done:::\n" 522 | echo "" 523 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n" 524 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n" 525 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::${CYAN}\n" 526 | echo "" 527 | echo "The following files are available in ${dir1}/output_files : " 528 | echo "" 529 | echo "Transcript discoveries are summarized in Stats.txt file. GAWN protein annotation is called transcriptome.hits" 530 | echo "" 531 | echo "gffcompare.tmap file contains Best Reference Transcript for each assembled transcript" 532 | echo "" 533 | echo "GTF file named final_annotated.gtf (and correspondent gff file) contain novel genes and lncRNA classification (second field in GTF file)" 534 | echo "" 535 | echo "candidate_lncRNA_classes.txt contained detailed long non-coding classification of transcripts" 536 | echo "" 537 | echo "Associated FASTA file to this GTF correspond to NCBI_transcripts.fa file" 538 | echo "" 539 | echo "TransDecoder GTF file suitable to parse NCBI_transcripts.fa (coding_transcripts.gtf), contains all coding transcripts resolved by TransDecoder" 540 | echo "" 541 | echo "Predicted coding sequences and correspondent protein sequences were named cds.fa and prot.fa, respectively" 542 | echo "" 543 | echo "Novel predicted coding sequences and correspondent protein sequences were named novel-cds.fa and novel-prot.fa, respectively" 544 | echo "" 545 | echo "Novel and Known predicted lncRNAs were named novel-transcripts-lncRNA.fa and known-transcripts-lncRNA.fa, respectively" 546 | echo "" 547 | echo "Novel and Known coding genes were named novel-genes-coding.gtf and known-genes-coding.gtf, respectively" 548 | echo "" 549 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n" 550 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n" 551 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::${NC}\n" 552 | end=`date +%s` 553 | elapsed=`expr $end - $begin` 554 | echo Time taken: $elapsed 555 | # 556 | } | tee logfile_add_ncbi_annotation_$seconds 557 | # 558 | 
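The add_ncbi_annotation.sh pipeline above expects a StringTie assembly, the NCBI and UCSC reference GTFs, the genome FASTA, a GAWN configuration file, a thread count and an existing output directory. A minimal sketch of an invocation, assuming the bundled test/gawn_config.sh is used; all other paths are illustrative:

    mkdir -p output_dir
    bash bash_scripts/add_ncbi_annotation.sh \
        -a stringtie.gtf \
        -n ncbi_annotation.gtf \
        -r ucsc_annotation.gtf \
        -g genome.fasta \
        -c test/gawn_config.sh \
        -t 8 \
        -o output_dir

The script checks that the output directory exists before running, so it must be created beforehand.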
-------------------------------------------------------------------------------- /bash_scripts/annotate_my_genomes.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | { 6 | 7 | usage="$(basename "$0") [-h] [-a ] [-r ] [-g ] [-c ] [-t ] 8 | This pipeline will Overlap StringTie transcripts (GTF format) with current UCSC annotation and will annotate novel transcripts. 9 | Arguments: 10 | -h show this help text 11 | -a StringTie GTF 12 | -r UCSC gene annotation (in GTF format) 13 | -g Reference genome (in fasta format) 14 | -c GAWN config file (path to gawn_config.sh in annotate_my_genomes folder) 15 | -t Number of threads for processing (integer) 16 | -o output folder (must exist)" 17 | options=':ha:r:g:c:t:o:' 18 | while getopts $options option; do 19 | case "$option" in 20 | h) echo "$usage"; exit;; 21 | a) a=$OPTARG;; 22 | r) r=$OPTARG;; 23 | g) g=$OPTARG;; 24 | c) c=$OPTARG;; 25 | t) t=$OPTARG;; 26 | o) o=$OPTARG;; 27 | :) printf "missing argument for -%s\n" "$OPTARG" >&2; echo "$usage" >&2; exit 1;; 28 | \?) printf "illegal option: -%s\n" "$OPTARG" >&2; echo "$usage" >&2; exit 1;; 29 | esac 30 | done 31 | 32 | # mandatory arguments 33 | if [ ! "$a" ] || [ ! "$r" ] || [ ! "$g" ] || [ ! "$c" ] || [ ! "$t" ] || [ ! "$o" ]; then 34 | echo "" 35 | echo "arguments -a, -r, -g, -c, -t and -o must be provided" 36 | echo "" 37 | echo "$usage" >&2; exit 1 38 | fi 39 | 40 | # Conditions : output folder 41 | if [ ! -d "$o" ]; then 42 | echo "" 43 | echo "Output directory: $o not found. Please create the output directory first, before running the pipeline." 44 | echo "" 45 | exit 9999 # die with error code 9999 46 | fi 47 | 48 | # Conditions : Input existance 49 | 50 | if [ ! -e "$a" ]; then 51 | echo "" 52 | echo "$a does not exist. Check your -a input" 53 | echo "" 54 | exit 9999 # die with error code 9999 55 | fi 56 | 57 | if [ ! -e "$r" ]; then 58 | echo "" 59 | echo "$r does not exist. Check your -r input" 60 | echo "" 61 | exit 9999 # die with error code 9999 62 | fi 63 | 64 | if [ ! -e "$g" ]; then 65 | echo "" 66 | echo "$g does not exist. Check your -g input" 67 | echo "" 68 | exit 9999 # die with error code 9999 69 | fi 70 | 71 | if [ ! -e "$c" ]; then 72 | echo "" 73 | echo "$c does not exist. Check your -c input" 74 | echo "" 75 | exit 9999 # die with error code 9999 76 | fi 77 | 78 | # Conditions : Getting absolute path of inputs 79 | echo "" 80 | a_DIR="$( cd "$( dirname "$a" )" && pwd )" 81 | echo "" 82 | echo "::: The absolute path of -a is $a_DIR" 83 | echo "" 84 | r_DIR="$( cd "$( dirname "$r" )" && pwd )" 85 | echo "" 86 | echo "::: The absolute path of -r is $r_DIR" 87 | echo "" 88 | g_DIR="$( cd "$( dirname "$g" )" && pwd )" 89 | echo "" 90 | echo "::: The absolute path of -g is $g_DIR" 91 | echo "" 92 | c_DIR="$( cd "$( dirname "$c" )" && pwd )" 93 | echo "" 94 | echo "::: The absolute path of -c is $c_DIR" 95 | echo "" 96 | o_DIR="$( cd "$( dirname "$o" )" && pwd )" 97 | echo "" 98 | echo "::: The absolute path of -o is $o_DIR" 99 | echo "" 100 | 101 | begin=`date +%s` 102 | # .---------- constant part! 
103 | # vvvv vvvv-- the code from above 104 | YELLOW='\033[1;33m' 105 | PURPLE='\033[0;35m' 106 | CYAN='\033[0;36m' 107 | NC='\033[0m' # No Color 108 | 109 | printf "${YELLOW}::: Defining Variables :::\n" 110 | echo "" 111 | echo "Defining variables:" 112 | echo"" 113 | FILE1="$a" 114 | basename "$FILE1" 115 | stringtie_input="$(basename -- $FILE1)" 116 | echo "The stringtie file used as input is the following: $stringtie_input" 117 | echo "" 118 | FILE2="$r" 119 | basename "$FILE2" 120 | reference_gtf="$(basename -- $FILE2)" 121 | echo "The reference GTF used as input is the following: $reference_gtf" 122 | echo "" 123 | FILE3="$g" 124 | basename "$FILE3" 125 | reference_genome="$(basename -- $FILE3)" 126 | echo "The reference genome used as input is the following: $reference_genome" 127 | echo "" 128 | FILE4="$c" 129 | basename "$FILE4" 130 | gawn_config="$(basename -- $FILE4)" 131 | echo "The gawn_config file used as input is the following: $gawn_config" 132 | echo "" 133 | FILE5="$t" 134 | basename "$FILE5" 135 | threads="$(basename -- $FILE5)" 136 | echo "The number of threads for calculation are the following: $threads" 137 | echo "" 138 | FILE6="$o" 139 | basename "$FILE6" 140 | output_folder="$(basename -- $FILE6)" 141 | echo "The output folder is the following: $output_folder" 142 | echo "" 143 | 144 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n" 145 | printf "${YELLOW}::: 0. Defining directories and StringTie input data :::\n" 146 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::${CYAN}\n" 147 | echo "" 148 | 149 | dir0=$(cd -P -- "$(dirname -- "$0")" && pwd -P) 150 | 151 | sec=$(date "+%Y%m%d_%H%M%S") 152 | # mkdir annotate_my_genomes_$sec 153 | 154 | if [ -z "$(ls -A ${o_DIR}/${output_folder})" ]; then 155 | echo "" 156 | echo "Output folder is empty. We will work inside the provided output folder:" 157 | echo "" 158 | cd ${o_DIR}/${output_folder} 159 | else 160 | echo "" 161 | echo "Output folder is not empty. Creating temporary folder:" 162 | echo "" 163 | sec=$(date "+%Y%m%d_%H%M%S") 164 | cd ${o_DIR}/${output_folder} 165 | mkdir annotate_my_genomes_$sec && cd annotate_my_genomes_$sec 166 | fi 167 | 168 | # cd ${o_DIR}/${output_folder} 169 | 170 | if [ -f $stringtie_input ]; then 171 | echo "" 172 | echo "$stringtie_input file found in output directory. Continue." 173 | echo "" 174 | : 175 | else 176 | echo "" 177 | echo "Copying $stringtie_input file into the output directory:" 178 | cp ${a_DIR}/${stringtie_input} ./ 179 | echo "" 180 | fi 181 | 182 | # cp ${a_DIR}/${stringtie_input} ${o_DIR}/${output_folder} 183 | # cd annotate_my_genomes_$sec 184 | 185 | dir1=$(cd -P -- "$(dirname -- "$0")" && pwd -P) 186 | echo "" 187 | echo "Current Working Directory:" 188 | echo "" 189 | echo $dir1 190 | echo "" 191 | printf "${YELLOW}::: Done :::\n" 192 | echo "" 193 | 194 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n" 195 | printf "${YELLOW}::: 1. Overlapping StringTie transcripts with UCSC GTF :::\n" 196 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::${CYAN}\n" 197 | echo "" 198 | 199 | gffcompare -R -r ${r_DIR}/${reference_gtf} -s ${g_DIR}/${reference_genome} -o UCSC_compare ${stringtie_input} 200 | echo "Done." 201 | printf "${PURPLE}::: Done :::\n" 202 | echo "" 203 | 204 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::\n" 205 | printf "${YELLOW}::: 2. 
Writting novel discoveries to Stats.txt :::\n" 206 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::${CYAN}\n" 207 | echo "" 208 | 209 | # Stats 210 | exec 3<> Stats.txt 211 | echo "Number of assembled genes:" >> Stats.txt 212 | cat UCSC_compare.${stringtie_input}.tmap | sed "1d" | cut -f4 | sort | uniq | wc -l >> Stats.txt 213 | echo "" >> Stats.txt 214 | echo "Number of novel genes:" >> Stats.txt 215 | cat UCSC_compare.${stringtie_input}.tmap | awk '$3=="u"{print $0}' | cut -f4 | sort | uniq | wc -l >> Stats.txt 216 | echo "" >> Stats.txt 217 | echo "Number of novel transcripts:" >> Stats.txt 218 | cat UCSC_compare.${stringtie_input}.tmap | awk '$3=="u"{print $0}' | cut -f5 | sort | uniq | wc -l >> Stats.txt 219 | echo "" >> Stats.txt 220 | echo "Number of transcripts matching annotation:" >> Stats.txt 221 | cat UCSC_compare.${stringtie_input}.tmap | awk '$3=="="{print $0}' | cut -f5 | sort | uniq | wc -l >> Stats.txt 222 | exec 3>&- 223 | printf "${PURPLE}Done\n" 224 | echo "" 225 | 226 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n" 227 | printf "${YELLOW}::: 3. Replacing gene_id field in final_annotated.gtf file with UCSC gene_id's :::\n" 228 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::${CYAN}\n" 229 | 230 | echo "" 231 | ####################################### 232 | # Merging novel transcripts with ref. # 233 | ####################################### 234 | awk '{print $4"\t"$1}' UCSC_compare.${stringtie_input}.tmap > UCSC_compare.${stringtie_input}.tmap.1 235 | tail -n +2 UCSC_compare.${stringtie_input}.tmap.1 > UCSC_compare.${stringtie_input}.tmap.2 236 | awk '$2 != "-"' UCSC_compare.${stringtie_input}.tmap.2 > namelist 237 | awk '!a[$0]++' namelist > namelist_unique 238 | tac namelist_unique > namelist_unique_sorted 239 | rm namelist namelist_unique 240 | awk '{print $1}' namelist_unique_sorted > A 241 | awk '{print $2}' namelist_unique_sorted > B 242 | sed 's/^/"/' A > A.1 243 | sed 's/$/"/' A.1 > A.2 244 | sed 's/^/"/' B > B.1 245 | sed 's/$/"/' B.1 > B.2 246 | paste -d'\t' A.2 B.2 > namelist 247 | rm A A.1 A.2 B B.1 B.2 248 | ############################### 249 | # Getting gene names replaced # 250 | ############################### 251 | awk '{print $1}' namelist > fileA 252 | awk '{print $2}' namelist > fileB 253 | paste -d % fileA fileB > sed.script 254 | sed -i -e 's/^/s%/' sed.script 255 | sed -i -e 's/$/%/' sed.script 256 | cat ${a_DIR}/${stringtie_input} | parallel --pipe -j ${t} sed -f sed.script > final_annotated.gtf 257 | rm -f fileA fileB *tmap.1 *tmap.2 258 | # sorting GTF file 259 | rm -r -f gff3sort 260 | git clone https://github.com/cfarkas/gff3sort.git 261 | perl ./gff3sort/gff3sort.pl final_annotated.gtf > final_annotated.sorted.gtf 262 | rm final_annotated.gtf 263 | mv final_annotated.sorted.gtf final_annotated.gtf 264 | printf "${PURPLE}::: Done. Gene_id field was replaced in the StringTie.gtf file and final_annotated.gtf was generated with these changes\n" 265 | echo "" 266 | printf "${PURPLE}::: Moving gffcompare results to gffcompare_outputs folder ...\n" 267 | echo "" 268 | rm -r -f gffcompare_outputs_UCSC 269 | mkdir gffcompare_outputs_UCSC 270 | mv *.loci *.stats *.refmap *.tmap *.tracking ./gffcompare_outputs_UCSC 271 | echo "" 272 | 273 | printf "${YELLOW}:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n" 274 | printf "${YELLOW}::: 4. 
Obtaining Transcripts in FASTA format with gffread :::\n" 275 | printf "${YELLOW}:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::${CYAN}\n" 276 | 277 | echo "" 278 | gffread -w transcripts.fa -g ${g_DIR}/${reference_genome} final_annotated.gtf 279 | echo "" 280 | printf "${PURPLE}::: Done. transcripts.fa are located in current directory\n" 281 | echo "" 282 | 283 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n" 284 | printf "${YELLOW}::: 5. Performing gene annotation by using GAWN pipeline :::\n" 285 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::${CYAN}\n" 286 | 287 | ################################################################ 288 | # Configuring Gawn Inputs, config file and running GAWN pipeline 289 | ################################################################ 290 | 291 | echo "" 292 | printf "${PURPLE}::: Downloading GAWN annotation folder. See https://github.com/enormandeau/gawn.git${CYAN}\n" 293 | echo "" 294 | rm -r -f gawn 295 | git clone https://github.com/cfarkas/gawn.git 296 | cd gawn/02_infos/ 297 | dir2=$(cd -P -- "$(dirname -- "$0")" && pwd -P) 298 | echo "Done" 299 | echo "" 300 | cd ${dir1} 301 | cp ${g_DIR}/${reference_genome} ${dir1}/gawn/03_data/genome.fasta 302 | cp transcripts.fa ${dir1}/gawn/03_data/transcriptome.fasta 303 | rm ${dir2}/gawn_config.sh 304 | cp ${c_DIR}/${gawn_config} ${dir2}/gawn_config.sh 305 | echo "" 306 | printf "${PURPLE}::: Starting GAWN transcript annotation${CYAN}\n" 307 | echo "" 308 | cd ${dir1}/gawn/ 309 | ./gawn 02_infos/gawn_config.sh 310 | echo "" 311 | printf "${PURPLE}::: Done. The novel transcripts were annotated in ./gawn/04_annotation/ :::${CYAN}\n" 312 | echo "" 313 | 314 | ################################# 315 | # Extracting transcriptome hits # 316 | ################################# 317 | 318 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::\n" 319 | printf "${YELLOW}::: 6. Extracting transcriptome hits :::\n" 320 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::${CYAN}\n" 321 | 322 | echo "" 323 | cd ${dir1} 324 | cp ${dir1}/gawn/04_annotation/transcriptome.swissprot ${dir1} 325 | cp ${dir1}/gawn/04_annotation/transcriptome.hits ${dir1} 326 | printf "${PURPLE}::: Done. transcriptome hits were succesfully extracted :::${CYAN}\n" 327 | echo "" 328 | 329 | ############################################ 330 | # FEELnc long noncoding RNA identification # 331 | ############################################ 332 | 333 | cd ${dir1} 334 | 335 | printf "${YELLOW}:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n" 336 | printf "${YELLOW}::: 7. 
Classifying protein-coding and long non-coding transcripts with FEELnc :::\n" 337 | printf "${YELLOW}:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::${CYAN}\n" 338 | 339 | grep "NM_" ${r_DIR}/${reference_gtf} > NM_coding.gtf 340 | echo "" 341 | printf "${PURPLE}::: 1/3) Filtering transcripts :::${CYAN}\n" 342 | # Filter 343 | FEELnc_filter.pl -i final_annotated.gtf -a NM_coding.gtf -b transcript_biotype=protein_coding > candidate_lncRNA.gtf 344 | rm -r -f ${g_DIR}/${reference_genome}.index 345 | printf "${PURPLE}::: 2/3) Evaluating coding potential :::${CYAN}\n" 346 | # Coding_Potential 347 | FEELnc_codpot.pl -i candidate_lncRNA.gtf -a NM_coding.gtf -b transcript_biotype=protein_coding -g ${g_DIR}/${reference_genome} --mode=shuffle 348 | printf "${PURPLE}::: 3/3) Classifiyng lncRNA transcripts :::${CYAN}\n" 349 | # Classifier 350 | FEELnc_classifier.pl -i feelnc_codpot_out/candidate_lncRNA.gtf.lncRNA.gtf -a NM_coding.gtf > candidate_lncRNA_classes.txt 351 | echo "" 352 | printf "${PURPLE}::: FEELnc calculations were done. The output is called candidate_lncRNA_classes.txt :::\n" 353 | echo "" 354 | 355 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::\n" 356 | printf "${YELLOW}::: 8. Parsing GAWN and FEELnc outputs :::\n" 357 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::${CYAN}\n" 358 | 359 | echo "" 360 | cd ${dir1} 361 | awk '{print $3}' candidate_lncRNA_classes.txt > lncRNA_genes 362 | tail -n +2 lncRNA_genes > lncRNA_transcripts 363 | rm lncRNA_genes 364 | grep -w -F -f lncRNA_transcripts final_annotated.gtf > merged.fixed.lncRNAs.gtf 365 | grep --invert-match -F -f lncRNA_transcripts final_annotated.gtf > merged.fixed.coding.gtf 366 | rm final_annotated.gtf 367 | sed -i 's/StringTie/lncRNA/' merged.fixed.lncRNAs.gtf 368 | awk '{print $1"\t"$2}' transcriptome.hits > coding_list 369 | awk -F'\t' '$2!=""' coding_list > coding_transcripts 370 | awk '{print $1}' coding_transcripts > coding_transcripts.tab 371 | rm coding_lis* coding_transcripts lncRNA_transcripts 372 | grep -w -F -f coding_transcripts.tab merged.fixed.coding.gtf > coding-genes.gtf 373 | grep --invert-match -F -f coding_transcripts.tab merged.fixed.coding.gtf > other-genes.gtf 374 | cat coding-genes.gtf merged.fixed.lncRNAs.gtf other-genes.gtf > final_annotated.gtf 375 | rm coding_transcripts.tab 376 | # sorting GTF file 377 | perl ./gff3sort/gff3sort.pl coding-genes.gtf > coding-genes.sorted.gtf 378 | rm coding-genes.gtf 379 | mv coding-genes.sorted.gtf coding-genes.gtf 380 | echo "All done" 381 | echo "" 382 | ########################################## 383 | # Gene Prediction Step with TransDecoder # 384 | ########################################## 385 | cd ${dir1} 386 | echo "" 387 | 388 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n" 389 | printf "${YELLOW}::: 9. 
Predicting coding regions from transcripts with coding potential using TransDecoder :::\n" 390 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::${CYAN}\n" 391 | 392 | echo "" 393 | echo "" 394 | gffread -w coding-transcripts.fa -g ${g_DIR}/${reference_genome} coding-genes.gtf 395 | TransDecoder.LongOrfs -m 60 -t coding-transcripts.fa 396 | TransDecoder.Predict -t coding-transcripts.fa --single_best_only 397 | awk '{print $1}' coding-transcripts.fa.transdecoder.bed > coding.sequences 398 | tail -n +2 coding.sequences > coding.hits && rm coding.sequences 399 | echo "" 400 | printf "${PURPLE}::: Done. coding-transcripts.fa.transdecoder.gff3 file is present in current directory...${CYAN}\n" 401 | echo "" 402 | 403 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n" 404 | printf "${YELLOW}::: 10. Converting gff3 to GTF format and formatting coding sequences and proteins :::\n" 405 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::${CYAN}\n" 406 | 407 | echo "" 408 | sed 's/Name=.*$//' coding-transcripts.fa.transdecoder.gff3 > coding-transcripts.fa.test.gff3 409 | sed -i 's/ID=GENE[.]/ID=/'g coding-transcripts.fa.test.gff3 410 | sed -i 's/Parent=GENE[.]/Parent=/'g coding-transcripts.fa.test.gff3 411 | sed -i 's/~~/;protein_id=/'g coding-transcripts.fa.test.gff3 412 | gffread coding-transcripts.fa.test.gff3 -T -P -g transcripts.fa -o coding_transcripts.gtf 413 | rm coding-transcripts.fa.test.gff3 414 | # removing protein id by expansion 415 | sed -i 's/[.]p[0-9]//'g coding_transcripts.gtf 416 | sed -i 's/[.]p[0-9][0-9]//'g coding_transcripts.gtf 417 | sed -i 's/[.]p[0-9][0-9][0-9]//'g coding_transcripts.gtf 418 | sed -i 's/[.]p[0-9][0-9][0-9][0-9]//'g coding_transcripts.gtf 419 | sed -i 's/[.]p[0-9][0-9][0-9][0-9][0-9]//'g coding_transcripts.gtf 420 | # 421 | # obtaining cds.fa and prot.fa from coding_transcripts.gtf 422 | echo "" 423 | echo "::: Obtaining cds.fa and prot.fa from coding_transcripts.gtf" 424 | echo "" 425 | gffread -x cds.fa -g transcripts.fa coding_transcripts.gtf 426 | gffread -y prot.fa -g transcripts.fa coding_transcripts.gtf 427 | echo "done" 428 | rm coding-transcripts.fa coding-genes.gtf merged.fixed.lncRNAs.gtf other-genes.gtf 429 | grep "StringTie" final_annotated.gtf > genes.gtf 430 | grep "lncRNA" final_annotated.gtf > lncRNAs.gtf 431 | grep -w -F -f coding.hits genes.gtf > coding-genes.gtf 432 | grep --invert-match -F -f coding.hits genes.gtf > other-genes.gtf 433 | sed -i 's/StringTie/coding/' coding-genes.gtf 434 | cat coding-genes.gtf lncRNAs.gtf other-genes.gtf > final_annotated.gtf 435 | echo "" 436 | echo "::: Parsing transcriptome hits" 437 | echo "" 438 | grep -w -F -f coding.hits transcriptome.swissprot > coding.annotation 439 | rm transcriptome.swissprot 440 | mv coding.annotation transcriptome.swissprot 441 | echo "done" 442 | # sorting GTF file 443 | echo "" 444 | echo "::: Sorting final_annotated.gtf" 445 | echo "" 446 | perl ./gff3sort/gff3sort.pl final_annotated.gtf > final_annotated.sorted.gtf 447 | echo "done" 448 | rm final_annotated.gtf 449 | mv final_annotated.sorted.gtf final_annotated.gtf 450 | rm coding-genes.gtf lncRNAs.gtf other-genes.gtf transcriptome.hits 451 | ### Novel coding genes and correspondent proteins 452 | echo "" 453 | echo "::: Obtaining novel coding transcripts (cds) and correspondent proteins" 454 | echo "" 455 | # 456 | wget 
https://raw.githubusercontent.com/cfarkas/annotate_my_genomes/master/additional_scripts/transcriptome_metrics.sh 457 | bash transcriptome_metrics.sh -f final_annotated.gtf -g ${g_DIR}/${reference_genome} 458 | cp ./transcriptome_metrics/known-genes-coding.gtf ./ 459 | cp ./transcriptome_metrics/novel-genes-coding.gtf ./ 460 | cp ./transcriptome_metrics/novel-transcripts-lncRNA.fa ./ 461 | cp ./transcriptome_metrics/known-transcripts-lncRNA.fa ./ 462 | # 463 | perl -lne 'print "@m" if @m=(/((?:transcript_id|gene_id)\s+\S+)/g);' novel-genes-coding.gtf > novel_annotated.tab 464 | awk '{print $(NF)}' novel_annotated.tab > novel-coding-transcripts.matches 465 | sed -i 's/;//g' novel-coding-transcripts.matches 466 | sed -i 's/"//g' novel-coding-transcripts.matches 467 | awk '!a[$0]++' novel-coding-transcripts.matches > novel-coding-transcripts.tab && rm novel-coding-transcripts.matches 468 | mv novel-coding-transcripts.tab novel-coding-transcripts.matches 469 | # 470 | seqkit fx2tab cds.fa > cds.tab 471 | seqkit fx2tab prot.fa > prot.tab 472 | grep -w -F -f novel-coding-transcripts.matches cds.tab > novel-coding-cds.tab 473 | grep -w -F -f novel-coding-transcripts.matches prot.tab > novel-coding-prot.tab 474 | seqkit tab2fx novel-coding-cds.tab > novel-cds.fa && seqkit tab2fx novel-coding-prot.tab > novel-prot.fa 475 | rm -r -f novel-coding-cds.tab novel-coding-prot.tab novel-coding-transcripts.matches cds.tab prot.tab 476 | # obtaining final gff file 477 | echo "" 478 | echo "::: Obtaining final gff file" 479 | echo "" 480 | gffread -E -F --merge final_annotated.gtf -o final_annotated.gff 481 | rm -r -f gff3sort 482 | echo "done" 483 | echo "" 484 | rm -r -f merged.fixed.coding.gtf namelist namelist_unique_sorted coding.hits 485 | 486 | ############################### 487 | # Configuring Summary Results # 488 | ############################### 489 | 490 | printf "${YELLOW}:::::::::::::::::::::::::::::::::::::::::::::::::::::\n" 491 | printf "${YELLOW}::: 11. 
Moving results to the specified directory :::\n" 492 | printf "${YELLOW}:::::::::::::::::::::::::::::::::::::::::::::::::::::${CYAN}\n" 493 | 494 | echo "" 495 | printf "${PURPLE}::: Moving results to the specified directory :::${CYAN}\n" 496 | rm -r -f output_files 497 | mkdir output_files 498 | mv candidate_lncRNA_classes.txt final_annotated.gtf final_annotated.gff transcripts.fa cds.fa prot.fa coding_transcripts.gtf Stats.txt transcriptome.swissprot novel-cds.fa novel-prot.fa sed.script novel-transcripts-lncRNA.fa known-transcripts-lncRNA.fa known-genes-coding.gtf novel-genes-coding.gtf ./output_files 499 | rm -r -f *feelncfilter.log genes.gtf pipeliner* NM_coding.gtf candidate_lncRNA.gtf* coding-transcripts.fa.transdecoder_dir.__* transcripts.fa.fai 500 | rm -r -f transdecoder 501 | mkdir transdecoder 502 | mv coding-transcripts.fa.transdecoder.* ./transdecoder 503 | mv UCSC_compare.annotated.gtf ./gffcompare_outputs_UCSC 504 | cp ${dir1}/gffcompare_outputs_UCSC/UCSC_compare.${stringtie_input}.tmap ./ 505 | mv UCSC_compare.${stringtie_input}.tmap gffcompare.tmap 506 | mv gffcompare.tmap ./output_files/ 507 | 508 | # cd ${dir0} 509 | # mv annotate_my_genomes_$sec ${o_DIR}/${output_folder} 510 | 511 | cd ${dir0} 512 | 513 | echo "Done" 514 | echo "" 515 | printf "${YELLOW}::: Done :::\n" 516 | echo "" 517 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n" 518 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n" 519 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::${CYAN}\n" 520 | echo "" 521 | echo "The following files are available in ${dir1}/output_files :" 522 | echo "" 523 | echo "Transcript discoveries are summarized in the Stats.txt file. The GAWN protein annotation is kept in the transcriptome.swissprot file" 524 | echo "" 525 | echo "The gffcompare.tmap file contains the best reference transcript for each assembled transcript" 526 | echo "" 527 | echo "The GTF file named final_annotated.gtf (and the corresponding gff file) contains novel genes and the lncRNA classification (second field in the GTF file)" 528 | echo "" 529 | echo "candidate_lncRNA_classes.txt contains the detailed long non-coding classification of transcripts."
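# Illustrative follow-up (not executed by this script): the second GTF field of
# final_annotated.gtf carries the coding/lncRNA classification mentioned above, so
# either class can be pulled out with awk once the run has finished, e.g.:
#   awk -F'\t' '$2 == "lncRNA"' output_files/final_annotated.gtf > lncRNA_records.gtf
#   awk -F'\t' '$2 == "coding"' output_files/final_annotated.gtf > coding_records.gtf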
530 | echo "" 531 | echo "Associated FASTA file to this GTF correspond to transcripts.fa file" 532 | echo "" 533 | echo "TransDecoder GTF file suitable to parse transcripts.fa (coding_transcripts.gtf), contains all coding transcripts resolved by TransDecoder" 534 | echo "" 535 | echo "Predicted coding sequences and correspondent protein sequences were named cds.fa and prot.fa, respectively" 536 | echo "" 537 | echo "Novel predicted coding sequences and correspondent protein sequences were named novel-cds.fa and novel-prot.fa, respectively" 538 | echo "" 539 | echo "Novel and Known predicted lncRNAs were named novel-transcripts-lncRNA.fa and known-transcripts-lncRNA.fa, respectively" 540 | echo "" 541 | echo "Novel and Known coding genes were named novel-genes-coding.gtf and known-genes-coding.gtf, respectively" 542 | echo "" 543 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n" 544 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n" 545 | printf "${YELLOW}::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::${NC}\n" 546 | end=`date +%s` 547 | elapsed=`expr $end - $begin` 548 | echo Time taken: $elapsed 549 | # 550 | } | tee logfile_annotate_my_genomes_${sec} 551 | # 552 | -------------------------------------------------------------------------------- /bash_scripts/genome_download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | genome=${1} 6 | 7 | if [ "$1" == "-h" ]; then 8 | echo "" 9 | echo "Usage: ./`basename $0` [genome]" 10 | echo "" 11 | echo "This program will download and index the specified genome from the uscs server (goldenpath) including annotation" 12 | echo "" 13 | echo "[genome]: UCSC Prefix of the genome assembly. Check names here: https://genome.ucsc.edu/cgi-bin/hgGateway" 14 | echo "" 15 | exit 0 16 | fi 17 | 18 | if [ "$1" == "-help" ]; then 19 | echo "" 20 | echo "Usage: ./`basename $0` [genome]" 21 | echo "" 22 | echo "This program will download and index the specified genome from the uscs server (goldenpath) including annotation" 23 | echo "" 24 | echo "[genome]: UCSC Prefix of the genome assembly. Check names here: https://genome.ucsc.edu/cgi-bin/hgGateway" 25 | echo "" 26 | exit 0 27 | fi 28 | if [ "$1" == "--h" ]; then 29 | echo "" 30 | echo "Usage: ./`basename $0` [genome]" 31 | echo "" 32 | echo "This program will download and index the specified genome from the uscs server (goldenpath) including annotation" 33 | echo "" 34 | echo "[genome]: UCSC Prefix of the genome assembly. Check names here: https://genome.ucsc.edu/cgi-bin/hgGateway" 35 | echo "" 36 | exit 0 37 | fi 38 | 39 | if [ "$1" == "--help" ]; then 40 | echo "" 41 | echo "Usage: ./`basename $0` [genome]" 42 | echo "" 43 | echo "This program will download and index the specified genome from the uscs server (goldenpath) including annotation" 44 | echo "" 45 | echo "[genome]: UCSC Prefix of the genome assembly. 
Check names here: https://genome.ucsc.edu/cgi-bin/hgGateway" 46 | echo "" 47 | exit 0 48 | fi 49 | 50 | [ $# -eq 0 ] && { echo "Usage: ./`basename $0` [genome]"; exit 1; } 51 | 52 | if [ $# -ne 1 ]; then 53 | echo 1>&2 "Usage: ./`basename $0` [genome]" 54 | exit 3 55 | fi 56 | 57 | # Obtaining {genome}.fa genome, and indexing 58 | wget http://hgdownload.cse.ucsc.edu/goldenpath/${genome}/bigZips/${genome}.2bit 59 | wget http://hgdownload.soe.ucsc.edu/goldenPath/${genome}/database/refGene.txt.gz 60 | wget http://hgdownload.soe.ucsc.edu/goldenPath/${genome}/database/ncbiRefSeq.txt.gz 61 | if [ -f twoBitToFa ]; then 62 | echo "twoBitToFa script found. Continue:" 63 | echo "" 64 | : 65 | else 66 | echo "Downloading twoBitToFa script" 67 | wget http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/twoBitToFa 68 | fi 69 | chmod 755 twoBitToFa 70 | ./twoBitToFa ${genome}.2bit ${genome}.fa 71 | samtools faidx ${genome}.fa 72 | 73 | if [ -f genePredToGtf ]; then 74 | echo "genePredToGtf script found. Continue:" 75 | echo "" 76 | : 77 | else 78 | echo "Downloading genePredToGtf script" 79 | wget http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/genePredToGtf 80 | fi 81 | chmod 755 genePredToGtf 82 | gunzip refGene.txt.gz 83 | gunzip ncbiRefSeq.txt.gz 84 | cut -f 2- refGene.txt | ./genePredToGtf file stdin -source=${genome}_Ref ${genome}.gtf 85 | cut -f 2- ncbiRefSeq.txt | ./genePredToGtf file stdin -source=${genome}_Ref ${genome}_ncbiRefSeq.gtf 86 | echo "" 87 | echo "All done. ${genome} FASTA and GTF files are located in the current directory" 88 | -------------------------------------------------------------------------------- /bash_scripts/genome_download_macOSX.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | genome=${1} 6 | 7 | if [ "$1" == "-h" ]; then 8 | echo "" 9 | echo "Usage: ./`basename $0` [genome]" 10 | echo "" 11 | echo "This program will download and index the specified genome from the uscs server (goldenpath) including annotation" 12 | echo "" 13 | echo "[genome]: UCSC Prefix of the genome assembly. Check names here: https://genome.ucsc.edu/cgi-bin/hgGateway" 14 | echo "" 15 | exit 0 16 | fi 17 | 18 | if [ "$1" == "-help" ]; then 19 | echo "" 20 | echo "Usage: ./`basename $0` [genome]" 21 | echo "" 22 | echo "This program will download and index the specified genome from the uscs server (goldenpath) including annotation" 23 | echo "" 24 | echo "[genome]: UCSC Prefix of the genome assembly. Check names here: https://genome.ucsc.edu/cgi-bin/hgGateway" 25 | echo "" 26 | exit 0 27 | fi 28 | if [ "$1" == "--h" ]; then 29 | echo "" 30 | echo "Usage: ./`basename $0` [genome]" 31 | echo "" 32 | echo "This program will download and index the specified genome from the uscs server (goldenpath) including annotation" 33 | echo "" 34 | echo "[genome]: UCSC Prefix of the genome assembly. Check names here: https://genome.ucsc.edu/cgi-bin/hgGateway" 35 | echo "" 36 | exit 0 37 | fi 38 | 39 | if [ "$1" == "--help" ]; then 40 | echo "" 41 | echo "Usage: ./`basename $0` [genome]" 42 | echo "" 43 | echo "This program will download and index the specified genome from the uscs server (goldenpath) including annotation" 44 | echo "" 45 | echo "[genome]: UCSC Prefix of the genome assembly. 
Check names here: https://genome.ucsc.edu/cgi-bin/hgGateway" 46 | echo "" 47 | exit 0 48 | fi 49 | 50 | [ $# -eq 0 ] && { echo "Usage: ./`basename $0` [genome]"; exit 1; } 51 | 52 | if [ $# -ne 1 ]; then 53 | echo 1>&2 "Usage: ./`basename $0` [genome]" 54 | exit 3 55 | fi 56 | 57 | # Obtaining {genome}.fa genome, and indexing 58 | wget http://hgdownload.cse.ucsc.edu/goldenpath/${genome}/bigZips/${genome}.2bit 59 | wget http://hgdownload.soe.ucsc.edu/goldenPath/${genome}/database/refGene.txt.gz 60 | wget http://hgdownload.soe.ucsc.edu/goldenPath/${genome}/database/ncbiRefSeq.txt.gz 61 | if [ -f twoBitToFa ]; then 62 | echo "twoBitToFa script found. Continue:" 63 | echo "" 64 | : 65 | else 66 | echo "Downloading twoBitToFa script" 67 | wget http://hgdownload.cse.ucsc.edu/admin/exe/macOSX.x86_64/twoBitToFa 68 | fi 69 | chmod 755 twoBitToFa 70 | ./twoBitToFa ${genome}.2bit ${genome}.fa 71 | samtools faidx ${genome}.fa 72 | 73 | if [ -f genePredToGtf ]; then 74 | echo "genePredToGtf script found. Continue:" 75 | echo "" 76 | : 77 | else 78 | echo "Downloading genePredToGtf script" 79 | wget http://hgdownload.cse.ucsc.edu/admin/exe/macOSX.x86_64/genePredToGtf 80 | fi 81 | chmod 755 genePredToGtf 82 | gunzip refGene.txt.gz 83 | gunzip ncbiRefSeq.txt.gz 84 | cut -f 2- refGene.txt | ./genePredToGtf file stdin -source=${genome}_Ref ${genome}.gtf 85 | cut -f 2- ncbiRefSeq.txt | ./genePredToGtf file stdin -source=${genome}_Ref ${genome}_ncbiRefSeq.gtf 86 | echo "" 87 | echo "All done. ${genome} FASTA and GTF files are located in the current directory" 88 | -------------------------------------------------------------------------------- /bash_scripts/get_transcripts.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | dir=$(cd -P -- "$(dirname -- "$0")" && pwd -P) 6 | usage="$(basename "$0") [-h] [-f ] [-g ] [-i ] 7 | This program will obtain and align all transcripts coming from a given gene, in order to obtain a consensus. 8 | Arguments: 9 | -h show this help text 10 | -f Name of the StringTie annotated GTF from the pipeline 11 | -g Reference genome (in fasta format) 12 | -i Gene Symbol" 13 | options=':hf:g:i:' 14 | while getopts $options option; do 15 | case "$option" in 16 | h) echo "$usage"; exit;; 17 | f) f=$OPTARG;; 18 | g) g=$OPTARG;; 19 | i) i=$OPTARG;; 20 | :) printf "missing argument for -%s\n" "$OPTARG" >&2; echo "$usage" >&2; exit 1;; 21 | \?) printf "illegal option: -%s\n" "$OPTARG" >&2; echo "$usage" >&2; exit 1;; 22 | esac 23 | done 24 | 25 | # mandatory arguments 26 | if [ ! "$f" ] || [ ! "$g" ] || [ ! "$i" ]; then 27 | echo "arguments -f, -g and -i must be provided" 28 | echo "$usage" >&2; exit 1 29 | fi 30 | 31 | echo "Working in $dir" 32 | echo "" 33 | echo "Obtaining GTF for the given gene_name using final_annotated.gtf file" 34 | grep "\<${i}\>" ${f} > ${i}.gtf 35 | echo "Done." 36 | echo "" 37 | echo "Obtaining gene-associated transcripts in fasta format" 38 | gffread -w ${i}.fa -g ${g} ${i}.gtf 39 | echo "Done." 40 | echo "" 41 | echo "Aligning transcript sequences with Clustal Omega" 42 | clustalo -i ${i}.fa -o ${i}.aln 43 | echo "Done" 44 | echo "" 45 | echo "Obtaining consensus sequence from alignment with EMBOSS consensus" 46 | em_cons -sequence ${i}.aln -outseq ${i}.cons 47 | echo "" 48 | echo "All done. ${i}.cons file contain suitable sequence to be validated for qPCR." 
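# Illustrative invocation (file names and the gene symbol are placeholders only):
#   ./get_transcripts.sh -f final_annotated.gtf -g galGal6.fa -i GAPDH
# This would collect all GAPDH-associated transcripts from final_annotated.gtf,
# align them with Clustal Omega and write the consensus sequence to GAPDH.cons.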
49 | echo "" 50 | echo "${i}.fa contain transcript sequences in fasta format associated with ${i} gene" 51 | -------------------------------------------------------------------------------- /bash_scripts/isoform_identification.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | { 6 | 7 | usage="$(basename "$0") [-h] [-m ] [-t ] [-g ] 8 | This script will produce an annotated csv table of transcripts, by using the tmap output file from add-ncbi-annotation pipeline. 9 | Arguments: 10 | -h show this help text 11 | -m NCBI gffcompare tmap output file. As example: gffcompare.tmap 12 | -t transcripts file, output of add-ncbi-annotation program. As example: NCBI_transcripts.fa 13 | -g UCSC genome name. In example: mm10, galGal6, hg38, rn6." 14 | options=':hm:t:g:' 15 | while getopts $options option; do 16 | case "$option" in 17 | h) echo "$usage"; exit;; 18 | m) m=$OPTARG;; 19 | t) t=$OPTARG;; 20 | g) g=$OPTARG;; 21 | :) printf "missing argument for -%s\n" "$OPTARG" >&2; echo "$usage" >&2; exit 1;; 22 | \?) printf "illegal option: -%s\n" "$OPTARG" >&2; echo "$usage" >&2; exit 1;; 23 | esac 24 | done 25 | 26 | # mandatory arguments 27 | if [ ! "$m" ] || [ ! "$t" ] || [ ! "$g" ]; then 28 | echo "" 29 | echo "arguments -m, -t and -g must be provided" 30 | echo "" 31 | echo "$usage" >&2; exit 1 32 | fi 33 | 34 | # Conditions : Input existance 35 | 36 | if [ ! -e "$m" ]; then 37 | echo "" 38 | echo "$m does not exist. Check your -m input" 39 | echo "" 40 | exit 9999 # die with error code 9999 41 | fi 42 | 43 | if [ ! -e "$t" ]; then 44 | echo "" 45 | echo "$t does not exist. Check your -t input" 46 | echo "" 47 | exit 9999 # die with error code 9999 48 | fi 49 | 50 | # Conditions : Getting absolute path of inputs 51 | echo "" 52 | m_DIR="$( cd "$( dirname "$m" )" && pwd )" 53 | echo "" 54 | echo "::: The absolute path of -m is $m_DIR" 55 | echo "" 56 | t_DIR="$( cd "$( dirname "$t" )" && pwd )" 57 | echo "" 58 | echo "::: The absolute path of -t is $t_DIR" 59 | echo "" 60 | echo "" 61 | printf "${YELLOW}::: Defining Inputs :::\n" 62 | echo "" 63 | FILE1="$m" 64 | basename "$FILE1" 65 | tmap_input="$(basename -- $FILE1)" 66 | echo "The tmap file used as input is the following: $tmap_input" 67 | echo"" 68 | FILE2="$t" 69 | basename "$FILE2" 70 | transcripts_input="$(basename -- $FILE2)" 71 | echo "The transcript file used as input is the following: $transcripts_input" 72 | echo "" 73 | if [ -f ncbiRefSeqLink.txt ]; then 74 | echo "::: ncbiRefSeqLink.txt file found. 
Continue:" 75 | echo "" 76 | : 77 | else 78 | echo "::: Downloading ncbiRefSeqLink.txt file" 79 | wget http://hgdownload.cse.ucsc.edu/goldenpath/${g}/database/ncbiRefSeqLink.txt.gz 80 | gunzip ncbiRefSeqLink.txt.gz 81 | fi 82 | 83 | # Inputs for python 84 | cp ${m_DIR}/${tmap_input} ./stringtie_for_script.tmap 85 | seqkit fx2tab ${t_DIR}/${transcripts_input} > transcripts_Isoform.tab 86 | # Formatting transcripts_Isoform.tab if gene= is present in file 87 | sed -i 's/gene=/\t/'g transcripts_Isoform.tab 88 | awk '{print $1"\t"$NF}' transcripts_Isoform.tab > transcripts_Isoform2.tab 89 | 90 | # Execute gffcompare_parser.py 91 | python << END 92 | 93 | import sys 94 | import pandas as pd 95 | 96 | class bcolors: 97 | HEADER = '\033[95m' 98 | OKBLUE = '\033[94m' 99 | OKCYAN = '\033[96m' 100 | OKGREEN = '\033[92m' 101 | OKRED = '\033[91m' 102 | FAIL = '\033[91m' 103 | ENDC = '\033[0m' 104 | BOLD = '\033[1m' 105 | UNDERLINE = '\033[4m' 106 | 107 | df = pd.read_csv('stringtie_for_script.tmap', sep = '\t') 108 | print(df.sample(10)) 109 | print("Total number of transcripts:", df.shape[0]) 110 | print("") 111 | 112 | df2 = df[~df.ref_id.astype(str).str.contains('-')] 113 | novel_transcripts = df[df.ref_id.astype(str).str.contains('-')] 114 | 115 | df3 = df2[["ref_gene_id", "ref_id", "class_code", "qry_gene_id", "qry_id", "num_exons", "FPKM", "TPM"]] 116 | df_novel_transcripts = novel_transcripts[["ref_gene_id", "ref_id", "class_code", "qry_gene_id", "qry_id", "num_exons", "FPKM", "TPM"]] 117 | 118 | print("Reference transcripts:") 119 | print(df3.sample(10)) 120 | print("") 121 | 122 | print("Novel transcripts:") 123 | print(df_novel_transcripts.sample(10)) 124 | print("") 125 | 126 | colnames=['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18'] 127 | dfA1 = pd.read_csv('ncbiRefSeqLink.txt', sep = '\t', low_memory=False, names=colnames, header=None) 128 | print(dfA1.head(10)) 129 | 130 | dfA2 = dfA1[['0', '1', '2', '3', '5', '14', '16']] 131 | 132 | dfA2 = dfA2.rename(columns={'0': 'ref_id', '1': 'Annotation Status', '2' : 'NCBI RefSeq Gene ID', '3' : 'Transcript Description', '5' : 'NCBI RefSeq Protein ID', '14' : 'Alternative Gene Name', '16' : 'RefSeq Transcript Info'}) 133 | print("ncbiRefSeqLink annotation:") 134 | print(dfA2.sample(10)) 135 | print("") 136 | 137 | colnames = ['qry_id', 'cds_seq', 'none'] 138 | cds = pd.read_csv('transcripts_Isoform2.tab', sep = '\t', names=colnames) 139 | cds2 = cds[["qry_id", "cds_seq"]] 140 | print("transcripts file:") 141 | print(cds2.sample(10)) 142 | print("") 143 | 144 | result1 = pd.merge(df3, dfA2, on='ref_id', how='inner') 145 | result1.sample(10) 146 | result2 = pd.merge(result1, cds2, on='qry_id', how='inner') 147 | result2.sample(10) 148 | result3 = pd.merge(df_novel_transcripts, cds2, on='qry_id', how='inner') 149 | result3.sample(10) 150 | print("Number of Joined Transcripts (reference):", result2.shape[0]) 151 | print("") 152 | print("Number of Joined Transcripts (novel):", result3.shape[0]) 153 | print("") 154 | result2.to_csv('Ref_Transcript_Annotation.csv', index=False) 155 | result3.to_csv('Novel_Transcript_Annotation.csv', index=False) 156 | print(bcolors.OKGREEN + "::: Done. Ref_Transcript_Annotation.csv and Novel_Transcript_Annotation.csv were succesfully produced" + bcolors.ENDC) 157 | print("") 158 | END 159 | 160 | rm -r -f transcripts_Isoform.tab transcripts_Isoform2.tab stringtie_for_script.tmap ncbiRefSeqLink.txt 161 | echo "::: All done. 
:::" 162 | } 163 | -------------------------------------------------------------------------------- /data_examples/transcripts.gtf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cfarkas/annotate_my_genomes/401f5ebae995aed4f07184601edc0c8a368221be/data_examples/transcripts.gtf.gz -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: annotate_my_genomes 2 | channels: 3 | - anaconda 4 | - conda-forge 5 | - bioconda 6 | - defaults 7 | dependencies: 8 | - _libgcc_mutex=0.1=conda_forge 9 | - _openmp_mutex=4.5=1_gnu 10 | - _r-mutex=1.0.1=anacondar_1 11 | - argtable2=2.13=h14c3975_1001 12 | - atk-1.0=2.36.0=h3371d22_4 13 | - bedtools=2.30.0=h468198e_3 14 | - binutils_impl_linux-64=2.36.1=h193b22a_2 15 | - binutils_linux-64=2.36=hf3e587d_7 16 | - bioconductor-seqlogo=1.60.0=r41hdfd78af_0 17 | - bwidget=1.9.14=ha770c72_1 18 | - bzip2=1.0.8=h7f98852_4 19 | - c-ares=1.18.1=h7f98852_0 20 | - ca-certificates=2020.10.14=0 21 | - cairo=1.16.0=ha12eb4b_1010 22 | - certifi=2020.6.20=py36_0 23 | - clustalo=1.2.4=h87f3376_5 24 | - coreutils=9.0=h7f98852_0 25 | - curl=7.82.0=h7bff187_0 26 | - emboss=6.6.0=h5a44aac_5 27 | - expat=2.4.7=h27087fc_0 28 | - fasta_ushuffle=0.2=hec16e2b_4 29 | - feelnc=0.2=pl526_0 30 | - font-ttf-dejavu-sans-mono=2.37=hab24e00_0 31 | - font-ttf-inconsolata=3.000=h77eed37_0 32 | - font-ttf-source-code-pro=2.038=h77eed37_0 33 | - font-ttf-ubuntu=0.83=hab24e00_0 34 | - fontconfig=2.13.96=h8e229c2_2 35 | - fonts-conda-ecosystem=1=0 36 | - fonts-conda-forge=1=0 37 | - freetype=2.10.4=h0708190_1 38 | - fribidi=1.0.10=h36c2ea0_0 39 | - gawk=5.1.0=h7b6447c_0 40 | - gcc_impl_linux-64=9.4.0=h03d3576_13 41 | - gcc_linux-64=9.4.0=h391b98a_7 42 | - gdk-pixbuf=2.42.6=h04a7f16_0 43 | - gettext=0.19.8.1=h73d1719_1008 44 | - gffcompare=0.11.2=h9f5acd7_3 45 | - gffread=0.12.7=hd03093a_1 46 | - gfortran_impl_linux-64=9.4.0=h0003116_13 47 | - gfortran_linux-64=9.4.0=hf0ab688_7 48 | - giflib=5.2.1=h36c2ea0_2 49 | - gmap=2021.08.25=pl5262h36cd882_0 50 | - graphite2=1.3.13=h58526e2_1001 51 | - graphviz=3.0.0=h5abf519_0 52 | - gsl=2.7=he838d99_0 53 | - gtk2=2.24.33=h90689f9_2 54 | - gts=0.7.6=h64030ff_2 55 | - gxx_impl_linux-64=9.4.0=h03d3576_13 56 | - gxx_linux-64=9.4.0=h0316aca_7 57 | - harfbuzz=3.4.0=hb4a5f5f_0 58 | - htslib=1.14=h9753748_2 59 | - icu=69.1=h9c3ff4c_0 60 | - jbig=2.1=h7f98852_2003 61 | - jpeg=9e=h7f98852_0 62 | - k8=0.2.5=hd03093a_2 63 | - kernel-headers_linux-64=2.6.32=he073ed8_15 64 | - keyutils=1.6.1=h166bdaf_0 65 | - kmerinshort=1.0.1=0 66 | - krb5=1.19.2=h3790be6_4 67 | - ld_impl_linux-64=2.36.1=hea4e1c9_2 68 | - lerc=3.0=h9c3ff4c_0 69 | - libblas=3.9.0=13_linux64_openblas 70 | - libcblas=3.9.0=13_linux64_openblas 71 | - libcurl=7.82.0=h7bff187_0 72 | - libdb=6.2.32=h9c3ff4c_0 73 | - libdeflate=1.10=h7f98852_0 74 | - libedit=3.1.20191231=he28a2e2_2 75 | - libev=4.33=h516909a_1 76 | - libffi=3.4.2=h7f98852_5 77 | - libgcc=7.2.0=h69d50b8_2 78 | - libgcc-devel_linux-64=9.4.0=hd854feb_13 79 | - libgcc-ng=11.2.0=h1d223b6_13 80 | - libgd=2.3.3=h283352f_2 81 | - libgfortran-ng=11.2.0=h69a702a_13 82 | - libgfortran5=11.2.0=h5c6108e_13 83 | - libglib=2.70.2=h174f98d_4 84 | - libgomp=11.2.0=h1d223b6_13 85 | - libiconv=1.16=h516909a_0 86 | - liblapack=3.9.0=13_linux64_openblas 87 | - libnghttp2=1.47.0=h727a467_0 88 | - libnsl=2.0.0=h7f98852_0 89 | - libopenblas=0.3.18=pthreads_h8fe5266_0 90 
| - libpng=1.6.37=h21135ba_2 91 | - librsvg=2.52.5=h0a9e6e8_2 92 | - libsanitizer=9.4.0=h79bfe98_13 93 | - libssh2=1.10.0=ha56f1ee_2 94 | - libstdcxx-devel_linux-64=9.4.0=hd854feb_13 95 | - libstdcxx-ng=11.2.0=he4da1e4_13 96 | - libtiff=4.3.0=h542a066_3 97 | - libtool=2.4.6=h9c3ff4c_1008 98 | - libuuid=2.32.1=h7f98852_1000 99 | - libwebp=1.2.2=h3452ae3_0 100 | - libwebp-base=1.2.2=h7f98852_1 101 | - libxcb=1.13=h7f98852_1004 102 | - libxml2=2.9.12=h885dcf4_1 103 | - libxslt=1.1.33=h0ef7038_3 104 | - libzlib=1.2.11=h36c2ea0_1013 105 | - lz4-c=1.9.3=h9c3ff4c_1 106 | - make=4.3=hd18ef5c_1 107 | - minimap2=2.24=h7132678_1 108 | - ncurses=6.2=h58526e2_4 109 | - numpy=1.19.5=py36hfc0c790_2 110 | - openssl=1.1.1l=h7f98852_0 111 | - pandas=1.1.3=py36he6710b0_0 112 | - pango=1.50.5=h4dcc4a0_0 113 | - parallel=20220222=ha770c72_0 114 | - pcre=8.45=h9c3ff4c_0 115 | - pcre2=10.37=h032f7d1_0 116 | - perl=5.26.2=h36c2ea0_1008 117 | - perl-aceperl=1.92=pl526_2 118 | - perl-algorithm-diff=1.1903=pl526_2 119 | - perl-algorithm-munkres=0.08=pl526_1 120 | - perl-apache-test=1.40=pl526_1 121 | - perl-app-cpanminus=1.7044=pl526_1 122 | - perl-appconfig=1.71=pl526_1 123 | - perl-array-compare=3.0.1=pl526_1 124 | - perl-autoloader=5.74=pl526_2 125 | - perl-base=2.23=pl526_1 126 | - perl-bio-asn1-entrezgene=1.73=pl5262hdfd78af_2 127 | - perl-bio-featureio=1.6.905=pl5262hdfd78af_3 128 | - perl-bio-phylo=0.58=pl5262hdfd78af_3 129 | - perl-bio-samtools=1.43=pl526h1341992_1 130 | - perl-bioperl=1.6.924=6 131 | - perl-bioperl-core=1.007002=pl5262hdfd78af_3 132 | - perl-bioperl-run=1.007002=pl5262hdfd78af_5 133 | - perl-business-isbn=3.004=pl526_0 134 | - perl-business-isbn-data=20140910.003=pl526_0 135 | - perl-cache-cache=1.08=pl526_0 136 | - perl-capture-tiny=0.48=pl526_0 137 | - perl-carp=1.38=pl526_3 138 | - perl-cgi=4.44=pl526h14c3975_1 139 | - perl-class-data-inheritable=0.08=pl526_1 140 | - perl-class-inspector=1.34=pl526_0 141 | - perl-class-load=0.25=pl526_0 142 | - perl-class-load-xs=0.10=pl526h6bb024c_2 143 | - perl-class-method-modifiers=2.12=pl526_0 144 | - perl-clone=0.42=pl526h516909a_0 145 | - perl-common-sense=3.74=pl526_2 146 | - perl-compress-raw-zlib=2.087=pl526hc9558a2_0 147 | - perl-constant=1.33=pl526_1 148 | - perl-convert-binary-c=0.78=pl526h6bb024c_3 149 | - perl-convert-binhex=1.125=pl526_1 150 | - perl-crypt-rc4=2.02=pl526_1 151 | - perl-data-dumper=2.173=pl526_0 152 | - perl-data-optlist=0.110=pl526_2 153 | - perl-data-stag=0.14=pl526_1 154 | - perl-date-format=2.30=pl526_2 155 | - perl-db-file=1.855=pl526h516909a_0 156 | - perl-dbd-sqlite=1.64=pl526h516909a_0 157 | - perl-dbi=1.642=pl526_0 158 | - perl-devel-globaldestruction=0.14=pl526_0 159 | - perl-devel-overloadinfo=0.005=pl526_0 160 | - perl-devel-stacktrace=2.04=pl526_0 161 | - perl-digest-hmac=1.03=pl526_3 162 | - perl-digest-md5=2.55=pl526_0 163 | - perl-digest-perl-md5=1.9=pl526_1 164 | - perl-digest-sha1=2.13=pl526h6bb024c_1 165 | - perl-dist-checkconflicts=0.11=pl526_2 166 | - perl-dynaloader=1.25=pl526_1 167 | - perl-email-date-format=1.005=pl526_2 168 | - perl-encode=2.88=pl526_1 169 | - perl-encode-locale=1.05=pl526_6 170 | - perl-error=0.17027=pl526_1 171 | - perl-eval-closure=0.14=pl526h6bb024c_4 172 | - perl-exception-class=1.44=pl526_0 173 | - perl-exporter=5.72=pl526_1 174 | - perl-exporter-tiny=1.002001=pl526_0 175 | - perl-extutils-makemaker=7.36=pl526_1 176 | - perl-file-listing=6.04=pl526_1 177 | - perl-file-path=2.16=pl526_0 178 | - perl-file-slurp-tiny=0.004=pl526_1 179 | - perl-file-sort=1.01=pl526_2 180 | - 
perl-file-temp=0.2304=pl526_2 181 | - perl-file-which=1.23=pl526_0 182 | - perl-font-afm=1.20=pl526_2 183 | - perl-font-ttf=1.06=pl526_0 184 | - perl-gd=2.68=pl526he941832_0 185 | - perl-getopt-long=2.50=pl526_1 186 | - perl-graph=0.9704=pl526_1 187 | - perl-graphviz=2.20=1 188 | - perl-html-element-extended=1.18=pl526_1 189 | - perl-html-entities-numbered=0.04=pl526_1 190 | - perl-html-formatter=2.16=pl526_0 191 | - perl-html-parser=3.72=pl526h6bb024c_5 192 | - perl-html-tableextract=2.13=pl526_2 193 | - perl-html-tagset=3.20=pl526_3 194 | - perl-html-tidy=1.60=pl526_0 195 | - perl-html-tree=5.07=pl526_1 196 | - perl-html-treebuilder-xpath=0.14=pl526_1 197 | - perl-http-cookies=6.04=pl526_0 198 | - perl-http-daemon=6.01=pl526_1 199 | - perl-http-date=6.02=pl526_3 200 | - perl-http-message=6.18=pl526_0 201 | - perl-http-negotiate=6.01=pl526_3 202 | - perl-image-info=1.38=pl526_1 203 | - perl-image-size=3.300=pl526_2 204 | - perl-io-html=1.001=pl526_2 205 | - perl-io-sessiondata=1.03=pl526_1 206 | - perl-io-socket-ssl=2.066=pl526_0 207 | - perl-io-string=1.08=pl526_3 208 | - perl-io-stringy=2.111=pl526_1 209 | - perl-io-tty=1.12=pl526_1 210 | - perl-ipc-run=20180523.0=pl526_0 211 | - perl-ipc-sharelite=0.17=pl526h6bb024c_1 212 | - perl-jcode=2.07=pl526_2 213 | - perl-json=4.02=pl526_0 214 | - perl-json-xs=2.34=pl526h6bb024c_3 215 | - perl-libwww-perl=6.39=pl526_0 216 | - perl-libxml-perl=0.08=pl526_2 217 | - perl-list-moreutils=0.428=pl526_1 218 | - perl-list-moreutils-xs=0.428=pl526_0 219 | - perl-local-lib=2.000024=pl526_0 220 | - perl-lwp-mediatypes=6.04=pl526_0 221 | - perl-lwp-protocol-https=6.07=pl526_4 222 | - perl-lwp-simple=6.15=pl526h470a237_4 223 | - perl-mailtools=2.21=pl526_0 224 | - perl-math-cdf=0.1=pl526h14c3975_5 225 | - perl-math-derivative=1.01=pl526_0 226 | - perl-math-random=0.72=pl526h14c3975_2 227 | - perl-math-spline=0.02=pl526_2 228 | - perl-mime-base64=3.15=pl526_1 229 | - perl-mime-lite=3.030=pl526_1 230 | - perl-mime-tools=5.508=pl526_1 231 | - perl-mime-types=2.17=pl526_0 232 | - perl-mldbm=2.05=pl526_1 233 | - perl-module-build=0.4224=pl526h470a237_1 234 | - perl-module-implementation=0.09=pl526_2 235 | - perl-module-runtime=0.016=pl526_1 236 | - perl-module-runtime-conflicts=0.003=pl526_0 237 | - perl-moo=2.003004=pl526_0 238 | - perl-moose=2.2011=pl526hf484d3e_1 239 | - perl-mozilla-ca=20180117=pl526_1 240 | - perl-mro-compat=0.13=pl526_0 241 | - perl-net-http=6.19=pl526_0 242 | - perl-net-ssleay=1.88=pl526h90d6eec_0 243 | - perl-ntlm=1.09=pl526_4 244 | - perl-ole-storage_lite=0.19=pl526_3 245 | - perl-package-deprecationmanager=0.17=pl526_0 246 | - perl-package-stash=0.38=pl526hf484d3e_1 247 | - perl-package-stash-xs=0.28=pl526hf484d3e_1 248 | - perl-parallel-forkmanager=2.02=pl526_0 249 | - perl-params-util=1.07=pl526h6bb024c_4 250 | - perl-parent=0.236=pl526_1 251 | - perl-parse-recdescent=1.967015=pl526_0 252 | - perl-pathtools=3.75=pl526h14c3975_1 253 | - perl-pdf-api2=2.035=pl526_0 254 | - perl-postscript=0.06=pl526_2 255 | - perl-role-tiny=2.000008=pl526_0 256 | - perl-scalar-list-utils=1.52=pl526h516909a_0 257 | - perl-set-scalar=1.29=pl526_2 258 | - perl-soap-lite=1.19=pl526_1 259 | - perl-socket=2.027=pl526_1 260 | - perl-sort-naturally=1.03=pl526_2 261 | - perl-spreadsheet-parseexcel=0.65=pl526_2 262 | - perl-spreadsheet-writeexcel=2.40=pl526_2 263 | - perl-statistics-descriptive=3.0702=pl526_0 264 | - perl-storable=3.15=pl526h14c3975_0 265 | - perl-sub-exporter=0.987=pl526_2 266 | - perl-sub-exporter-progressive=0.001013=pl526_0 267 | - 
perl-sub-identify=0.14=pl526h14c3975_0 268 | - perl-sub-install=0.928=pl526_2 269 | - perl-sub-name=0.21=pl526_1 270 | - perl-sub-quote=2.006003=pl526_1 271 | - perl-sub-uplevel=0.2800=pl526h14c3975_2 272 | - perl-svg=2.84=pl526_0 273 | - perl-svg-graph=0.02=pl526_3 274 | - perl-task-weaken=1.06=pl526_0 275 | - perl-template-toolkit=2.26=pl526_1 276 | - perl-test-deep=1.128=pl526_1 277 | - perl-test-differences=0.67=pl526_0 278 | - perl-test-exception=0.43=pl526_2 279 | - perl-test-leaktrace=0.16=pl526h14c3975_2 280 | - perl-test-most=0.35=pl526_0 281 | - perl-test-pod=1.52=pl526_0 282 | - perl-test-requiresinternet=0.05=pl526_0 283 | - perl-test-warn=0.36=pl526_1 284 | - perl-text-diff=1.45=pl526_0 285 | - perl-threaded=5.32.1=hdfd78af_1 286 | - perl-tie-ixhash=1.23=pl526_2 287 | - perl-time-local=1.28=pl526_1 288 | - perl-timedate=2.30=pl526_1 289 | - perl-tree-dag_node=1.31=pl526_0 290 | - perl-try-tiny=0.30=pl526_1 291 | - perl-type-tiny=1.004004=pl526_0 292 | - perl-types-serialiser=1.0=pl526_2 293 | - perl-unicode-map=0.112=pl526h6bb024c_3 294 | - perl-uri=1.76=pl526_0 295 | - perl-www-robotrules=6.02=pl526_3 296 | - perl-xml-dom=1.46=pl526_0 297 | - perl-xml-dom-xpath=0.14=pl526_1 298 | - perl-xml-filter-buffertext=1.01=pl526_2 299 | - perl-xml-libxml=2.0132=pl526h7ec2d77_1 300 | - perl-xml-libxslt=1.94=pl526_1 301 | - perl-xml-namespacesupport=1.12=pl526_0 302 | - perl-xml-parser=2.44_01=pl5262hc3e0081_1002 303 | - perl-xml-regexp=0.04=pl526_2 304 | - perl-xml-sax=1.02=pl526_0 305 | - perl-xml-sax-base=1.09=pl526_0 306 | - perl-xml-sax-expat=0.51=pl526_3 307 | - perl-xml-sax-writer=0.57=pl526_0 308 | - perl-xml-simple=2.25=pl526_1 309 | - perl-xml-twig=3.52=pl526_2 310 | - perl-xml-writer=0.625=pl526_2 311 | - perl-xml-xpath=1.44=pl526_0 312 | - perl-xml-xpathengine=0.14=pl526_2 313 | - perl-xsloader=0.24=pl526_0 314 | - perl-yaml=1.29=pl526_0 315 | - pip=20.0.2=py36_1 316 | - pixman=0.40.0=h36c2ea0_0 317 | - pthread-stubs=0.4=h36c2ea0_1001 318 | - python=3.6.15=hb7a2778_0_cpython 319 | - python-dateutil=2.8.1=py_0 320 | - python_abi=3.6=2_cp36m 321 | - pytz=2020.1=py_0 322 | - r-assertthat=0.2.1=r41hc72bb7e_2 323 | - r-backports=1.4.1=r41hcfec24a_0 324 | - r-base=4.1.2=h2553ce4_1 325 | - r-bitops=1.0_7=r41hcfec24a_0 326 | - r-brio=1.1.3=r41hcfec24a_0 327 | - r-callr=3.7.0=r41hc72bb7e_0 328 | - r-catools=1.18.2=r41h03ef668_0 329 | - r-cli=3.2.0=r41h03ef668_0 330 | - r-colorspace=2.0_3=r41h06615bd_0 331 | - r-crayon=1.5.0=r41hc72bb7e_0 332 | - r-desc=1.4.0=r41hc72bb7e_0 333 | - r-diffobj=0.3.5=r41hcfec24a_0 334 | - r-digest=0.6.29=r41h03ef668_0 335 | - r-ellipsis=0.3.2=r41hcfec24a_0 336 | - r-evaluate=0.15=r41hc72bb7e_0 337 | - r-fansi=1.0.2=r41hcfec24a_0 338 | - r-farver=2.1.0=r41h03ef668_0 339 | - r-ggplot2=3.3.5=r41hc72bb7e_0 340 | - r-glue=1.6.2=r41h06615bd_0 341 | - r-gplots=3.1.1=r41hc72bb7e_0 342 | - r-gtable=0.3.0=r41hc72bb7e_3 343 | - r-gtools=3.9.2=r41hcfec24a_0 344 | - r-isoband=0.2.5=r41h03ef668_0 345 | - r-jsonlite=1.8.0=r41h06615bd_0 346 | - r-kernsmooth=2.23_20=r41h742201e_0 347 | - r-labeling=0.4.2=r41hc72bb7e_1 348 | - r-lattice=0.20_45=r41hcfec24a_0 349 | - r-lifecycle=1.0.1=r41hc72bb7e_0 350 | - r-magrittr=2.0.2=r41hcfec24a_0 351 | - r-mass=7.3_55=r41hcfec24a_0 352 | - r-matrix=1.4_0=r41he454529_0 353 | - r-mgcv=1.8_39=r41h0154571_0 354 | - r-munsell=0.5.0=r41hc72bb7e_1004 355 | - r-nlme=3.1_155=r41h859d828_0 356 | - r-pillar=1.7.0=r41hc72bb7e_0 357 | - r-pkgconfig=2.0.3=r41hc72bb7e_1 358 | - r-pkgload=1.2.4=r41h03ef668_0 359 | - r-praise=1.0.0=r41hc72bb7e_1005 
360 | - r-processx=3.5.2=r41hcfec24a_0 361 | - r-ps=1.6.0=r41hcfec24a_0 362 | - r-r6=2.5.1=r41hc72bb7e_0 363 | - r-randomforest=4.6_14=r41h859d828_1004 364 | - r-rcolorbrewer=1.1_2=r41h785f33e_1003 365 | - r-rcpp=1.0.8=r41h03ef668_0 366 | - r-rematch2=2.1.2=r41hc72bb7e_1 367 | - r-rlang=0.4.12=r41hcfec24a_0 368 | - r-rocr=1.0_11=r41hc72bb7e_1 369 | - r-rprojroot=2.0.2=r41hc72bb7e_0 370 | - r-rstudioapi=0.13=r41hc72bb7e_0 371 | - r-scales=1.1.1=r41hc72bb7e_0 372 | - r-testthat=3.1.2=r41h03ef668_0 373 | - r-tibble=3.1.6=r41hcfec24a_0 374 | - r-utf8=1.2.2=r41hcfec24a_0 375 | - r-vctrs=0.3.8=r41hcfec24a_1 376 | - r-viridislite=0.4.0=r41hc72bb7e_0 377 | - r-waldo=0.3.1=r41hc72bb7e_0 378 | - r-withr=2.5.0=r41hc72bb7e_0 379 | - readline=8.1=h46c0cb4_0 380 | - seqkit=2.1.0=h9ee0642_0 381 | - setuptools=49.6.0=py36h5fab9bb_3 382 | - six=1.15.0=py_0 383 | - sqlite=3.37.0=h9cd32fc_0 384 | - stringtie=2.2.1=hecb563c_2 385 | - sysroot_linux-64=2.12=he073ed8_15 386 | - tidyp=1.04=hec16e2b_4 387 | - tk=8.6.12=h27826a3_0 388 | - tktable=2.10=hb7b940f_3 389 | - transdecoder=5.5.0=pl5262hdfd78af_4 390 | - wheel=0.37.1=pyhd8ed1ab_0 391 | - xorg-kbproto=1.0.7=h7f98852_1002 392 | - xorg-libice=1.0.10=h7f98852_0 393 | - xorg-libsm=1.2.3=hd9c2040_1000 394 | - xorg-libx11=1.7.2=h7f98852_0 395 | - xorg-libxau=1.0.9=h7f98852_0 396 | - xorg-libxdmcp=1.1.3=h7f98852_0 397 | - xorg-libxext=1.3.4=h7f98852_1 398 | - xorg-libxrender=0.9.10=h7f98852_1003 399 | - xorg-libxt=1.2.1=h7f98852_2 400 | - xorg-renderproto=0.11.1=h7f98852_1002 401 | - xorg-xextproto=7.3.0=h7f98852_1002 402 | - xorg-xproto=7.0.31=h7f98852_1007 403 | - xz=5.2.5=h516909a_1 404 | - zlib=1.2.11=h36c2ea0_1013 405 | - zstd=1.5.2=ha95c52a_0 406 | -------------------------------------------------------------------------------- /makefile.nf: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env nextflow 2 | 3 | /* 4 | * pipeline input parameters 5 | */ 6 | 7 | params.workdir = './' 8 | params.conda = './environment.yml' 9 | 10 | println """\ 11 | M A K E F I L E 12 | ======================================= 13 | working_directory : ${params.workdir} 14 | environment : ${params.conda} 15 | """ 16 | .stripIndent() 17 | 18 | 19 | process make_and_install { 20 | echo true 21 | stageInMode 'copy' 22 | conda "${params.conda}" 23 | 24 | shell: 25 | ''' 26 | cd "!{params.workdir}" 27 | dir=$(cd -P -- "$(dirname -- "$0")" && pwd -P) 28 | rm -r -f swissprot 29 | mkdir swissprot 30 | cd swissprot 31 | wget --ignore-length ftp://ftp.ncbi.nlm.nih.gov/blast/db/swissprot.tar.gz 32 | gunzip swissprot.tar.gz 33 | tar -xvf swissprot.tar 34 | SWISSPROT_PATH=$PWD 35 | echo "$SWISSPROT_PATH" 36 | cd "!{params.workdir}"/test/ 37 | exec 3<> gawn_config.sh 38 | #!/bin/bash >> gawn_config.sh 39 | echo "" >> gawn_config.sh 40 | # Modify the following parameter values according to your experiment >> gawn_config.sh 41 | # Do not modify the parameter names or remove parameters >> gawn_config.sh 42 | # Do not add spaces around the equal (=) sign >> gawn_config.sh 43 | echo "" >> gawn_config.sh 44 | # Global parameters >> gawn_config.sh 45 | NCPUS=10 # Number of CPUs to use for analyses (int, 1+) >> gawn_config.sh 46 | echo "" >> gawn_config.sh 47 | # Genome indexing >> gawn_config.sh 48 | SKIP_GENOME_INDEXING=1 # 1 to skip genome indexing, 0 to index it >> gawn_config.sh 49 | echo "" >> gawn_config.sh 50 | # Genome annotation with transcriptome >> gawn_config.sh 51 | # NOTE: do not use compressed fasta files >> gawn_config.sh 52 | 
GENOME_NAME="genome.fasta" # Name of genome fasta file found in 03_data >> gawn_config.sh 53 | TRANSCRIPTOME_NAME="transcriptome.fasta" # Name of transcriptome fasta file found in 03_data >> gawn_config.sh 54 | echo "" >> gawn_config.sh 55 | # Path to swissprot database >> gawn_config.sh 56 | echo 'SWISSPROT_DB="'$SWISSPROT_PATH'/swissprot"' >> gawn_config.sh 57 | echo '#' >> gawn_config.sh 58 | exec 3>&- 59 | cd "!{params.workdir}" 60 | mkdir bin 61 | mkdir genome_1 62 | mkdir get_transcripts 63 | cp "!{params.workdir}"/test/gawn_config.sh "!{params.workdir}"/genome_1/ 64 | cp "!{params.workdir}"/test/gawn_config.sh "!{params.workdir}"/bash_scripts/ 65 | cp "!{params.workdir}"/test/gawn_config.sh "!{params.workdir}" 66 | cd "!{params.workdir}" 67 | git clone https://github.com/cfarkas/shc.git 68 | cd shc/ 69 | ./autogen.sh 70 | ./configure 71 | make 72 | # Install 73 | cd "!{params.workdir}" 74 | "!{params.workdir}"/shc/src/shc -f "!{params.workdir}"/bash_scripts/annotate_my_genomes.sh -o ./annotate-my-genomes 75 | "!{params.workdir}"/shc/src/shc -f "!{params.workdir}"/bash_scripts/annotate_my_genomes.sh -o ./annotate-my-genomes 76 | "!{params.workdir}"/shc/src/shc -f "!{params.workdir}"/bash_scripts/get_transcripts.sh -o ./get-transcripts 77 | "!{params.workdir}"/shc/src/shc -f "!{params.workdir}"/bash_scripts/genome_download.sh -o ./genome-download 78 | "!{params.workdir}"/shc/src/shc -f "!{params.workdir}"/bash_scripts/genome_download_macOSX.sh -o ./genome-download-macOSX 79 | "!{params.workdir}"/shc/src/shc -f "!{params.workdir}"/bash_scripts/add_ncbi_annotation.sh -o ./add-ncbi-annotation 80 | "!{params.workdir}"/shc/src/shc -f "!{params.workdir}"/bash_scripts/isoform_identification.sh -o ./isoform-identification 81 | mv "!{params.workdir}"/annotate-my-genomes "!{params.workdir}"/get-transcripts "!{params.workdir}"/genome-download "!{params.workdir}"/genome-download-macOSX "!{params.workdir}"/add-ncbi-annotation "!{params.workdir}"/isoform-identification "!{params.workdir}"/bin/ 82 | cp "!{params.workdir}"/bin/annotate-my-genomes "!{params.workdir}"/test/ 83 | cp "!{params.workdir}"/bin/annotate-my-genomes "!{params.workdir}"/genome_1/ 84 | cp "!{params.workdir}"/bin/genome-download "!{params.workdir}"/test/ 85 | cp "!{params.workdir}"/bin/genome-download "!{params.workdir}"/genome_1/ 86 | cp "!{params.workdir}"/bin/genome-download-macOSX "!{params.workdir}"/test/ 87 | cp "!{params.workdir}"/bin/genome-download-macOSX "!{params.workdir}"/genome_1/ 88 | cp "!{params.workdir}"/bin/add-ncbi-annotation "!{params.workdir}"/test/ 89 | cp "!{params.workdir}"/bin/add-ncbi-annotation "!{params.workdir}"/genome_1/ 90 | cp "!{params.workdir}"/bin/isoform-identification "!{params.workdir}"/test/ 91 | cp "!{params.workdir}"/bin/isoform-identification "!{params.workdir}"/genome_1/ 92 | cp "!{params.workdir}"/bin/get-transcripts "!{params.workdir}"/get_transcripts/ 93 | cp "!{params.workdir}"/bin/genome-download "!{params.workdir}"/get_transcripts/ 94 | echo "" 95 | echo "::: All done. Binaries are located in "!{params.workdir}"/bin/ folder. 
:::" 96 | echo "" 97 | echo "::: With sudo privileges, users can do : sudo cp ./bin/* /usr/local/bin/ :::" 98 | echo "" 99 | echo "" 100 | ''' 101 | } 102 | -------------------------------------------------------------------------------- /makefile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | dir=$(cd -P -- "$(dirname -- "$0")" && pwd -P) 4 | rm -r -f swissprot 5 | mkdir swissprot 6 | cd swissprot 7 | wget ftp://ftp.ncbi.nlm.nih.gov/blast/db/swissprot.tar.gz 8 | gunzip swissprot.tar.gz 9 | tar -xvf swissprot.tar 10 | SWISSPROT_PATH=$PWD 11 | echo "$SWISSPROT_PATH" 12 | cd .. 13 | cd test 14 | echo 'SWISSPROT_DB="'$SWISSPROT_PATH'/swissprot"' >> gawn_config.sh 15 | echo '#' >> gawn_config.sh 16 | cd .. 17 | mkdir bin 18 | mkdir genome_1 19 | mkdir get_transcripts 20 | cp ./test/gawn_config.sh ./genome_1/ 21 | cp ./test/gawn_config.sh ./bash_scripts/ 22 | cp ./test/gawn_config.sh ./ 23 | git clone https://github.com/cfarkas/shc.git 24 | cd shc/ 25 | ./autogen.sh 26 | ./configure 27 | make 28 | cd .. 29 | echo "" 30 | echo "make done. Continue with install" 31 | # Install 32 | ./shc/src/shc -f ./bash_scripts/annotate_my_genomes.sh -o ./annotate-my-genomes 33 | ./shc/src/shc -f ./bash_scripts/get_transcripts.sh -o ./get-transcripts 34 | ./shc/src/shc -f ./bash_scripts/genome_download.sh -o ./genome-download 35 | ./shc/src/shc -f ./bash_scripts/genome_download_macOSX.sh -o ./genome-download-macOSX 36 | ./shc/src/shc -f ./bash_scripts/add_ncbi_annotation.sh -o ./add-ncbi-annotation 37 | ./shc/src/shc -f ./bash_scripts/isoform_identification.sh -o ./isoform-identification 38 | mv annotate-my-genomes get-transcripts genome-download genome-download-macOSX add-ncbi-annotation isoform-identification ./bin/ 39 | cp ./bin/annotate-my-genomes ./test/ 40 | cp ./bin/annotate-my-genomes ./genome_1/ 41 | cp ./bin/genome-download ./test/ 42 | cp ./bin/genome-download ./genome_1/ 43 | cp ./bin/genome-download-macOSX ./test/ 44 | cp ./bin/genome-download-macOSX ./genome_1/ 45 | cp ./bin/add-ncbi-annotation ./test/ 46 | cp ./bin/add-ncbi-annotation ./genome_1/ 47 | cp ./bin/isoform-identification ./test/ 48 | cp ./bin/isoform-identification ./genome_1/ 49 | cp ./bin/get-transcripts ./get_transcripts/ 50 | cp ./bin/genome-download ./get_transcripts/ 51 | echo "::: All done. Binaries are located in ./bin/ folder. 
:::" 52 | echo "" 53 | echo "::: With sudo privileges, users can do : sudo cp ./bin/* /usr/local/bin/ :::" 54 | echo "" 55 | # 56 | -------------------------------------------------------------------------------- /nextflow_scripts/22.04_environment.yml: -------------------------------------------------------------------------------- 1 | name: annotate_my_genomes 2 | channels: 3 | - anaconda 4 | - conda-forge 5 | - bioconda 6 | - defaults 7 | - r 8 | dependencies: 9 | - _libgcc_mutex=0.1 10 | - _openmp_mutex=4.5 11 | - _r-mutex=1.0.1 12 | - argtable2=2.13 13 | - atk-1.0=2.36.0 14 | - bedtools=2.30.0 15 | - binutils_impl_linux-64=2.39 16 | - binutils_linux-64=2.39 17 | - bottleneck=1.3.5 18 | - bwidget=1.9.14 19 | - bzip2=1.0.8 20 | - c-ares=1.18.1 21 | - ca-certificates=2022.07.19 22 | - cairo=1.16.0 23 | - clustalo=1.2.4 24 | - clustalw=2.1 25 | - coreutils=9.1 26 | - curl=7.83.1 27 | - emboss=6.6.0 28 | - expat=2.5.0 29 | - fasta_ushuffle=0.2 30 | - feelnc=0.2 31 | - font-ttf-dejavu-sans-mono=2.37 32 | - font-ttf-inconsolata=3.000 33 | - font-ttf-source-code-pro=2.038 34 | - font-ttf-ubuntu=0.83 35 | - fontconfig=2.14.0 36 | - fonts-conda-ecosystem=1 37 | - fonts-conda-forge=1 38 | - freetype=2.10.4 39 | - fribidi=1.0.10 40 | - gawk=5.1.0 41 | - gcc_impl_linux-64=10.4.0 42 | - gcc_linux-64=10.4.0 43 | - gdk-pixbuf=2.42.8 44 | - gettext=0.21.1 45 | - gffcompare=0.11.2 46 | - gffread=0.12.7 47 | - gfortran_impl_linux-64=10.4.0 48 | - gfortran_linux-64=10.4.0 49 | - giflib=5.2.1 50 | - gmap=2021.08.25 51 | - graphite2=1.3.13 52 | - graphviz=2.50.0 53 | - gsl=2.7 54 | - gtk2=2.24.33 55 | - gts=0.7.6 56 | - gxx_impl_linux-64=10.4.0 57 | - gxx_linux-64=10.4.0 58 | - harfbuzz=4.4.1 59 | - htslib=1.14 60 | - icu=70.1 61 | - jpeg=9e 62 | - k8=0.2.5 63 | - kernel-headers_linux-64=2.6.32 64 | - keyutils=1.6.1 65 | - kmerinshort=1.0.1 66 | - krb5=1.19.3 67 | - ld_impl_linux-64=2.39 68 | - lerc=3.0 69 | - libblas=3.9.0 70 | - libcblas=3.9.0 71 | - libcurl=7.83.1 72 | - libdb=6.2.32 73 | - libdeflate=1.10 74 | - libedit=3.1.20191231 75 | - libev=4.33 76 | - libffi=3.4.2 77 | - libgcc=7.2.0 78 | - libgcc-devel_linux-64=10.4.0 79 | - libgcc-ng=12.2.0 80 | - libgd=2.3.3 81 | - libgfortran-ng=12.2.0 82 | - libgfortran5=12.2.0 83 | - libglib=2.70.2 84 | - libgomp=12.2.0 85 | - libiconv=1.17 86 | - liblapack=3.9.0 87 | - libnghttp2=1.47.0 88 | - libnsl=2.0.0 89 | - libopenblas=0.3.21 90 | - libpng=1.6.37 91 | - librsvg=2.54.4 92 | - libsanitizer=10.4.0 93 | - libssh2=1.10.0 94 | - libstdcxx-devel_linux-64=10.4.0 95 | - libstdcxx-ng=12.2.0 96 | - libtiff=4.3.0 97 | - libtool=2.4.6 98 | - libuuid=2.32.1 99 | - libwebp=1.2.2 100 | - libwebp-base=1.2.2 101 | - libxcb=1.13 102 | - libxml2=2.9.14 103 | - libxslt=1.1.33 104 | - libzlib=1.2.11 105 | - lz4-c=1.9.3 106 | - make=4.3 107 | - minimap2=2.24 108 | - ncurses=6.3 109 | - nomkl=1.0 110 | - numexpr=2.8.3 111 | - numpy=1.21.6 112 | - openssl=1.1.1q 113 | - packaging=21.3 114 | - paml=4.9 115 | - pandas=1.3.5 116 | - pango=1.50.8 117 | - parallel=20220922 118 | - pcre=8.45 119 | - pcre2=10.37 120 | - perl=5.26.2 121 | - perl-aceperl=1.92 122 | - perl-algorithm-diff=1.1903 123 | - perl-algorithm-munkres=0.08 124 | - perl-apache-test=1.40 125 | - perl-app-cpanminus=1.7044 126 | - perl-appconfig=1.71 127 | - perl-array-compare=3.0.1 128 | - perl-autoloader=5.74 129 | - perl-base=2.23 130 | - perl-bio-asn1-entrezgene=1.73 131 | - perl-bio-coordinate=1.007001 132 | - perl-bio-featureio=1.6.905 133 | - perl-bio-phylo=0.58 134 | - perl-bio-samtools=1.43 135 | - 
perl-bio-tools-phylo-paml=1.7.3 136 | - perl-bio-tools-run-alignment-clustalw=1.7.4 137 | - perl-bio-tools-run-alignment-tcoffee=1.7.4 138 | - perl-bioperl=1.7.2 139 | - perl-bioperl-core=1.007002 140 | - perl-bioperl-run=1.007002 141 | - perl-business-isbn=3.004 142 | - perl-business-isbn-data=20140910.003 143 | - perl-cache-cache=1.08 144 | - perl-capture-tiny=0.48 145 | - perl-carp=1.38 146 | - perl-cgi=4.44 147 | - perl-class-data-inheritable=0.08 148 | - perl-class-inspector=1.34 149 | - perl-class-load=0.25 150 | - perl-class-load-xs=0.10 151 | - perl-class-method-modifiers=2.12 152 | - perl-clone=0.42 153 | - perl-common-sense=3.74 154 | - perl-compress-raw-zlib=2.087 155 | - perl-constant=1.33 156 | - perl-convert-binary-c=0.78 157 | - perl-convert-binhex=1.125 158 | - perl-crypt-rc4=2.02 159 | - perl-data-dumper=2.173 160 | - perl-data-optlist=0.110 161 | - perl-data-stag=0.14 162 | - perl-date-format=2.30 163 | - perl-db-file=1.855 164 | - perl-dbd-sqlite=1.64 165 | - perl-dbi=1.642 166 | - perl-devel-globaldestruction=0.14 167 | - perl-devel-overloadinfo=0.005 168 | - perl-devel-stacktrace=2.04 169 | - perl-digest-hmac=1.03 170 | - perl-digest-md5=2.55 171 | - perl-digest-perl-md5=1.9 172 | - perl-digest-sha1=2.13 173 | - perl-dist-checkconflicts=0.11 174 | - perl-dynaloader=1.25 175 | - perl-email-date-format=1.005 176 | - perl-encode=2.88 177 | - perl-encode-locale=1.05 178 | - perl-error=0.17027 179 | - perl-eval-closure=0.14 180 | - perl-exception-class=1.44 181 | - perl-exporter=5.72 182 | - perl-exporter-tiny=1.002001 183 | - perl-extutils-makemaker=7.36 184 | - perl-file-listing=6.04 185 | - perl-file-path=2.16 186 | - perl-file-slurp-tiny=0.004 187 | - perl-file-sort=1.01 188 | - perl-file-temp=0.2304 189 | - perl-file-which=1.23 190 | - perl-font-afm=1.20 191 | - perl-font-ttf=1.06 192 | - perl-gd=2.68 193 | - perl-getopt-long=2.50 194 | - perl-graph=0.9704 195 | - perl-graphviz=2.24 196 | - perl-html-element-extended=1.18 197 | - perl-html-entities-numbered=0.04 198 | - perl-html-formatter=2.16 199 | - perl-html-parser=3.72 200 | - perl-html-tableextract=2.13 201 | - perl-html-tagset=3.20 202 | - perl-html-tidy=1.60 203 | - perl-html-tree=5.07 204 | - perl-html-treebuilder-xpath=0.14 205 | - perl-http-cookies=6.04 206 | - perl-http-daemon=6.01 207 | - perl-http-date=6.02 208 | - perl-http-message=6.18 209 | - perl-http-negotiate=6.01 210 | - perl-image-info=1.38 211 | - perl-image-size=3.300 212 | - perl-io-html=1.001 213 | - perl-io-sessiondata=1.03 214 | - perl-io-socket-ssl=2.066 215 | - perl-io-string=1.08 216 | - perl-io-stringy=2.111 217 | - perl-io-tty=1.12 218 | - perl-ipc-run=20180523.0 219 | - perl-ipc-sharelite=0.17 220 | - perl-jcode=2.07 221 | - perl-json=4.02 222 | - perl-json-xs=2.34 223 | - perl-lib=0.63 224 | - perl-libwww-perl=6.39 225 | - perl-libxml-perl=0.08 226 | - perl-list-moreutils=0.428 227 | - perl-list-moreutils-xs=0.428 228 | - perl-local-lib=2.000024 229 | - perl-lwp-mediatypes=6.04 230 | - perl-lwp-protocol-https=6.07 231 | - perl-lwp-simple=6.15 232 | - perl-mailtools=2.21 233 | - perl-math-cdf=0.1 234 | - perl-math-derivative=1.01 235 | - perl-math-random=0.72 236 | - perl-math-spline=0.02 237 | - perl-mime-base64=3.15 238 | - perl-mime-lite=3.030 239 | - perl-mime-tools=5.508 240 | - perl-mime-types=2.17 241 | - perl-mldbm=2.05 242 | - perl-module-build=0.4224 243 | - perl-module-implementation=0.09 244 | - perl-module-runtime=0.016 245 | - perl-module-runtime-conflicts=0.003 246 | - perl-moo=2.003004 247 | - perl-moose=2.2011 248 | - 
perl-mozilla-ca=20180117 249 | - perl-mro-compat=0.13 250 | - perl-net-http=6.19 251 | - perl-net-ssleay=1.88 252 | - perl-ntlm=1.09 253 | - perl-ole-storage_lite=0.19 254 | - perl-package-deprecationmanager=0.17 255 | - perl-package-stash=0.38 256 | - perl-package-stash-xs=0.28 257 | - perl-parallel-forkmanager=2.02 258 | - perl-params-util=1.07 259 | - perl-parent=0.236 260 | - perl-parse-recdescent=1.967015 261 | - perl-pathtools=3.75 262 | - perl-pdf-api2=2.035 263 | - perl-pod-escapes=1.07 264 | - perl-pod-usage=1.69 265 | - perl-postscript=0.06 266 | - perl-role-tiny=2.000008 267 | - perl-scalar-list-utils=1.52 268 | - perl-set-scalar=1.29 269 | - perl-soap-lite=1.19 270 | - perl-socket=2.027 271 | - perl-sort-naturally=1.03 272 | - perl-spreadsheet-parseexcel=0.65 273 | - perl-spreadsheet-writeexcel=2.40 274 | - perl-statistics-descriptive=3.0702 275 | - perl-storable=3.15 276 | - perl-sub-exporter=0.987 277 | - perl-sub-exporter-progressive=0.001013 278 | - perl-sub-identify=0.14 279 | - perl-sub-install=0.928 280 | - perl-sub-name=0.21 281 | - perl-sub-quote=2.006003 282 | - perl-sub-uplevel=0.2800 283 | - perl-svg=2.84 284 | - perl-svg-graph=0.02 285 | - perl-task-weaken=1.06 286 | - perl-template-toolkit=2.26 287 | - perl-test=1.26 288 | - perl-test-deep=1.128 289 | - perl-test-differences=0.67 290 | - perl-test-exception=0.43 291 | - perl-test-harness=3.42 292 | - perl-test-leaktrace=0.16 293 | - perl-test-most=0.35 294 | - perl-test-requiresinternet=0.05 295 | - perl-test-warn=0.36 296 | - perl-text-diff=1.45 297 | - perl-tie-ixhash=1.23 298 | - perl-time-hires=1.9760 299 | - perl-time-local=1.28 300 | - perl-timedate=2.30 301 | - perl-tree-dag_node=1.31 302 | - perl-try-tiny=0.30 303 | - perl-type-tiny=1.004004 304 | - perl-types-serialiser=1.0 305 | - perl-unicode-map=0.112 306 | - perl-uri=1.76 307 | - perl-www-robotrules=6.02 308 | - perl-xml-dom=1.46 309 | - perl-xml-dom-xpath=0.14 310 | - perl-xml-filter-buffertext=1.01 311 | - perl-xml-libxml=2.0132 312 | - perl-xml-libxslt=1.94 313 | - perl-xml-namespacesupport=1.12 314 | - perl-xml-parser=2.44_01 315 | - perl-xml-regexp=0.04 316 | - perl-xml-sax=1.02 317 | - perl-xml-sax-base=1.09 318 | - perl-xml-sax-expat=0.51 319 | - perl-xml-sax-writer=0.57 320 | - perl-xml-simple=2.25 321 | - perl-xml-twig=3.52 322 | - perl-xml-writer=0.625 323 | - perl-xml-xpath=1.44 324 | - perl-xml-xpathengine=0.14 325 | - perl-xsloader=0.24 326 | - perl-yaml=1.29 327 | - pip=22.3.1 328 | - pixman=0.40.0 329 | - pthread-stubs=0.4 330 | - pyparsing=3.0.4 331 | - python=3.7.12 332 | - python-dateutil=2.8.2 333 | - python_abi=3.7 334 | - pytz=2022.1 335 | - r-base=4.1.3 336 | - r-bitops=1.0_7 337 | - r-catools=1.18.2 338 | - r-gplots=3.1.3 339 | - r-gtools=3.9.3 340 | - r-kernsmooth=2.23_20 341 | - r-randomforest=4.7_1.1 342 | - r-rocr=1.0_11 343 | - readline=8.1.2 344 | - sed=4.8 345 | - seqkit=2.3.1 346 | - setuptools=65.5.1 347 | - six=1.16.0 348 | - sqlite=3.38.5 349 | - stringtie=2.2.1 350 | - sysroot_linux-64=2.12 351 | - t_coffee=11.0.8 352 | - tidyp=1.04 353 | - tk=8.6.12 354 | - tktable=2.10 355 | - transdecoder=5.5.0 356 | - wheel=0.38.4 357 | - xorg-kbproto=1.0.7 358 | - xorg-libice=1.0.10 359 | - xorg-libsm=1.2.3 360 | - xorg-libx11=1.7.2 361 | - xorg-libxau=1.0.9 362 | - xorg-libxdmcp=1.1.3 363 | - xorg-libxext=1.3.4 364 | - xorg-libxrender=0.9.10 365 | - xorg-libxt=1.2.1 366 | - xorg-renderproto=0.11.1 367 | - xorg-xextproto=7.3.0 368 | - xorg-xproto=7.0.31 369 | - xz=5.2.6 370 | - zlib=1.2.11 371 | - zstd=1.5.2 372 | 
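Note on using the pinned environment above: it can be created once, up front, rather than letting each pipeline build it. A minimal sketch, assuming conda (or mamba) is already installed and the command is run from the repository root; the environment name (annotate_my_genomes) is taken from the name: field of the file:

    conda env create -f nextflow_scripts/22.04_environment.yml   # or: mamba env create -f nextflow_scripts/22.04_environment.yml
    conda activate annotate_my_genomes

The Nextflow scripts in this directory can instead receive this file via their --conda parameter, in which case Nextflow creates and caches the environment itself on first run.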
-------------------------------------------------------------------------------- /nextflow_scripts/environment.yml: -------------------------------------------------------------------------------- 1 | name: annotate_my_genomes 2 | channels: 3 | - anaconda 4 | - conda-forge 5 | - bioconda 6 | - defaults 7 | dependencies: 8 | - _libgcc_mutex=0.1=conda_forge 9 | - _openmp_mutex=4.5=1_gnu 10 | - _r-mutex=1.0.1=anacondar_1 11 | - argtable2=2.13=h14c3975_1001 12 | - atk-1.0=2.36.0=h3371d22_4 13 | - bedtools=2.30.0=h468198e_3 14 | - binutils_impl_linux-64=2.36.1=h193b22a_2 15 | - binutils_linux-64=2.36=hf3e587d_7 16 | - bioconductor-seqlogo=1.60.0=r41hdfd78af_0 17 | - bwidget=1.9.14=ha770c72_1 18 | - bzip2=1.0.8=h7f98852_4 19 | - c-ares=1.18.1=h7f98852_0 20 | - ca-certificates=2020.10.14=0 21 | - cairo=1.16.0=ha12eb4b_1010 22 | - certifi=2020.6.20=py36_0 23 | - clustalo=1.2.4=h87f3376_5 24 | - coreutils=9.0=h7f98852_0 25 | - curl=7.82.0=h7bff187_0 26 | - emboss=6.6.0=h5a44aac_5 27 | - expat=2.4.7=h27087fc_0 28 | - fasta_ushuffle=0.2=hec16e2b_4 29 | - feelnc=0.2=pl526_0 30 | - font-ttf-dejavu-sans-mono=2.37=hab24e00_0 31 | - font-ttf-inconsolata=3.000=h77eed37_0 32 | - font-ttf-source-code-pro=2.038=h77eed37_0 33 | - font-ttf-ubuntu=0.83=hab24e00_0 34 | - fontconfig=2.13.96=h8e229c2_2 35 | - fonts-conda-ecosystem=1=0 36 | - fonts-conda-forge=1=0 37 | - freetype=2.10.4=h0708190_1 38 | - fribidi=1.0.10=h36c2ea0_0 39 | - gawk=5.1.0=h7b6447c_0 40 | - gcc_impl_linux-64=9.4.0=h03d3576_13 41 | - gcc_linux-64=9.4.0=h391b98a_7 42 | - gdk-pixbuf=2.42.6=h04a7f16_0 43 | - gettext=0.19.8.1=h73d1719_1008 44 | - gffcompare=0.11.2=h9f5acd7_3 45 | - gffread=0.12.7=hd03093a_1 46 | - gfortran_impl_linux-64=9.4.0=h0003116_13 47 | - gfortran_linux-64=9.4.0=hf0ab688_7 48 | - giflib=5.2.1=h36c2ea0_2 49 | - gmap=2021.08.25=pl5262h36cd882_0 50 | - graphite2=1.3.13=h58526e2_1001 51 | - graphviz=3.0.0=h5abf519_0 52 | - gsl=2.7=he838d99_0 53 | - gtk2=2.24.33=h90689f9_2 54 | - gts=0.7.6=h64030ff_2 55 | - gxx_impl_linux-64=9.4.0=h03d3576_13 56 | - gxx_linux-64=9.4.0=h0316aca_7 57 | - harfbuzz=3.4.0=hb4a5f5f_0 58 | - htslib=1.14=h9753748_2 59 | - icu=69.1=h9c3ff4c_0 60 | - jbig=2.1=h7f98852_2003 61 | - jpeg=9e=h7f98852_0 62 | - k8=0.2.5=hd03093a_2 63 | - kernel-headers_linux-64=2.6.32=he073ed8_15 64 | - keyutils=1.6.1=h166bdaf_0 65 | - kmerinshort=1.0.1=0 66 | - krb5=1.19.2=h3790be6_4 67 | - ld_impl_linux-64=2.36.1=hea4e1c9_2 68 | - lerc=3.0=h9c3ff4c_0 69 | - libblas=3.9.0=13_linux64_openblas 70 | - libcblas=3.9.0=13_linux64_openblas 71 | - libcurl=7.82.0=h7bff187_0 72 | - libdb=6.2.32=h9c3ff4c_0 73 | - libdeflate=1.10=h7f98852_0 74 | - libedit=3.1.20191231=he28a2e2_2 75 | - libev=4.33=h516909a_1 76 | - libffi=3.4.2=h7f98852_5 77 | - libgcc=7.2.0=h69d50b8_2 78 | - libgcc-devel_linux-64=9.4.0=hd854feb_13 79 | - libgcc-ng=11.2.0=h1d223b6_13 80 | - libgd=2.3.3=h283352f_2 81 | - libgfortran-ng=11.2.0=h69a702a_13 82 | - libgfortran5=11.2.0=h5c6108e_13 83 | - libglib=2.70.2=h174f98d_4 84 | - libgomp=11.2.0=h1d223b6_13 85 | - libiconv=1.16=h516909a_0 86 | - liblapack=3.9.0=13_linux64_openblas 87 | - libnghttp2=1.47.0=h727a467_0 88 | - libnsl=2.0.0=h7f98852_0 89 | - libopenblas=0.3.18=pthreads_h8fe5266_0 90 | - libpng=1.6.37=h21135ba_2 91 | - librsvg=2.52.5=h0a9e6e8_2 92 | - libsanitizer=9.4.0=h79bfe98_13 93 | - libssh2=1.10.0=ha56f1ee_2 94 | - libstdcxx-devel_linux-64=9.4.0=hd854feb_13 95 | - libstdcxx-ng=11.2.0=he4da1e4_13 96 | - libtiff=4.3.0=h542a066_3 97 | - libtool=2.4.6=h9c3ff4c_1008 98 | - libuuid=2.32.1=h7f98852_1000 99 | - 
libwebp=1.2.2=h3452ae3_0 100 | - libwebp-base=1.2.2=h7f98852_1 101 | - libxcb=1.13=h7f98852_1004 102 | - libxml2=2.9.12=h885dcf4_1 103 | - libxslt=1.1.33=h0ef7038_3 104 | - libzlib=1.2.11=h36c2ea0_1013 105 | - lz4-c=1.9.3=h9c3ff4c_1 106 | - make=4.3=hd18ef5c_1 107 | - minimap2=2.24=h7132678_1 108 | - ncurses=6.2=h58526e2_4 109 | - numpy=1.19.5=py36hfc0c790_2 110 | - openssl=1.1.1l=h7f98852_0 111 | - pandas=1.1.3=py36he6710b0_0 112 | - pango=1.50.5=h4dcc4a0_0 113 | - parallel=20220222=ha770c72_0 114 | - pcre=8.45=h9c3ff4c_0 115 | - pcre2=10.37=h032f7d1_0 116 | - perl=5.26.2=h36c2ea0_1008 117 | - perl-aceperl=1.92=pl526_2 118 | - perl-algorithm-diff=1.1903=pl526_2 119 | - perl-algorithm-munkres=0.08=pl526_1 120 | - perl-apache-test=1.40=pl526_1 121 | - perl-app-cpanminus=1.7044=pl526_1 122 | - perl-appconfig=1.71=pl526_1 123 | - perl-array-compare=3.0.1=pl526_1 124 | - perl-autoloader=5.74=pl526_2 125 | - perl-base=2.23=pl526_1 126 | - perl-bio-asn1-entrezgene=1.73=pl5262hdfd78af_2 127 | - perl-bio-featureio=1.6.905=pl5262hdfd78af_3 128 | - perl-bio-phylo=0.58=pl5262hdfd78af_3 129 | - perl-bio-samtools=1.43=pl526h1341992_1 130 | - perl-bioperl=1.6.924=6 131 | - perl-bioperl-core=1.007002=pl5262hdfd78af_3 132 | - perl-bioperl-run=1.007002=pl5262hdfd78af_5 133 | - perl-business-isbn=3.004=pl526_0 134 | - perl-business-isbn-data=20140910.003=pl526_0 135 | - perl-cache-cache=1.08=pl526_0 136 | - perl-capture-tiny=0.48=pl526_0 137 | - perl-carp=1.38=pl526_3 138 | - perl-cgi=4.44=pl526h14c3975_1 139 | - perl-class-data-inheritable=0.08=pl526_1 140 | - perl-class-inspector=1.34=pl526_0 141 | - perl-class-load=0.25=pl526_0 142 | - perl-class-load-xs=0.10=pl526h6bb024c_2 143 | - perl-class-method-modifiers=2.12=pl526_0 144 | - perl-clone=0.42=pl526h516909a_0 145 | - perl-common-sense=3.74=pl526_2 146 | - perl-compress-raw-zlib=2.087=pl526hc9558a2_0 147 | - perl-constant=1.33=pl526_1 148 | - perl-convert-binary-c=0.78=pl526h6bb024c_3 149 | - perl-convert-binhex=1.125=pl526_1 150 | - perl-crypt-rc4=2.02=pl526_1 151 | - perl-data-dumper=2.173=pl526_0 152 | - perl-data-optlist=0.110=pl526_2 153 | - perl-data-stag=0.14=pl526_1 154 | - perl-date-format=2.30=pl526_2 155 | - perl-db-file=1.855=pl526h516909a_0 156 | - perl-dbd-sqlite=1.64=pl526h516909a_0 157 | - perl-dbi=1.642=pl526_0 158 | - perl-devel-globaldestruction=0.14=pl526_0 159 | - perl-devel-overloadinfo=0.005=pl526_0 160 | - perl-devel-stacktrace=2.04=pl526_0 161 | - perl-digest-hmac=1.03=pl526_3 162 | - perl-digest-md5=2.55=pl526_0 163 | - perl-digest-perl-md5=1.9=pl526_1 164 | - perl-digest-sha1=2.13=pl526h6bb024c_1 165 | - perl-dist-checkconflicts=0.11=pl526_2 166 | - perl-dynaloader=1.25=pl526_1 167 | - perl-email-date-format=1.005=pl526_2 168 | - perl-encode=2.88=pl526_1 169 | - perl-encode-locale=1.05=pl526_6 170 | - perl-error=0.17027=pl526_1 171 | - perl-eval-closure=0.14=pl526h6bb024c_4 172 | - perl-exception-class=1.44=pl526_0 173 | - perl-exporter=5.72=pl526_1 174 | - perl-exporter-tiny=1.002001=pl526_0 175 | - perl-extutils-makemaker=7.36=pl526_1 176 | - perl-file-listing=6.04=pl526_1 177 | - perl-file-path=2.16=pl526_0 178 | - perl-file-slurp-tiny=0.004=pl526_1 179 | - perl-file-sort=1.01=pl526_2 180 | - perl-file-temp=0.2304=pl526_2 181 | - perl-file-which=1.23=pl526_0 182 | - perl-font-afm=1.20=pl526_2 183 | - perl-font-ttf=1.06=pl526_0 184 | - perl-gd=2.68=pl526he941832_0 185 | - perl-getopt-long=2.50=pl526_1 186 | - perl-graph=0.9704=pl526_1 187 | - perl-graphviz=2.20=1 188 | - perl-html-element-extended=1.18=pl526_1 189 | - 
perl-html-entities-numbered=0.04=pl526_1 190 | - perl-html-formatter=2.16=pl526_0 191 | - perl-html-parser=3.72=pl526h6bb024c_5 192 | - perl-html-tableextract=2.13=pl526_2 193 | - perl-html-tagset=3.20=pl526_3 194 | - perl-html-tidy=1.60=pl526_0 195 | - perl-html-tree=5.07=pl526_1 196 | - perl-html-treebuilder-xpath=0.14=pl526_1 197 | - perl-http-cookies=6.04=pl526_0 198 | - perl-http-daemon=6.01=pl526_1 199 | - perl-http-date=6.02=pl526_3 200 | - perl-http-message=6.18=pl526_0 201 | - perl-http-negotiate=6.01=pl526_3 202 | - perl-image-info=1.38=pl526_1 203 | - perl-image-size=3.300=pl526_2 204 | - perl-io-html=1.001=pl526_2 205 | - perl-io-sessiondata=1.03=pl526_1 206 | - perl-io-socket-ssl=2.066=pl526_0 207 | - perl-io-string=1.08=pl526_3 208 | - perl-io-stringy=2.111=pl526_1 209 | - perl-io-tty=1.12=pl526_1 210 | - perl-ipc-run=20180523.0=pl526_0 211 | - perl-ipc-sharelite=0.17=pl526h6bb024c_1 212 | - perl-jcode=2.07=pl526_2 213 | - perl-json=4.02=pl526_0 214 | - perl-json-xs=2.34=pl526h6bb024c_3 215 | - perl-libwww-perl=6.39=pl526_0 216 | - perl-libxml-perl=0.08=pl526_2 217 | - perl-list-moreutils=0.428=pl526_1 218 | - perl-list-moreutils-xs=0.428=pl526_0 219 | - perl-local-lib=2.000024=pl526_0 220 | - perl-lwp-mediatypes=6.04=pl526_0 221 | - perl-lwp-protocol-https=6.07=pl526_4 222 | - perl-lwp-simple=6.15=pl526h470a237_4 223 | - perl-mailtools=2.21=pl526_0 224 | - perl-math-cdf=0.1=pl526h14c3975_5 225 | - perl-math-derivative=1.01=pl526_0 226 | - perl-math-random=0.72=pl526h14c3975_2 227 | - perl-math-spline=0.02=pl526_2 228 | - perl-mime-base64=3.15=pl526_1 229 | - perl-mime-lite=3.030=pl526_1 230 | - perl-mime-tools=5.508=pl526_1 231 | - perl-mime-types=2.17=pl526_0 232 | - perl-mldbm=2.05=pl526_1 233 | - perl-module-build=0.4224=pl526h470a237_1 234 | - perl-module-implementation=0.09=pl526_2 235 | - perl-module-runtime=0.016=pl526_1 236 | - perl-module-runtime-conflicts=0.003=pl526_0 237 | - perl-moo=2.003004=pl526_0 238 | - perl-moose=2.2011=pl526hf484d3e_1 239 | - perl-mozilla-ca=20180117=pl526_1 240 | - perl-mro-compat=0.13=pl526_0 241 | - perl-net-http=6.19=pl526_0 242 | - perl-net-ssleay=1.88=pl526h90d6eec_0 243 | - perl-ntlm=1.09=pl526_4 244 | - perl-ole-storage_lite=0.19=pl526_3 245 | - perl-package-deprecationmanager=0.17=pl526_0 246 | - perl-package-stash=0.38=pl526hf484d3e_1 247 | - perl-package-stash-xs=0.28=pl526hf484d3e_1 248 | - perl-parallel-forkmanager=2.02=pl526_0 249 | - perl-params-util=1.07=pl526h6bb024c_4 250 | - perl-parent=0.236=pl526_1 251 | - perl-parse-recdescent=1.967015=pl526_0 252 | - perl-pathtools=3.75=pl526h14c3975_1 253 | - perl-pdf-api2=2.035=pl526_0 254 | - perl-postscript=0.06=pl526_2 255 | - perl-role-tiny=2.000008=pl526_0 256 | - perl-scalar-list-utils=1.52=pl526h516909a_0 257 | - perl-set-scalar=1.29=pl526_2 258 | - perl-soap-lite=1.19=pl526_1 259 | - perl-socket=2.027=pl526_1 260 | - perl-sort-naturally=1.03=pl526_2 261 | - perl-spreadsheet-parseexcel=0.65=pl526_2 262 | - perl-spreadsheet-writeexcel=2.40=pl526_2 263 | - perl-statistics-descriptive=3.0702=pl526_0 264 | - perl-storable=3.15=pl526h14c3975_0 265 | - perl-sub-exporter=0.987=pl526_2 266 | - perl-sub-exporter-progressive=0.001013=pl526_0 267 | - perl-sub-identify=0.14=pl526h14c3975_0 268 | - perl-sub-install=0.928=pl526_2 269 | - perl-sub-name=0.21=pl526_1 270 | - perl-sub-quote=2.006003=pl526_1 271 | - perl-sub-uplevel=0.2800=pl526h14c3975_2 272 | - perl-svg=2.84=pl526_0 273 | - perl-svg-graph=0.02=pl526_3 274 | - perl-task-weaken=1.06=pl526_0 275 | - 
perl-template-toolkit=2.26=pl526_1 276 | - perl-test-deep=1.128=pl526_1 277 | - perl-test-differences=0.67=pl526_0 278 | - perl-test-exception=0.43=pl526_2 279 | - perl-test-leaktrace=0.16=pl526h14c3975_2 280 | - perl-test-most=0.35=pl526_0 281 | - perl-test-pod=1.52=pl526_0 282 | - perl-test-requiresinternet=0.05=pl526_0 283 | - perl-test-warn=0.36=pl526_1 284 | - perl-text-diff=1.45=pl526_0 285 | - perl-threaded=5.32.1=hdfd78af_1 286 | - perl-tie-ixhash=1.23=pl526_2 287 | - perl-time-local=1.28=pl526_1 288 | - perl-timedate=2.30=pl526_1 289 | - perl-tree-dag_node=1.31=pl526_0 290 | - perl-try-tiny=0.30=pl526_1 291 | - perl-type-tiny=1.004004=pl526_0 292 | - perl-types-serialiser=1.0=pl526_2 293 | - perl-unicode-map=0.112=pl526h6bb024c_3 294 | - perl-uri=1.76=pl526_0 295 | - perl-www-robotrules=6.02=pl526_3 296 | - perl-xml-dom=1.46=pl526_0 297 | - perl-xml-dom-xpath=0.14=pl526_1 298 | - perl-xml-filter-buffertext=1.01=pl526_2 299 | - perl-xml-libxml=2.0132=pl526h7ec2d77_1 300 | - perl-xml-libxslt=1.94=pl526_1 301 | - perl-xml-namespacesupport=1.12=pl526_0 302 | - perl-xml-parser=2.44_01=pl5262hc3e0081_1002 303 | - perl-xml-regexp=0.04=pl526_2 304 | - perl-xml-sax=1.02=pl526_0 305 | - perl-xml-sax-base=1.09=pl526_0 306 | - perl-xml-sax-expat=0.51=pl526_3 307 | - perl-xml-sax-writer=0.57=pl526_0 308 | - perl-xml-simple=2.25=pl526_1 309 | - perl-xml-twig=3.52=pl526_2 310 | - perl-xml-writer=0.625=pl526_2 311 | - perl-xml-xpath=1.44=pl526_0 312 | - perl-xml-xpathengine=0.14=pl526_2 313 | - perl-xsloader=0.24=pl526_0 314 | - perl-yaml=1.29=pl526_0 315 | - pip=20.0.2=py36_1 316 | - pixman=0.40.0=h36c2ea0_0 317 | - pthread-stubs=0.4=h36c2ea0_1001 318 | - python=3.6.15=hb7a2778_0_cpython 319 | - python-dateutil=2.8.1=py_0 320 | - python_abi=3.6=2_cp36m 321 | - pytz=2020.1=py_0 322 | - r-assertthat=0.2.1=r41hc72bb7e_2 323 | - r-backports=1.4.1=r41hcfec24a_0 324 | - r-base=4.1.2=h2553ce4_1 325 | - r-bitops=1.0_7=r41hcfec24a_0 326 | - r-brio=1.1.3=r41hcfec24a_0 327 | - r-callr=3.7.0=r41hc72bb7e_0 328 | - r-catools=1.18.2=r41h03ef668_0 329 | - r-cli=3.2.0=r41h03ef668_0 330 | - r-colorspace=2.0_3=r41h06615bd_0 331 | - r-crayon=1.5.0=r41hc72bb7e_0 332 | - r-desc=1.4.0=r41hc72bb7e_0 333 | - r-diffobj=0.3.5=r41hcfec24a_0 334 | - r-digest=0.6.29=r41h03ef668_0 335 | - r-ellipsis=0.3.2=r41hcfec24a_0 336 | - r-evaluate=0.15=r41hc72bb7e_0 337 | - r-fansi=1.0.2=r41hcfec24a_0 338 | - r-farver=2.1.0=r41h03ef668_0 339 | - r-ggplot2=3.3.5=r41hc72bb7e_0 340 | - r-glue=1.6.2=r41h06615bd_0 341 | - r-gplots=3.1.1=r41hc72bb7e_0 342 | - r-gtable=0.3.0=r41hc72bb7e_3 343 | - r-gtools=3.9.2=r41hcfec24a_0 344 | - r-isoband=0.2.5=r41h03ef668_0 345 | - r-jsonlite=1.8.0=r41h06615bd_0 346 | - r-kernsmooth=2.23_20=r41h742201e_0 347 | - r-labeling=0.4.2=r41hc72bb7e_1 348 | - r-lattice=0.20_45=r41hcfec24a_0 349 | - r-lifecycle=1.0.1=r41hc72bb7e_0 350 | - r-magrittr=2.0.2=r41hcfec24a_0 351 | - r-mass=7.3_55=r41hcfec24a_0 352 | - r-matrix=1.4_0=r41he454529_0 353 | - r-mgcv=1.8_39=r41h0154571_0 354 | - r-munsell=0.5.0=r41hc72bb7e_1004 355 | - r-nlme=3.1_155=r41h859d828_0 356 | - r-pillar=1.7.0=r41hc72bb7e_0 357 | - r-pkgconfig=2.0.3=r41hc72bb7e_1 358 | - r-pkgload=1.2.4=r41h03ef668_0 359 | - r-praise=1.0.0=r41hc72bb7e_1005 360 | - r-processx=3.5.2=r41hcfec24a_0 361 | - r-ps=1.6.0=r41hcfec24a_0 362 | - r-r6=2.5.1=r41hc72bb7e_0 363 | - r-randomforest=4.6_14=r41h859d828_1004 364 | - r-rcolorbrewer=1.1_2=r41h785f33e_1003 365 | - r-rcpp=1.0.8=r41h03ef668_0 366 | - r-rematch2=2.1.2=r41hc72bb7e_1 367 | - r-rlang=0.4.12=r41hcfec24a_0 368 | 
- r-rocr=1.0_11=r41hc72bb7e_1 369 | - r-rprojroot=2.0.2=r41hc72bb7e_0 370 | - r-rstudioapi=0.13=r41hc72bb7e_0 371 | - r-scales=1.1.1=r41hc72bb7e_0 372 | - r-testthat=3.1.2=r41h03ef668_0 373 | - r-tibble=3.1.6=r41hcfec24a_0 374 | - r-utf8=1.2.2=r41hcfec24a_0 375 | - r-vctrs=0.3.8=r41hcfec24a_1 376 | - r-viridislite=0.4.0=r41hc72bb7e_0 377 | - r-waldo=0.3.1=r41hc72bb7e_0 378 | - r-withr=2.5.0=r41hc72bb7e_0 379 | - readline=8.1=h46c0cb4_0 380 | - seqkit=2.1.0=h9ee0642_0 381 | - setuptools=49.6.0=py36h5fab9bb_3 382 | - six=1.15.0=py_0 383 | - sqlite=3.37.0=h9cd32fc_0 384 | - stringtie=2.2.1=hecb563c_2 385 | - sysroot_linux-64=2.12=he073ed8_15 386 | - tidyp=1.04=hec16e2b_4 387 | - tk=8.6.12=h27826a3_0 388 | - tktable=2.10=hb7b940f_3 389 | - transdecoder=5.5.0=pl5262hdfd78af_4 390 | - wheel=0.37.1=pyhd8ed1ab_0 391 | - xorg-kbproto=1.0.7=h7f98852_1002 392 | - xorg-libice=1.0.10=h7f98852_0 393 | - xorg-libsm=1.2.3=hd9c2040_1000 394 | - xorg-libx11=1.7.2=h7f98852_0 395 | - xorg-libxau=1.0.9=h7f98852_0 396 | - xorg-libxdmcp=1.1.3=h7f98852_0 397 | - xorg-libxext=1.3.4=h7f98852_1 398 | - xorg-libxrender=0.9.10=h7f98852_1003 399 | - xorg-libxt=1.2.1=h7f98852_2 400 | - xorg-renderproto=0.11.1=h7f98852_1002 401 | - xorg-xextproto=7.3.0=h7f98852_1002 402 | - xorg-xproto=7.0.31=h7f98852_1007 403 | - xz=5.2.5=h516909a_1 404 | - zlib=1.2.11=h36c2ea0_1013 405 | - zstd=1.5.2=ha95c52a_0 406 | -------------------------------------------------------------------------------- /nextflow_scripts/genome-download.nf: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env nextflow 2 | 3 | /* 4 | * pipeline input parameters 5 | */ 6 | 7 | 8 | params.genome = 'galGal6' 9 | params.outdir = './' 10 | params.conda = '/home/wslab/test1_annotate/environment.yml' 11 | 12 | println """\ 13 | G E N O M E - D O W N L O A D P I P E L I N E 14 | =============================================== 15 | genome : ${params.genome} 16 | outdir : ${params.outdir} 17 | environment : ${params.conda} 18 | """ 19 | .stripIndent() 20 | 21 | process genome_download { 22 | echo true 23 | conda "${params.conda}" 24 | 25 | publishDir "${params.outdir}", mode: 'copy' 26 | 27 | output: 28 | file "${params.genome}*" 29 | 30 | shell: 31 | ''' 32 | #!/usr/bin/env bash 33 | 34 | genome="!{params.genome}" 35 | wget http://hgdownload.cse.ucsc.edu/goldenpath/${genome}/bigZips/${genome}.2bit 36 | wget http://hgdownload.soe.ucsc.edu/goldenPath/${genome}/database/refGene.txt.gz 37 | wget http://hgdownload.soe.ucsc.edu/goldenPath/${genome}/database/ncbiRefSeq.txt.gz 38 | if [ -f twoBitToFa ]; then 39 | echo "twoBitToFa script found. Continue:" 40 | echo "" 41 | : 42 | else 43 | echo "Downloading twoBitToFa script" 44 | wget http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/twoBitToFa 45 | fi 46 | chmod 755 twoBitToFa 47 | ./twoBitToFa ${genome}.2bit ${genome}.fa 48 | samtools faidx ${genome}.fa 49 | 50 | if [ -f genePredToGtf ]; then 51 | echo "genePredToGtf script found. Continue:" 52 | echo "" 53 | : 54 | else 55 | echo "Downloading genePredToGtf script" 56 | wget http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/genePredToGtf 57 | fi 58 | chmod 755 genePredToGtf 59 | gunzip refGene.txt.gz 60 | gunzip ncbiRefSeq.txt.gz 61 | cut -f 2- refGene.txt | ./genePredToGtf file stdin -source=${genome}_Ref ${genome}.gtf 62 | cut -f 2- ncbiRefSeq.txt | ./genePredToGtf file stdin -source=${genome}_Ref ${genome}_ncbiRefSeq.gtf 63 | echo "" 64 | echo "All done. 
${genome} FASTA and GTF files are located in the current working directory (or specified directory with --outdir)" 65 | ''' 66 | } 67 | 68 | -------------------------------------------------------------------------------- /nextflow_scripts/isoform-identification.nf: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env nextflow 2 | 3 | /* 4 | * pipeline input parameters 5 | */ 6 | 7 | params.NCBI_tmap = '/home/wslab/test1_annotate/nextflow_scripts/NCBI_compare.stringtie_chr33.gtf.tmap' 8 | params.NCBI_transcripts = '/home/wslab/test1_annotate/nextflow_scripts/NCBI_transcripts.fa' 9 | params.genome_name = 'galGal6' 10 | params.conda = '/home/wslab/test1_annotate/nextflow_scripts/environment.yml' 11 | params.outdir ='./' 12 | 13 | println """\ 14 | I S O F O R M - I D E N T I F I C A T I O N P I P E L I N E 15 | ============================================================= 16 | NCBI_tmap : ${params.NCBI_tmap} 17 | NCBI_transcripts : ${params.NCBI_transcripts} 18 | genome_name : ${params.genome_name} 19 | environment : ${params.conda} 20 | outdir : ${params.outdir} 21 | """ 22 | .stripIndent() 23 | 24 | 25 | process check_paths { 26 | echo true 27 | stageInMode 'copy' 28 | conda "${params.conda}" 29 | 30 | output: 31 | val '${m_DIR}' into records1 32 | val '${t_DIR}' into records2 33 | val '${c_DIR}' into records3 34 | val '${o_DIR}' into records4 35 | 36 | shell: 37 | ''' 38 | # mandatory arguments 39 | if [ ! "!{params.NCBI_tmap}" ] || [ ! "!{params.NCBI_transcripts}" ] || [ ! "!{params.genome_name}" ] || [ ! "!{params.conda}" ] || [ ! "!{params.outdir}" ]; then 40 | echo "" 41 | echo "arguments -m, -t, -g, -c and -o must be provided" 42 | echo "" 43 | echo "$usage" >&2; exit 1 44 | fi 45 | 46 | # Conditions : output folder 47 | if [ ! -d "!{params.outdir}" ]; then 48 | echo "" 49 | echo "Output directory: !{params.outdir} not found. Please create the output directory first, before running the pipeline." 50 | echo "" 51 | exit 9999 # die with error code 9999 52 | fi 53 | 54 | # Conditions : Input existance 55 | if [ ! -e "!{params.NCBI_tmap}" ]; then 56 | echo "" 57 | echo "!{params.NCBI_tmap} does not exist. Check your -m input" 58 | echo "" 59 | exit 9999 # die with error code 9999 60 | fi 61 | 62 | if [ ! -e "!{params.NCBI_transcripts}" ]; then 63 | echo "" 64 | echo "!{params.NCBI_transcripts} does not exist. Check your -t input" 65 | echo "" 66 | exit 9999 # die with error code 9999 67 | fi 68 | 69 | if [ ! -e "!{params.conda}" ]; then 70 | echo "" 71 | echo "!{params.conda} does not exist. 
Check your -c input" 72 | echo "" 73 | exit 9999 # die with error code 9999 74 | fi 75 | 76 | # Conditions : Getting absolute path of inputs 77 | echo "" 78 | m_DIR="$( cd "$( dirname "!{params.NCBI_tmap}" )" && pwd )" 79 | echo "" 80 | echo "::: The absolute path of -m is $m_DIR" 81 | echo "" 82 | t_DIR="$( cd "$( dirname "!{params.NCBI_transcripts}" )" && pwd )" 83 | echo "" 84 | echo "::: The absolute path of -t is $t_DIR" 85 | echo "" 86 | c_DIR="$( cd "$( dirname "!{params.conda}" )" && pwd )" 87 | echo "" 88 | echo "::: The absolute path of -c is $c_DIR" 89 | echo "" 90 | o_DIR="$( cd "$( dirname "!{params.outdir}" )" && pwd )" 91 | echo "" 92 | echo "::: The absolute path of -o is $o_DIR" 93 | echo "" 94 | ''' 95 | } 96 | 97 | 98 | process check_inputs { 99 | echo true 100 | stageInMode 'copy' 101 | conda "${params.conda}" 102 | 103 | output: 104 | val '${NCBI_tmap}' into records5 105 | val '${NCBI_transcripts}' into records6 106 | val '${genome_name}' into records7 107 | val '${anaconda_env}' into records8 108 | val '${outdir}' into records9 109 | file 'ncbiRefSeqLink.txt' into ncbiRefSeqLink 110 | 111 | shell: 112 | ''' 113 | printf "::: Defining Variables :::\n" 114 | echo"" 115 | FILE1="!{params.NCBI_tmap}" 116 | basename "$FILE1" 117 | NCBI_tmap="$(basename -- $FILE1)" 118 | echo "The NCBI tmap file used as input is the following: $NCBI_tmap" 119 | echo "" 120 | FILE2="!{params.NCBI_transcripts}" 121 | basename "$FILE2" 122 | NCBI_transcripts="$(basename -- $FILE2)" 123 | echo "The NCBI transcripts used as input is the following: $NCBI_transcripts" 124 | echo "" 125 | FILE3="!{params.conda}" 126 | basename "$FILE3" 127 | anaconda_env="$(basename -- $FILE3)" 128 | echo "The anaconda environment file is the following: $anaconda_env" 129 | echo "" 130 | FILE4="!{params.outdir}" 131 | basename "$FILE4" 132 | outdir="$(basename -- $FILE4)" 133 | echo "The outdir folder name is the following: $outdir" 134 | echo "" 135 | genome_name="!{params.genome_name}" 136 | 137 | if [ -f ncbiRefSeqLink.txt ]; then 138 | echo "::: ncbiRefSeqLink.txt file found. 
Continue:" 139 | echo "" 140 | : 141 | else 142 | echo "::: Downloading ncbiRefSeqLink.txt file" 143 | wget http://hgdownload.cse.ucsc.edu/goldenpath/${genome_name}/database/ncbiRefSeqLink.txt.gz 144 | gunzip ncbiRefSeqLink.txt.gz 145 | echo "" 146 | echo "Number of lines in ncbiRefSeqLink.txt:" 147 | cat ncbiRefSeqLink.txt | wc -l 148 | echo "Continue with python processing steps:" 149 | echo "" 150 | fi 151 | ''' 152 | } 153 | 154 | 155 | process python_inputs { 156 | echo true 157 | stageInMode 'copy' 158 | conda "${params.conda}" 159 | 160 | input: 161 | val '${m_DIR}' from records1 162 | val '${t_DIR}' from records2 163 | val '${NCBI_tmap}' from records5 164 | val '${NCBI_transcripts}' from records6 165 | 166 | output: 167 | file 'stringtie_for_script.tmap' into records10 168 | file 'transcripts_Isoform2.tab' into records11 169 | 170 | shell: 171 | ''' 172 | # Inputs for python 173 | cp "!{params.NCBI_tmap}" ./stringtie_for_script.tmap 174 | seqkit fx2tab "!{params.NCBI_transcripts}" > transcripts_Isoform.tab 175 | # Formatting transcripts_Isoform.tab if gene= is present in file 176 | sed -i 's/gene=/\t/'g transcripts_Isoform.tab 177 | awk '{print $1"\t"$NF}' transcripts_Isoform.tab > transcripts_Isoform2.tab 178 | ''' 179 | } 180 | 181 | 182 | process gffcompare_parser { 183 | echo true 184 | stageInMode 'copy' 185 | conda "${params.conda}" 186 | 187 | input: 188 | file 'ncbiRefSeqLink.txt' from ncbiRefSeqLink 189 | file 'stringtie_for_script.tmap' from records10 190 | file 'transcripts_Isoform2.tab' from records11 191 | 192 | output: 193 | file 'Ref_Transcript_Annotation.csv' into records12 194 | file 'Novel_Transcript_Annotation.csv' into records13 195 | 196 | shell: 197 | ''' 198 | python << END 199 | 200 | import sys 201 | import pandas as pd 202 | df = pd.read_csv('stringtie_for_script.tmap', sep = '\t') 203 | print(df.sample(10)) 204 | print("Total number of transcripts:", df.shape[0]) 205 | print("") 206 | df2 = df[~df.ref_id.astype(str).str.contains('-')] 207 | novel_transcripts = df[df.ref_id.astype(str).str.contains('-')] 208 | df3 = df2[["ref_gene_id", "ref_id", "class_code", "qry_gene_id", "qry_id", "num_exons", "FPKM", "TPM"]] 209 | df_novel_transcripts = novel_transcripts[["ref_gene_id", "ref_id", "class_code", "qry_gene_id", "qry_id", "num_exons", "FPKM", "TPM"]] 210 | print("Reference transcripts:") 211 | print(df3.sample(10)) 212 | print("") 213 | print("Novel transcripts:") 214 | print(df_novel_transcripts.sample(10)) 215 | print("") 216 | colnames=['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18'] 217 | dfA1 = pd.read_csv('ncbiRefSeqLink.txt', sep = '\t', low_memory=False, names=colnames, header=None) 218 | print(dfA1.head(10)) 219 | dfA2 = dfA1[['0', '1', '2', '3', '5', '14', '16']] 220 | dfA2 = dfA2.rename(columns={'0': 'ref_id', '1': 'Annotation Status', '2' : 'NCBI RefSeq Gene ID', '3' : 'Transcript Description', '5' : 'NCBI RefSeq Protein ID', '14' : 'Alternative Gene Name', '16' : 'RefSeq Transcript Info'}) 221 | print("ncbiRefSeqLink annotation:") 222 | print(dfA2.sample(10)) 223 | print("") 224 | colnames = ['qry_id', 'cds_seq', 'none'] 225 | cds = pd.read_csv('transcripts_Isoform2.tab', sep = '\t', names=colnames) 226 | cds2 = cds[["qry_id", "cds_seq"]] 227 | print("transcripts file:") 228 | print(cds2.sample(10)) 229 | print("") 230 | result1 = pd.merge(df3, dfA2, on='ref_id', how='inner') 231 | result1.sample(10) 232 | result2 = pd.merge(result1, cds2, on='qry_id', how='inner') 233 | 
result2.sample(10) 234 | result3 = pd.merge(df_novel_transcripts, cds2, on='qry_id', how='inner') 235 | result3.sample(10) 236 | print("Number of Joined Transcripts (reference):", result2.shape[0]) 237 | print("") 238 | print("Number of Joined Transcripts (novel):", result3.shape[0]) 239 | print("") 240 | result2.to_csv('Ref_Transcript_Annotation.csv', index=False) 241 | result3.to_csv('Novel_Transcript_Annotation.csv', index=False) 242 | print("::: Done. Ref_Transcript_Annotation.csv and Novel_Transcript_Annotation.csv were succesfully produced") 243 | print("") 244 | END 245 | ''' 246 | } 247 | 248 | 249 | process output_pipeline { 250 | echo true 251 | stageInMode 'copy' 252 | conda "${params.conda}" 253 | 254 | input: 255 | file 'Ref_Transcript_Annotation.csv' from records12 256 | file 'Novel_Transcript_Annotation.csv' from records13 257 | 258 | shell: 259 | ''' 260 | echo "" 261 | printf "::: Moving results to the output directory :::\n" 262 | mv Ref_Transcript_Annotation.csv "!{params.outdir}" 263 | mv Novel_Transcript_Annotation.csv "!{params.outdir}" 264 | echo "" 265 | echo "------------------------------------------------------------" 266 | echo "------------------------------------------------------------" 267 | echo "::: INFO: all done" ::: 268 | echo "" 269 | echo "The following files are available in the output directory : " 270 | echo "" 271 | echo "Ref_Transcript_Annotation.csv contains annotation and coordinates of known transcripts" 272 | echo "" 273 | echo "Novel_Transcript_Annotation.csv contains collection of novel transcripts" 274 | echo "" 275 | echo "------------------------------------------------------------" 276 | echo "------------------------------------------------------------" 277 | echo "" 278 | ''' 279 | } 280 | -------------------------------------------------------------------------------- /nextflow_scripts/old/add-ncbi-annotation.nf: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env nextflow 2 | 3 | /* 4 | * pipeline input parameters 5 | */ 6 | 7 | params.stringtie = '/home/wslab/test1_annotate/nextflow_scripts/stringtie_chr33.gtf' 8 | params.NCBI_annotation = '/home/wslab/test1_annotate/nextflow_scripts/galGal6_ncbiRefSeq.gtf' 9 | params.ref_annotation = '/home/wslab/test1_annotate/nextflow_scripts/galGal6.gtf' 10 | params.genome = '/home/wslab/test1_annotate/nextflow_scripts/galGal6.fa' 11 | params.config = '/home/wslab/test1_annotate/nextflow_scripts/gawn_config.sh' 12 | params.threads = '10' 13 | params.outdir = '/home/wslab/test1_annotate/nextflow_scripts/' 14 | params.conda = '/home/wslab/test1_annotate/nextflow_scripts/environment.yml' 15 | 16 | println """\ 17 | A D D - N C B I - A N N O T A T I O N P I P E L I N E 18 | ======================================================= 19 | stringtie : ${params.stringtie} 20 | NCBI_annotation : ${params.NCBI_annotation} 21 | ref_annotation : ${params.ref_annotation} 22 | genome : ${params.genome} 23 | config_file : ${params.config} 24 | threads : ${params.threads} 25 | outdir : ${params.outdir} 26 | environment : ${params.conda} 27 | """ 28 | .stripIndent() 29 | 30 | process add_ncbi_annotation { 31 | echo true 32 | stageInMode 'copy' 33 | conda "${params.conda}" 34 | 35 | shell: 36 | ''' 37 | add-ncbi-annotation -a "!{params.stringtie}" -n "!{params.NCBI_annotation}" -r "!{params.ref_annotation}" -g "!{params.genome}" -c "!{params.config}" -t "!{params.threads}" -o "!{params.outdir}" 38 | ''' 39 | } 40 | 
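Note on the legacy wrapper above: nextflow_scripts/old/add-ncbi-annotation.nf assumes the add-ncbi-annotation binary (compiled from bash_scripts/add_ncbi_annotation.sh by makefile.sh or makefile.nf) is already on the PATH, e.g. after sudo cp ./bin/* /usr/local/bin/. The hard-coded params.* defaults are machine-specific, so in practice they would be overridden on the command line. A minimal sketch, with hypothetical input paths:

    nextflow run nextflow_scripts/old/add-ncbi-annotation.nf \
        --stringtie /data/stringtie_chr33.gtf \
        --NCBI_annotation /data/galGal6_ncbiRefSeq.gtf \
        --ref_annotation /data/galGal6.gtf \
        --genome /data/galGal6.fa \
        --config ./gawn_config.sh \
        --threads 10 \
        --outdir ./results/ \
        --conda ./nextflow_scripts/environment.yml

Each --name value on the command line replaces the corresponding params.name default before the add_ncbi_annotation process runs.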
-------------------------------------------------------------------------------- /nextflow_scripts/old/annotate-my-genomes.nf: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env nextflow 2 | 3 | /* 4 | * pipeline input parameters 5 | */ 6 | 7 | params.stringtie = '/home/wslab/test1_annotate/nextflow_scripts/stringtie_chr33.gtf' 8 | params.NCBI_annotation = '/home/wslab/test1_annotate/nextflow_scripts/galGal6_ncbiRefSeq.gtf' 9 | params.ref_annotation = '/home/wslab/test1_annotate/nextflow_scripts/galGal6.gtf' 10 | params.genome = '/home/wslab/test1_annotate/nextflow_scripts/galGal6.fa' 11 | params.config = '/home/wslab/test1_annotate/nextflow_scripts/gawn_config.sh' 12 | params.threads = '10' 13 | params.outdir = '/home/wslab/test1_annotate/nextflow_scripts/' 14 | params.conda = '/home/wslab/test1_annotate/nextflow_scripts/environment.yml' 15 | 16 | println """\ 17 | A N N O T A T E - M Y - G E N O M E S P I P E L I N E 18 | ======================================================= 19 | stringtie : ${params.stringtie} 20 | ref_annotation : ${params.ref_annotation} 21 | genome : ${params.genome} 22 | config_file : ${params.config} 23 | threads : ${params.threads} 24 | outdir : ${params.outdir} 25 | environment : ${params.conda} 26 | """ 27 | .stripIndent() 28 | 29 | process annotate_my_genomes { 30 | echo true 31 | stageInMode 'copy' 32 | conda "${params.conda}" 33 | 34 | shell: 35 | ''' 36 | annotate-my-genomes -a "!{params.stringtie}" -r "!{params.ref_annotation}" -g "!{params.genome}" -c "!{params.config}" -t "!{params.threads}" -o "!{params.outdir}" 37 | ''' 38 | } 39 | 40 | -------------------------------------------------------------------------------- /nextflow_scripts/old/isoform-identification.nf: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env nextflow 2 | 3 | /* 4 | * pipeline input parameters 5 | */ 6 | 7 | params.NCBI_tmap = '/home/wslab/test1_annotate/nextflow_scripts/NCBI_compare.stringtie_chr33.gtf.tmap' 8 | params.NCBI_transcripts = '/home/wslab/test1_annotate/nextflow_scripts/NCBI_transcripts.fa' 9 | params.genome_name = 'galGal6' 10 | params.conda = '/home/wslab/test1_annotate/nextflow_scripts/environment.yml' 11 | params.outdir ='./' 12 | 13 | println """\ 14 | I S O F O R M - I D E N T I F I C A T I O N P I P E L I N E 15 | ============================================================= 16 | NCBI_tmap : ${params.NCBI_tmap} 17 | NCBI_transcripts : ${params.NCBI_transcripts} 18 | genome_name : ${params.genome_name} 19 | environment : ${params.conda} 20 | outdir : ${params.outdir} 21 | """ 22 | .stripIndent() 23 | 24 | process isoform_identification { 25 | echo true 26 | stageInMode 'copy' 27 | conda "${params.conda}" 28 | 29 | publishDir "${params.outdir}", mode: 'copy' 30 | 31 | shell: 32 | ''' 33 | isoform-identification -m "!{params.NCBI_tmap}" -t "!{params.NCBI_transcripts}" -g "!{params.genome_name}" 34 | cp *csv "!{params.outdir}" 35 | ''' 36 | } 37 | 38 | -------------------------------------------------------------------------------- /test/gawn_config.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Modify the following parameter values according to your experiment 4 | # Do not modify the parameter names or remove parameters 5 | # Do not add spaces around the equal (=) sign 6 | 7 | # Global parameters 8 | NCPUS=10 # Number of CPUs to use for analyses (int, 1+) 9 | 10 | # Genome indexing 11 | 
SKIP_GENOME_INDEXING=1 # 1 to skip genome indexing, 0 to index it 12 | 13 | # Genome annotation with transcriptome 14 | # NOTE: do not use compressed fasta files 15 | GENOME_NAME="genome.fasta" # Name of genome fasta file found in 03_data 16 | TRANSCRIPTOME_NAME="transcriptome.fasta" # Name of transcriptome fasta file found in 03_data 17 | 18 | # Path to swissprot database 19 | --------------------------------------------------------------------------------
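Note on test/gawn_config.sh above: the file ships without a SWISSPROT_DB entry; makefile.sh (and the make_and_install process in makefile.nf) append it after downloading and unpacking the NCBI swissprot BLAST database into ./swissprot/. A sketch of how the installed copy is expected to end, assuming a hypothetical checkout at /home/user/annotate_my_genomes:

    # Path to swissprot database
    SWISSPROT_DB="/home/user/annotate_my_genomes/swissprot/swissprot"
    #

If the database is later moved, this appended line should be the only part of gawn_config.sh that needs editing before re-running the annotation pipelines.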