├── .github └── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── _config.yml ├── assets └── bioinfo-notebook_logo.svg ├── data ├── design_table.csv ├── example_genome_annotation.gtf ├── example_nucleotide_sequence.fasta └── featCounts_S_cere_20200331.csv ├── docs ├── DE_analysis_edgeR_script.md ├── DE_analysis_edgeR_script.pdf ├── SPAdes.md ├── UniProt_downloader.md ├── annotated_snps_filter.md ├── annotating_snps.md ├── augustus.md ├── bcftools.md ├── blast.md ├── bowtie.md ├── bowtie2.md ├── cl_intro.md ├── cl_solutions.md ├── combining_featCount_tables.md ├── conda.md ├── fasterq-dump.md ├── fastq-dump.md ├── fastq-dump_to_featureCounts.md ├── featureCounts.md ├── file_formats.md ├── genome_annotation_SwissProt_CDS.md ├── htseq-count.md ├── linux_setup.md ├── orthofinder.md ├── part1.md ├── part2.md ├── part3.md ├── report_an_issue.md ├── samtools.md ├── sgRNAcas9.md ├── snp_calling.md ├── ubuntu_virtualbox.md └── wsl.md ├── envs ├── augustus.yml ├── bioinfo-notebook.txt ├── bioinfo-notebook.yml ├── orthofinder.yml └── sgRNAcas9.yml └── scripts ├── DE_analysis_edgeR_script.R ├── UniProt_downloader.sh ├── annotated_snps_filter.R ├── annotating_snps.R ├── combining_featCount_tables.py ├── fastq-dump_to_featureCounts.sh ├── genome_annotation_SwissProt_CDS.sh ├── linux_setup.sh └── snp_calling.sh /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 
22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Smartphone (please complete the following information):** 32 | - Device: [e.g. iPhone6] 33 | - OS: [e.g. iOS8.1] 34 | - Browser [e.g. stock browser, safari] 35 | - Version [e.g. 22] 36 | 37 | **Additional context** 38 | Add any other context about the problem here. 39 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 
21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | data/*.fastq 2 | data/bowtie2_example_index* 3 | data/*.sam 4 | data/*.bam 5 | data/*.bai 6 | data/*.txt 7 | data/*.summary 8 | data/*.gz 9 | data/*.bt2 10 | data/S_cere_GCF_000146045.2_R64_genomic.* 11 | data/*.tsv 12 | data/*.log 13 | temp/ 14 | results/ 15 | .temp/ 16 | results/* 17 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | python: 4 | # We don't actually use the Travis Python, but this keeps it organized. 5 | #- "2.7" 6 | #- "3.5" 7 | #- "3.6" 8 | - "3.7" 9 | 10 | install: 11 | - sudo apt-get update 12 | # We do this conditionally because it saves us some downloading if the 13 | # version is the same. 14 | - if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then 15 | wget https://repo.continuum.io/miniconda/Miniconda2-latest-Linux-x86_64.sh -O miniconda.sh; 16 | else 17 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; 18 | fi 19 | - bash miniconda.sh -b -p $HOME/miniconda 20 | - source "$HOME/miniconda/etc/profile.d/conda.sh" 21 | - hash -r 22 | - conda config --set always_yes yes --set changeps1 no 23 | - conda update -q conda 24 | # Useful for debugging any issues with conda 25 | - conda info -a 26 | 27 | # Creating conda environment using envs/bioinfo-notebook.txt 28 | - conda create --name bioinfo-notebook-explicit --file envs/bioinfo-notebook.txt 29 | - conda activate bioinfo-notebook-explicit 30 | - conda deactivate 31 | 32 | # Creating conda environment using envs/bioinfo-notebook.yml 33 | - conda env create --name bioinfo-notebook-yml --file envs/bioinfo-notebook.yml 34 | - conda activate bioinfo-notebook-yml 35 | - conda deactivate 36 | 37 | # Creating conda environment 
using envs/augustus.yml 38 | - conda env create --name augustus-yml --file envs/augustus.yml 39 | - conda activate augustus-yml 40 | - conda deactivate 41 | 42 | # Creating conda environment using envs/orthofinder.yml 43 | - conda env create --name orthofinder-yml --file envs/orthofinder.yml 44 | - conda activate orthofinder-yml 45 | - conda deactivate 46 | 47 | # Creating conda environment using envs/sgRNAcas9.yml 48 | - conda env create --name sgRNAcas9-yml --file envs/sgRNAcas9.yml 49 | - conda activate sgRNAcas9-yml 50 | - conda deactivate 51 | 52 | script: 53 | # Confirming that programs work in conda environments 54 | # bioinfo-notebook-explicit 55 | - conda activate bioinfo-notebook-explicit 56 | - bowtie2 --version 57 | - samtools --version 58 | - fastq-dump --version 59 | - conda deactivate 60 | 61 | # bioinfo-notebook-yml 62 | - conda activate bioinfo-notebook-yml 63 | - bowtie2 --version 64 | - samtools --version 65 | - fastq-dump --version 66 | - conda deactivate 67 | 68 | # augustus-yml 69 | - conda activate augustus-yml 70 | - augustus --help 71 | - conda deactivate 72 | 73 | # orthofinder-yml 74 | - conda activate orthofinder-yml 75 | - orthofinder --help 76 | - conda deactivate 77 | 78 | # sgRNAcas9-yml 79 | - conda activate sgRNAcas9-yml 80 | - java --version 81 | - conda deactivate 82 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Ronan Harrington 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the 
following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Home 4 | nav_order: 1 5 | description: "Quick start guides for bioinformatics programs, with video demonstrations and scripts." 6 | permalink: / 7 | --- 8 | 9 | 10 | # [Bioinformatics Notebook](https://github.com/rnnh/bioinfo-notebook.git) 11 | 12 | by [Ronan Harrington](https://github.com/rnnh) 13 | 14 | [![Build Status](https://travis-ci.com/rnnh/bioinfo-notebook.svg?branch=master)](https://travis-ci.com/rnnh/bioinfo-notebook) 15 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 16 | ![GitHub issues](https://img.shields.io/github/issues/rnnh/bioinfo-notebook) 17 | ![GitHub repo size](https://img.shields.io/github/repo-size/rnnh/bioinfo-notebook) 18 | ![Website](https://img.shields.io/website?url=https%3A%2F%2Frnnh.github.io%2Fbioinfo-notebook) 19 | [![DOI](https://zenodo.org/badge/243280413.svg)](https://zenodo.org/badge/latestdoi/243280413) 20 | 21 | This project provides introductions to various bioinformatics tools with short guides, video demonstrations, and scripts that tie these tools together. 
22 | The documents in this project can be read locally in a plain-text editor, or viewed online at <https://rnnh.github.io/bioinfo-notebook/>. 23 | If you are not familiar with using programs from the command line, begin with the page "[Introduction to the command line](docs/cl_intro.md)". 24 | If you have any questions, or spot any mistakes, [please submit an issue on GitHub](https://github.com/rnnh/bioinfo-notebook/issues). 25 | 26 | - [Pipeline examples](#pipeline-examples) 27 | - [Contents](#contents) 28 | - [Installation instructions](#installation-instructions) 29 | - [Repository structure](#repository-structure) 30 | 31 | ## Pipeline examples 32 | 33 | These bioinformatics pipelines can be carried out using scripts and tools described in this project. 34 | Input files for some of these scripts can be specified in the command line; other scripts will need to be altered to fit the given input data. 35 | 36 | ### SNP analysis 37 | 38 | - [FASTQ](docs/file_formats.md#fastq) reads from whole genome sequencing (WGS) can be assembled using [SPAdes](docs/SPAdes.md). 39 | - Sequencing reads can be aligned to this assembled genome using [bowtie2](docs/bowtie2.md). 40 | - The script [snp_calling.sh](docs/snp_calling.md) aligns sequencing reads to an assembled genome and detects single nucleotide polymorphisms (SNPs). This will produce a [Variant Call Format (VCF) file](docs/file_formats.md#vcf). 41 | - The proteins in the assembled reference genome- the genome to which the reads are aligned- can be annotated using [genome_annotation_SwissProt_CDS.sh](docs/genome_annotation_SwissProt_CDS.md). 42 | - The genome annotation [GFF](docs/file_formats.md#gff) file can be cross-referenced with the VCF file using [annotating_snps.R](docs/annotating_snps.md). This will produce an [annotated SNP format](docs/annotating_snps.md#annotated-snp-format) file. 43 | - Annotated SNP format files can be cross-referenced using [annotated_snps_filter.R](docs/annotated_snps_filter.md). 
For two annotated SNP files, this script will produce a file with annotated SNPs unique to the first file, and a file with annotated SNPs unique to the second file. 44 | 45 | ### RNA-seq analysis 46 | 47 | - [fastq-dump_to_featureCounts.sh](docs/fastq-dump_to_featureCounts.md) can be used to download RNA-seq reads from NCBI's Sequence Read Archive (SRA) and align them to a reference genome. This script uses [fastq-dump](docs/fastq-dump.md) or [fasterq-dump](docs/fasterq-dump.md) to download the sequencing reads as [FASTQ](docs/file_formats.md#fastq), and [featureCounts](docs/featureCounts.md) to align them to a reference [FASTA nucleotide file.](docs/file_formats.md#fasta) 48 | - Running [fastq-dump_to_featureCounts.sh](docs/fastq-dump_to_featureCounts.md) will produce feature count tables. These feature count tables can be combined using [combining_featCount_tables.py](docs/combining_featCount_tables.md). 49 | - These combined feature count tables can be used for differential expression (DE) analysis. An example DE analysis script is included in this project: [DE_analysis_edgeR_script.R](docs/DE_analysis_edgeR_script.md). This script uses the [R programming language](https://cran.r-project.org/) with the [edgeR](https://bioconductor.org/packages/release/bioc/html/edgeR.html) library. 50 | 51 | ### Detecting orthologs between genomes 52 | 53 | - [Augustus](docs/augustus.md) can be used to predict genes from [FASTA nucleotide files](docs/file_formats.md#fasta). 54 | - Once the FASTA amino acid sequences have been [extracted from the Augustus annotations](docs/augustus.md#extracting-the-fasta-amino-acid-sequences-of-predicted-genes-from-an-augustus-annotation), you can search for orthologs using [OrthoFinder](docs/orthofinder.md). 55 | - To find a specific gene of interest, search the amino acid sequences of the predicted genes using [BLAST](docs/blast.md). 56 | 57 | ## Contents 58 | 59 | ### [1. 
General guides](docs/part1.md) 60 | 61 | - [Introduction to the command line](docs/cl_intro.md) 62 | - [Windows Subsystem for Linux](docs/wsl.md) 63 | - [Using Ubuntu through a Virtual Machine](docs/ubuntu_virtualbox.md) 64 | - [File formats used in bioinformatics](docs/file_formats.md) 65 | 66 | ### [2. Program guides](docs/part2.md) 67 | 68 | - [Augustus](docs/augustus.md) 69 | - [Bcftools](docs/bcftools.md) 70 | - [BLAST](docs/blast.md) 71 | - [Bowtie](docs/bowtie.md) 72 | - [Bowtie2](docs/bowtie2.md) 73 | - [Conda](docs/conda.md) 74 | - [Fasterq-dump](docs/fasterq-dump.md) 75 | - [Fastq-dump](docs/fastq-dump.md) 76 | - [FeatureCounts](docs/featureCounts.md) 77 | - [Htseq-count](docs/htseq-count.md) 78 | - [OrthoFinder](docs/orthofinder.md) 79 | - [SAMtools](docs/samtools.md) 80 | - [sgRNAcas9](docs/sgRNAcas9.md) 81 | - [SPAdes](docs/SPAdes.md) 82 | 83 | ### [3. Scripts](docs/part3.md) 84 | 85 | - [Annotated SNPs filter](docs/annotated_snps_filter.md) 86 | - [Annotating SNPs](docs/annotating_snps.md) 87 | - [Combining featCount tables.py](docs/combining_featCount_tables.md) 88 | - [DE_analysis_edgeR_script.R](docs/DE_analysis_edgeR_script.md) 89 | - [Fastq-dump to featureCounts](docs/fastq-dump_to_featureCounts.md) 90 | - [Genome annotation script](docs/genome_annotation_SwissProt_CDS.md) 91 | - [Linux setup script](docs/linux_setup.md) 92 | - [SNP calling script](docs/snp_calling.md) 93 | - [UniProt downloader](docs/UniProt_downloader.md) 94 | 95 | ## Installation instructions 96 | 97 | After following these instructions, there will be a copy of the [bioinfo-notebook GitHub repo](https://www.github.com/rnnh/bioinfo-notebook/) on your system in the `~/bioinfo-notebook/` directory. 98 | This means there will be a copy of all the documents and scripts in this project on your computer. 
99 | If you are using Linux and run the [Linux setup script](docs/linux_setup.sh), the `bioinfo-notebook` virtual environment- which includes the majority of the command line programs covered in this project- will also be installed using [conda](docs/conda.md). 100 | 101 | **1.** This project is written to be used through a UNIX (Linux or Mac with macOS Mojave or later) operating system. 102 | If you are using a Windows operating system, begin with these pages on setting up Ubuntu (a Linux operating system): 103 | 104 | - [Windows Subsystem for Linux](docs/wsl.md) 105 | - [Using Ubuntu through a Virtual Machine](docs/ubuntu_virtualbox.md) 106 | 107 | Once you have an Ubuntu system set up, run the following command to update the lists of available software: 108 | 109 | ```bash 110 | $ sudo apt-get update # Updates lists of software that can be installed 111 | ``` 112 | 113 | **2.** Run the following command in your home directory (`~`) to download this project: 114 | 115 | ```bash 116 | $ git clone https://github.com/rnnh/bioinfo-notebook.git 117 | ``` 118 | 119 | **3.** If you are using Linux, run the [Linux setup script](docs/linux_setup.md) with this command after downloading the project: 120 | 121 | ```bash 122 | $ bash ~/bioinfo-notebook/scripts/linux_setup.sh 123 | ``` 124 | 125 | ### Video demonstration of installation 126 | 127 | [![asciicast](https://asciinema.org/a/314853.svg)](https://asciinema.org/a/314853?autoplay=1) 128 | 129 | ## Repository structure 130 | 131 | ``` 132 | bioinfo-notebook/ 133 | ├── assets/ 134 | │   └── bioinfo-notebook_logo.svg 135 | ├── data/ 136 | │   ├── blastx_SwissProt_example_nucleotide_sequence.fasta.tsv 137 | │   ├── blastx_SwissProt_S_cere.tsv 138 | │   ├── design_table.csv 139 | │   ├── example_genome_annotation.gtf 140 | │   ├── example_nucleotide_sequence.fasta 141 | │   └── featCounts_S_cere_20200331.csv 142 | ├── docs/ 143 | │   ├── annotated_snps_filter.md 144 | │   ├── annotating_snps.md 145 | │   ├── augustus.md 146 
| │   ├── blast.md 147 | │   ├── bowtie2.md 148 | │   ├── bowtie.md 149 | │   ├── cl_intro.md 150 | │   ├── cl_solutions.md 151 | │   ├── combining_featCount_tables.md 152 | │   ├── conda.md 153 | │   ├── DE_analysis_edgeR_script.md 154 | │   ├── DE_analysis_edgeR_script.pdf 155 | │   ├── fasterq-dump.md 156 | │   ├── fastq-dump.md 157 | │   ├── fastq-dump_to_featureCounts.md 158 | │   ├── featureCounts.md 159 | │   ├── file_formats.md 160 | │   ├── genome_annotation_SwissProt_CDS.md 161 | │   ├── htseq-count.md 162 | │   ├── linux_setup.md 163 | │   ├── orthofinder.md 164 | │   ├── part1.md # Navigation page for website 165 | │   ├── part2.md # Navigation page for website 166 | │   ├── part3.md # Navigation page for website 167 | │   ├── report_an_issue.md 168 | │   ├── samtools.md 169 | │   ├── sgRNAcas9.md 170 | │   ├── snp_calling.md 171 | │   ├── SPAdes.md 172 | │   ├── ubuntu_virtualbox.md 173 | │   ├── UniProt_downloader.md 174 | │   └── wsl.md 175 | ├── envs/ # conda environment files 176 | │   ├── augustus.yml # environment for Augustus 177 | │   ├── bioinfo-notebook.txt 178 | │   ├── bioinfo-notebook.yml 179 | │   ├── orthofinder.yml # environment for OrthoFinder 180 | │   └── sgRNAcas9.yml # environment for sgRNAcas9 181 | ├── scripts/ 182 | │   ├── annotated_snps_filter.R 183 | │   ├── annotating_snps.R 184 | │   ├── combining_featCount_tables.py 185 | │   ├── DE_analysis_edgeR_script.R 186 | │   ├── fastq-dump_to_featureCounts.sh 187 | │   ├── genome_annotation_SwissProt_CDS.sh 188 | │   ├── linux_setup.sh 189 | │   ├── snp_calling.sh 190 | │   └── UniProt_downloader.sh 191 | ├── _config.yml # Configures github.io project website 192 | ├── .gitignore 193 | ├── LICENSE 194 | ├── README.md 195 | └── .travis.yml # Configures Travis CI testing for GitHub repo 196 | ``` 197 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | 
remote_theme: pmarsceill/just-the-docs 2 | baseurl: "/bioinfo-notebook" # the subpath of your site, e.g. /blog 3 | url: "https://rnnh.github.io" # the base hostname & protocol for your site, e.g. http://example.com 4 | title: Bioinformatics Notebook 5 | logo: assets/bioinfo-notebook_logo.svg 6 | search_enabled: true 7 | search_tokenizer_separator: /[\s/]+/ 8 | aux_links: 9 | "Bioinformatics Notebook on GitHub": 10 | - "//github.com/rnnh/bioinfo-notebook" 11 | heading_anchors: true 12 | -------------------------------------------------------------------------------- /assets/bioinfo-notebook_logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 19 | 21 | 40 | 42 | 43 | 45 | image/svg+xml 46 | 48 | 49 | 50 | 51 | 52 | 57 | 63 | 67 | 71 | 75 | 79 | 83 | 87 | 91 | 95 | 99 | 103 | 107 | 111 | 115 | 119 | 123 | 127 | 131 | 135 | 139 | 143 | 147 | 151 | 152 | 153 | 154 | -------------------------------------------------------------------------------- /data/design_table.csv: -------------------------------------------------------------------------------- 1 | run,name,condition 2 | SRR8933532,SCEhightemp3,high_temp 3 | SRR8933534,SCEhightemp1,high_temp 4 | SRR8933509,SCEkcl3,osmotic_pressure 5 | SRR8933530,SCElowPH2,low_pH 6 | SRR8933511,SCEanaer2,anaerobic 7 | SRR8933533,SCEhightemp2,high_temp 8 | SRR8933537,SCEstan1,standard 9 | SRR8933506,SCEanaer3,anaerobic 10 | SRR8933531,SCElowPH1,low_pH 11 | SRR8933538,SCEkcl1,osmotic_pressure 12 | SRR8933512,SCEanaer1,anaerobic 13 | SRR8933510,SCEkcl2,osmotic_pressure 14 | SRR8933535,SCEstan3,standard 15 | SRR8933536,SCEstan2,standard 16 | SRR8933539,SCElowPH3,low_pH 17 | -------------------------------------------------------------------------------- /docs/DE_analysis_edgeR_script.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/rnnh/bioinfo-notebook/aa1c8f5318d40c4105a50108ea1a6102433be8a0/docs/DE_analysis_edgeR_script.pdf -------------------------------------------------------------------------------- /docs/SPAdes.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: SPAdes 4 | parent: 2. Program guides 5 | --- 6 | 7 | # SPAdes 8 | 9 | SPAdes [is an assembly toolkit containing various assembly pipelines](https://github.com/ablab/spades/blob/spades_3.14.1/README.md). 10 | 11 | ## Assembling a genome from Illumina paired-end reads using `SPAdes` 12 | 13 | `SPAdes` can be used to assemble paired-end reads as follows: 14 | 15 | ```bash 16 | $ spades -1 reads_1.fq.gz -2 reads_2.fq.gz -t 5 -m 200 -o results/directory/ 17 | ``` 18 | 19 | In this command... 20 | 21 | 1. **`-1`** is the file with forward reads. 22 | 2. **`-2`** is the file with reverse reads. 23 | 3. **`-t`** or **`--threads`** sets the number of processors/threads to use. The default is 16. 24 | 4. **`-m`** or **`--memory`** is memory the limit in Gb. SPAdes terminates if it reaches this limit. The default value is 250Gb. 25 | 5. **`-o`** or **`--outdir`** is the output directory to use. The default is the current directory. 26 | 27 | SPAdes supports uncompressed (**`.fastq`** or **`.fq`**) or compressed (**`.fastq.gz`** or **`.fq.gz`**) sequencing read inputs. 28 | In the output directory, the assembled genome will be available as contigs (**`contigs.fasta`**) and scaffolds (**`scaffolds.fasta`**), both of which are FASTA nucleotide files. 
29 | 30 | ## See also 31 | 32 | - [conda](conda.md): The `bioinfo-notebook` conda environment includes SPAdes 33 | - [File formats used in bioinformatics](file_formats.md) 34 | 35 | ## Further reading 36 | 37 | - [SPAdes on GitHub](https://github.com/ablab/spades/) 38 | -------------------------------------------------------------------------------- /docs/UniProt_downloader.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: UniProt downloader 4 | parent: 3. Scripts 5 | --- 6 | 7 | # UniProt downloader 8 | 9 | [UniProt_downloader.sh](../scripts/UniProt_downloader.sh) is a `bash` shell script for downloading [UniProt](https://www.uniprot.org/) protein sequences to a FASTA amino acid ([.faa](file_formats.md)) file. 10 | It takes a list of UniProt accession numbers as input, and then pipes each one into a `curl` command to download the corresponding protein. 11 | This is essentially a [one-line program](https://en.wikipedia.org/wiki/One-liner_program) wrapped in a shell script to make downloading UniProt sequences easier. 12 | 13 | ## Usage 14 | 15 | ``` 16 | UniProt_downloader.sh [-h|--help] [-p|--processors n -o|--output] -i|--input 17 | 18 | This script takes a list of UniProt primary accession numbers (*.list), and 19 | downloads the corresponding protein sequences from UniProt as a FASTA amino 20 | acid (.faa) file. 21 | 22 | This list can be generated by searching UniProtKB for a desired term (e.g. 23 | 'taxonomy:147537' for the Saccharomycotina subphylum), selecting 'Download' 24 | and 'Format: List' to download the accession numbers of the corresponding 25 | results. 
26 | 27 | arguments: 28 | -h | --help show this help text and exit 29 | -i | --input the list of UniProt proteins to download 30 | -p | --processors optional: set the number (n) of processors to 31 | use (default: 1) 32 | -o | --output optional: name of the output .faa file 33 | (default: uniprot_{date}.faa) 34 | ``` 35 | 36 | ## See also 37 | 38 | - [UniProt_downloader.sh on GitHub](https://github.com/rnnh/bioinfo-notebook/blob/master/scripts/UniProt_downloader.sh) 39 | - [File formats used in bioinformatics](file_formats.md) 40 | - [UniProt](https://www.uniprot.org/) 41 | -------------------------------------------------------------------------------- /docs/annotated_snps_filter.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Annotated SNPs filter 4 | parent: 3. Scripts 5 | --- 6 | 7 | # Annotated SNPs filter 8 | 9 | [annotated_snps_filter.R](../scripts/annotated_snps_filter.R) is an `R` script cross-references annotated SNP files created using [annotating_snps.R](annotating_snps.md). 10 | It takes two files created using this script, and returns unique SNPs for each file. 11 | If a SNP in File 1 is not found at the same position on the same sequence as File 2, it is returned as a unique SNP, and vice versa. 12 | These unique SNPs are then written to new `.tsv` files. 13 | 14 | ## Usage 15 | 16 | To use this script, variables need to be defined on lines 21 and 22 of the script: 17 | 18 | - Assign the name of the first annotated SNP file to be filtered to 'annotated_SNP_file_1'. 19 | - Assign the name of the second annotated SNP file to be filtered to 'annotated_SNP_file_2'. 20 | - These files should be in the `~/bioinfo-notebook/data/` directory. 21 | - Optional: the name of the output files can be assigned on lines 109 and 115 respectively. 
22 | 23 | ## See also 24 | 25 | - [annotated_snps_filter.R on GitHub](https://github.com/rnnh/bioinfo-notebook/blob/master/scripts/annotated_snps_filter.R) 26 | - [annotating_snps.R](annotating_snps.md) 27 | - [File formats used in bioinformatics](file_formats.md) 28 | - [snp_calling.sh](snp_calling.md), a script for generating VCF files of SNPs. 29 | - [genome_annotation_SwissProt_CDS.sh](genome_annotation_SwissProt_CDS.md), a script for generating genome annotation GFF files. 30 | -------------------------------------------------------------------------------- /docs/annotating_snps.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Annotating SNPs 4 | parent: 3. Scripts 5 | --- 6 | 7 | # Annotating SNPs 8 | 9 | [annotating_snps.R](../scripts/annotating_snps.R) is an `R` script that cross-references annotations of genome assemblies with VCF files containing SNPs of sequencing reads aligned against 10 | those genome assemblies. 11 | If a SNP falls within- or upstream of- an annotated genome feature (start codon, stop codon, CDS, etc.), the script will return that feature along with the SNP. 12 | For this script to work, these files need to use the same sequence names: e.g. if the first sequence in the VCF is called "chrI", there should be a corresponding sequence called "chrI" in the GFF file. 13 | 14 | ## Usage 15 | 16 | To use this script, variables need to be defined on lines 28 to 32 of the script: 17 | 18 | - The GFF file name should be assigned to the variable `GFF_file`. 19 | - The VCF file name should be assigned to the variable `VCF_file`. 20 | - The VCF and GFF files should be in the directory `~/bioinfo-notebook/data/`. 21 | - The number of lines in the VCF file header should be specified in the `VCF_header.int` variable. This is the number of lines that begin with `#` in the VCF file. 
22 | - The variable `upstream.int` is used to determine how far upstream from an annotated feature a SNP can be. This can be set to 0 if you do not want upstream SNPs to be considered. Setting it to 1000 will mean that SNPs up to 1,000 bases/1kb upstream from a feature will be annotated. 23 | - The variable 'output_name' is used to specify the name of the output file, which should end in '.tsv' as it will be a tab-separated values text file. 24 | 25 | ## Annotated SNP format 26 | 27 | The `.tsv` files created by this script have a combination of columns from the [GFF and VCF formats](file_formats.md) as follows... 28 | 29 | 1. `sequence` The name of the sequence where the feature is located. 30 | 2. `source` Keyword identifying the source of the feature, like a program (e.g. Augustus) or an organization (e.g. [SGD](https://www.yeastgenome.org/)). 31 | 3. `feature` The feature type name, like `gene` or `exon`. In a well-structured GFF file, all the children features always follow their parents in a single block (so all exons of a transcript are put after their parent `transcript` feature line and before any other parent transcript line). 32 | 4. `start` Genomic start of the feature, with a 1-base offset. 33 | 5. `end` Genomic end of the feature, with a 1-base offset. 34 | 6. `score` Numeric value that generally indicates the confidence of the source in the annotated feature. A value of `.` (a dot) is used to define a null value. 35 | 7. `strand` Single character that indicates the strand of the feature; it can assume the values of `+` (positive, or `5'->3'`), `-`, (negative, or `3'->5'`), `.` (undetermined). 36 | 8. `phase` Phase of coding sequence (CDS) features, indicating where the feature starts in relation to the reading frame. It can be either one of `0`, `1`, `2` (for CDS features) or `.` (for everything else). 37 | 9. `attributes` All the other information pertaining to this feature. 
The format, structure and content of this field is the one which varies the most between GFF formats. 38 | 10. `POS` The 1-based position of the variation on the given sequence. 39 | 11. `REF` The reference base (or bases in the case of an indel) at the given position on the given reference sequence. 40 | 12. `ALT` The list of alternative alleles at this position. 41 | 13. `QUAL` A quality score associated with the inference of the given alleles. 42 | 14. `FILTER` A flag indicating which of a given set of filters the variation has passed. 43 | 15. `INFO` An extensible list of key-value pairs (fields) describing the variation. Multiple fields are separated by semicolons with optional values in the format: =[,data]. 44 | 16. `SAMPLE` For each (optional) sample described in the file, values are given for the fields listed in FORMAT. If multiple samples have been aligned to the reference sequence, each sample will have its own column. 45 | 46 | 47 | ## See also 48 | 49 | - [annotating_snps.R on GitHub](https://github.com/rnnh/bioinfo-notebook/blob/master/scripts/annotating_snps.R) 50 | - [annotated_snps_filter.R](annotated_snps_filter.md) 51 | - [File formats used in bioinformatics](file_formats.md) 52 | - [snp_calling.sh](snp_calling.md), a script for generating VCF files of SNPs. 53 | - [genome_annotation_SwissProt_CDS.sh](genome_annotation_SwissProt_CDS.md), a script for generating genome annotation GFF files. 54 | -------------------------------------------------------------------------------- /docs/augustus.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Augustus 4 | parent: 2. Program guides 5 | --- 6 | 7 | # Augustus 8 | 9 | Augustus is a program that predicts genes in eukaryotic genomic sequences. 
10 | It can be run online, with a server for [smaller files](http://bioinf.uni-greifswald.de/augustus/submission.php) and one for [larger files](http://bioinf.uni-greifswald.de/webaugustus/), or locally. 11 | The local version of Augustus can be installed through [conda](conda.md). 12 | This project includes an example [augustus conda environment](../envs/augustus.yml). 13 | 14 | ## Predicting genes in a eukaryotic FASTA nucleic acid file using `augustus` 15 | 16 | `augustus` can be used to predict genes as follows: 17 | 18 | ```bash 19 | $ augustus --species=species_name input_file.fna > output_file.gff 20 | ``` 21 | 22 | In this command... 23 | 24 | 1. `--species` is used to specify the target species for gene predictions (`species_name`). 25 | 2. `input_file.fna` is the input FASTA nucleic acid file ([.fna](file_formats.md#fasta)). 26 | 3. `output_file.gff` is the general feature format ([GFF](file_formats.md#generic-feature-formats)) genome annotation output file. 27 | Lines beginning with `#` are Augustus comments: these lines do not follow the GFF structure. 28 | 29 | The following command gives the list of valid species names for use with Augustus: 30 | 31 | ```bash 32 | $ augustus --species=help 33 | ``` 34 | 35 | ## Extracting the FASTA amino acid sequences of predicted genes from an Augustus annotation 36 | 37 | The genome annotation file produced by `augustus` (`output_file.gff`) contains the amino acid sequences of predicted genes in comment lines. 38 | These amino acid sequences can be extracted to a FASTA file with the following command: 39 | 40 | ```bash 41 | $ getAnnoFasta.pl output_file.gff 42 | ``` 43 | 44 | The amino acid sequences will be written to `output_file.aa`. 45 | This is a FASTA amino acid ([.faa](file_formats.md#fasta)). 
46 | The extension of this file can be changed from ".aa" to ".faa" with the following command: 47 | 48 | ```bash 49 | $ mv output_file.aa output_file.faa 50 | ``` 51 | 52 | ## Removing comments from Augustus annotations 53 | 54 | Genome annotations produced by Augustus follow the [Generic Feature Format](file_formats.md#generic-feature-formats), with the addition of comment lines for amino acid sequences. 55 | These are the same FASTA amino acid sequences that are extracted using `getAnnoFasta.pl`. 56 | These lines begin with the character `#`, and removing them results a standard GFF file. 57 | 58 | Here is one method for removing these amino acid lines, using `grep -v` to select lines which do not contain the `#` character: 59 | 60 | ```bash 61 | $ grep -v "#" augustus_annotation.gff > clean_augustus_annotation.gff 62 | ``` 63 | 64 | ## Demonstration 65 | 66 | In this video, `augustus` is used to predict genes in `example_nucleotide_sequence.fasta`. 67 | This results in a genome annotation file: `augustus_example.gff`. 68 | The script `getAnnoFasta.pl` is used to extract the amino acid sequences in this genome annotation file to a new FASTA amino acid file: `augustus_example.aa`. 69 | The `mv` command is used to change the extension of this file from ".aa" to ".faa". 70 | 71 | [![asciicast](https://asciinema.org/a/346541.svg)](https://asciinema.org/a/346541?autoplay=1) 72 | 73 | ## See also 74 | 75 | - [conda](conda.md) 76 | - [augustus conda environment](../envs/augustus.yml) 77 | - [File formats used in bioinformatics](file_formats.md) 78 | 79 | ## References 80 | 81 | - [The Augustus website](http://bioinf.uni-greifswald.de/augustus/) 82 | - [GNU grep](https://www.gnu.org/software/grep/manual/grep.html) 83 | -------------------------------------------------------------------------------- /docs/bcftools.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Bcftools 4 | parent: 2. 
Program guides 5 | --- 6 | 7 | # Bcftools 8 | 9 | Bcftools are a set of [utilities for variant calling and manipulating VCFs and BCFs](https://samtools.github.io/bcftools/bcftools.html). 10 | 11 | ## Generating genotype likelihoods for alignment files using `bcftools mpileup` 12 | 13 | `bcftools mpileup` can be used to generate VCF or BCF files containing genotype likelihoods for one or multiple alignment (BAM or CRAM) files as follows: 14 | 15 | ```bash 16 | $ bcftools mpileup --max-depth 10000 --threads n -f reference.fasta -o genotype_likelihoods.bcf reference_sequence_alignment.bam 17 | ``` 18 | 19 | In this command... 20 | 21 | 1. **`--max-depth`** or **`-d`** sets the reads per input file for each position in the alignment. In this case, it is set to 10000. 22 | 2. **`--threads`** sets the number (*n*) of processors/threads to use. 23 | 3. **`--fasta-ref`** or **`-f`** is used to select the [faidx-indexed FASTA](samtools.md#indexing-a-fasta-file-using-samtools-faidx) nucleotide reference file (*reference.fasta*) used for the alignment. 24 | 4. **`--output`** or **`-o`** is used to name the output file (*genotype_likelihoods.bcf*). 25 | 5. The final argument given is the input BAM alignment file (*reference_sequence_alignment.bam*). Multiple input files can be given here. 26 | 27 | ## Variant calling using `bcftools call` 28 | 29 | `bcftools call` can be used to call SNP/indel variants from a BCF file as follows: 30 | 31 | ```bash 32 | $ bcftools call -O b --threads n -vc --ploidy 1 -p 0.05 -o variants_unfiltered.bcf genotype_likelihoods.bcf 33 | ``` 34 | 35 | In this command... 36 | 37 | 1. **`--output-type`** or **`-O`** is used to select the output format. In this case, *b* for BCF. 38 | 2. **`--threads`** sets the number (*n*) of processors/threads to use. 39 | 3. **`-vc`** specifies that we want the output to contain variants only, using the original [SAMtools](samtools.md) consensus caller. 40 | 4. **`--ploidy`** specifies the ploidy of the assembly. 
41 | 5. **`--pval-threshold`** or **`-p`** is used to set the p-value threshold for variant sites (*0.05*). 42 | 6. **`--output`** or **`-o`** is used to name the output file (*variants_unfiltered.bcf*). 43 | 7. The final argument is the input BCF file (*genotype_likelihoods.bcf*). 44 | 45 | ## Filtering variants using `bcftools filter` 46 | 47 | `bcftools filter` can be used to filter variants from a BCF file as follows... 48 | 49 | ```bash 50 | $ bcftools filter --threads n -i '%QUAL>=20' -O v -o variants_filtered.vcf variants_unfiltered.bcf 51 | ``` 52 | 53 | In this command... 54 | 55 | 1. **`--threads`** sets the number (*n*) of processors/threads to use. 56 | 2. **`--include`** or **`-i`** is used to define the expression used to filter sites. In this case, *`%QUAL>=20`* results in sites with a quality score greater than or equal to 20. 57 | 3. **`--output-type`** or **`-O`** is used to select the output format. In this case, *v* for VCF. 58 | 4. **`--output`** or **`-o`** is used to name the output file (*variants_filtered.vcf*). 59 | 5. The final argument is the input BCF file (*variants_unfiltered.bcf*). 60 | 61 | ## See also 62 | 63 | - [File formats used in bioinformatics](file_formats.md) 64 | - [SNP calling script](snp_calling.md) 65 | 66 | ## Further reading 67 | 68 | - [bcftools documentation](https://samtools.github.io/bcftools/bcftools.html) 69 | -------------------------------------------------------------------------------- /docs/blast.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: BLAST 4 | parent: 2. Program guides 5 | --- 6 | 7 | # BLAST 8 | 9 | The Basic Local Alignment Search Tool (BLAST) is an algorithm and program for comparing primary biological sequence information, such as the amino-acid sequences of proteins or the nucleotides of DNA and/or RNA sequences. 
10 | BLAST is one of the most widely used tools in bioinformatics; it can be applied to different problems or projects in a myriad ways. 11 | 12 | ## Contents 13 | 14 | - [How BLAST works](#how-blast-works) 15 | - [The command line version of BLAST](#the-command-line-version-of-blast) 16 | - [Types of BLAST search](#types-of-blast-search) 17 | - [E-value and Bit-score](#e-value-and-bit-score) 18 | - [Creating a BLAST database using `makeblastdb`](#creating-a-blast-database-using-makeblastdb) 19 | - [Creating local BLAST database from Swiss-Prot](#downloading-swiss-prot-fasta-sequences-and-creating-a-blast-protein-database) 20 | - [Searching against a BLAST nucleotide database using `blastn`](#searching-against-a-blast-nucleotide-database-using-blastn) 21 | - [BLAST `-outfmt 6` results](#blast--outfmt-6-results) 22 | - [Video demonstration](#video-demonstration) 23 | - [See also](#see-also) 24 | - [References](#references) 25 | 26 | ## How BLAST works 27 | 28 | There are two main steps in BLAST: 29 | 30 | 1. A list of "words" (sets of characters/residues) of length *k* is created for the query sequence. By default, *k* = 3 for amino acid sequences, and *k* = 11 for nucleotide sequences. 31 | 2. An alignment is made for database (subject) sequences that share many words with the query sequence. This is a local alignment in which only High-scoring Segment Pairs (HSPs) are reported. In other words, BLAST finds islands of similarity between sequences. 32 | 33 | ![An outline of the BLAST algorithm](https://www.ncbi.nlm.nih.gov/books/NBK62051/bin/blast_glossary-Image001.jpg "An outline of the BLAST algorithm") 34 | 35 | ## The command line version of BLAST 36 | 37 | BLAST can be used online, or through the command line. 38 | Most biologists are familiar with [NCBI's web application for BLAST](). 39 | If you use this web application regularly, the command line BLAST program is worth your consideration. 
40 | The command line version of BLAST has several advantages over the web version: 41 | 42 | 1. BLAST on the command line can be used to run *local searches*, i.e. searches which use files that are on your computer, instead of files that are on an NCBI database. 43 | 2. BLAST searches on the command line can be made more specific by adding additional arguments. 44 | 3. BLAST searches carried out on the command line can be automated, and incorporated into larger scripts. 45 | 4. The command line BLAST program can output search results in various structured text formats. 46 | 47 | The command line version of BLAST can be downloaded via [conda](conda.md) using the following command: 48 | 49 | ```bash 50 | $ conda install -c bioconda blast 51 | ``` 52 | 53 | This program is included in the [bioinfo-notebook conda environment](../envs/bioinfo-notebook.txt). 54 | 55 | ## Types of BLAST search 56 | 57 | There are five main types of BLAST search: 58 | 59 | 1. **BLASTp** searches a protein database with a protein query sequence. 60 | 2. **BLASTn** searches a nucleic acid database with nucleic acid query sequence. 61 | 3. **BLASTx** searches a protein database with nucleic acid query sequence, which is translated into an amino acid sequence. 62 | 4. **tBLASTx** searches a nucleic acid database with nucleic acid query sequence. In this case, both the database (subject) sequences and query sequence are translated into amino acid sequences. 63 | 5. **tBLASTn** searches a nucleic acid database with protein query sequence. In this case, the nucleic acid database is translated into a set of amino acid sequences. 64 | 65 | While the type of query and subject sequences required for each of these BLAST searches differs, the command line arguments that can be used for these BLAST searches are interchangeable. 66 | 67 | ## E-value and Bit-score 68 | 69 | Two important variables when interpreting BLAST results are *E-value* and *bit-score*. 
70 | These are both derived from the *raw alignment score (S)*, which is based on the number of residues (i.e. individual amino/nucleic acids) that two sequences have in common. 71 | The more identical residues that two sequences have at the same position in an alignment, the higher the alignment score. 72 | 73 | - **Bit-score (S')** is the raw alignment score (S) normalised with respect to the scoring system used for the alignment. 74 | - **E-value** or Expectation value is the number of different alignments with scores equivalent to or better than S that is expected to occur in a database search by chance. The lower the E value, the more significant the score and the alignment. An exact match between query and subject sequences results in an E-value of zero. 75 | 76 | While bit-scores are comparable between searches, as they are normalised, they do not take the size of the database into account. 77 | E-values, however, do account for the size of the database. 78 | The lower the E-value and the higher the bit-score, the better the BLAST result. 79 | 80 | ## Creating a BLAST database using `makeblastdb` 81 | 82 | To search against a set of nucleotide or amino acid sequences using BLAST, a database must be created. 83 | This can be done using the `makeblastdb` command. 84 | 85 | ```bash 86 | $ makeblastdb -dbtype prot/nucl -in input_file -out database_name 87 | ``` 88 | 89 | In this command... 90 | 91 | 1. `-dbtype` specifies the type of sequences used to create the database. For amino acid (protein) sequences, `prot` is used ("`-dbtype prot`"). For nucleic acid sequences, `nucl` is used ("`-dbtype nucl`"). 92 | 2. `-in` is used to specify the input file. The database created can be used to search against the sequences in this file. 93 | 3. `-out` is used to name the database that will be created from the input file. 
94 | 95 | ## Downloading Swiss-Prot FASTA sequences and creating a BLAST protein database 96 | 97 | In this video, the FASTA amino acid sequences of Swiss-Prot are downloaded, and a BLAST protein database is created from these sequences using `makeblastdb`. 98 | [UniProtKB/Swiss-Prot](https://en.wikipedia.org/wiki/UniProt#UniProtKB.2FSwiss-Prot) is a manually annotated, non-redundant protein sequence database. 99 | As it is well-annotated and curated, the Swiss-Prot database gives informative results when searched locally using `blastp` and `blastx`. 100 | The link used in the `wget` command is copied and pasted from the [UniProt downloads page](https://www.uniprot.org/downloads). 101 | The compressed FASTA sequences of the Swiss-Prot database are hosted on `ftp.uniprot.org`. 102 | 103 | These FASTA amino acid sequences are compressed into a `.gz` (gzip) file. 104 | Before using the `makeblastdb` command, this FASTA file is uncompressed using `gunzip`, turning `uniprot_sprot`**`.fasta.gz`** into `uniprot_sprot`**`.fasta`**. 105 | Once the FASTA file is downloaded and uncompressed, `makeblastdb` is used to create a BLAST protein database of the amino acid sequences in this FASTA file. 106 | This BLAST protein database is named `swissprot`, and consists of three binary files. 107 | 108 | Once the BLAST protein database is created, `blastp` and `blastx` can be used to search sequences against it. 109 | This database can be selected using the argument `-db swissprot` with `blastp` or `blastx` (the path to the `swissprot` database will need to be given if the command is run from a different directory). 110 | 111 | [![asciicast](https://asciinema.org/a/338534.svg)](https://asciinema.org/a/338534?autoplay=1) 112 | 113 | ## Searching against a BLAST nucleotide database using `blastn` 114 | 115 | The program `blastn` is used for searching nucleotide databases with a nucleotide query. 
116 | 117 | ```bash 118 | $ blastn -query query_file.fna -db nucl_database_name -out results_file.tsv -outfmt 6 -evalue x -max_hsps y -num_threads n 119 | ``` 120 | 121 | In this command... 122 | 123 | 1. `-query` is used to select the FASTA nucleic acids file you want to search against the BLAST database (the `query_file.fna`). 124 | 2. `-db` is used to select the BLAST nucleotide database you want to search against (`nucl_database_name`). 125 | 3. `-out` is used to direct the results to an output file (`results_file.tsv`). 126 | 4. `-outfmt` is used to specify how this results file should be formatted. In this case, as `-outfmt` is `6`, the results will be written to a file as tab-separated values: this is why `results_file.tsv` has a `.tsv` extension. 127 | 5. `-evalue` is used to set an E-value threshold (`x`). Results which have an E-value greater than this threshold will not be written to the results file. 128 | 6. `-max_hsps` is used to set a High-scoring Segment Pairs (HSPs) threshold (`y`). When given, no more than `y` HSPs (alignments) for each query-subject pair will be written to the results file. 129 | 7. `-num_threads` is used to set the number (*`n`*) of threads/processors to use (default 1). 130 | 131 | The last two arguments given in this command — `-evalue` and `-max_hsps` — are optional, but they are useful as they allow the results to be filtered before being written to the file. 132 | Using these arguments will result in more specific results, and will reduce the need to manually filter results later. 133 | 134 | ## BLAST `-outfmt 6` results 135 | 136 | These BLAST results are taken from the [video demonstration](#video-demonstration) and are in BLAST output format 6. 
137 | 138 | ``` 139 | gi|242120357|gb|FJ461870.1| NC_001144.5 93.252 163 11 0 196 358 454921 454759 7.57e-63 241 140 | gi|242120357|gb|FJ461870.1| NC_001144.5 93.252 163 11 0 196 358 464058 463896 7.57e-63 241 141 | gi|242120357|gb|FJ461870.1| CP036478.1 93.252 163 11 0 196 358 454829 454667 7.57e-63 241 142 | gi|242120357|gb|FJ461870.1| CP036478.1 93.252 163 11 0 196 358 463966 463804 7.57e-63 241 143 | gi|242120357|gb|FJ461870.1| CP024006.1 93.252 163 11 0 196 358 453978 453816 7.57e-63 241 144 | ``` 145 | 146 | These results are tab-separated values, meaning each column in the results is separated by a `Tab` character. 147 | These columns always appear in the same order: 148 | 149 | ``` 150 | query_id subject_id per_identity aln_length mismatches gap_openings q_start q_end s_start s_end e-value bit_score 151 | ``` 152 | 153 | In this format... 154 | 155 | 1. `query_id` is the FASTA header of the sequence being searched against the database (the query sequence). 156 | 2. `subject_id` is the FASTA header of the sequence in the database that the query sequence has been aligned to (the subject sequence). 157 | 3. `per_identity` is the percentage identity- the extent to which the query and subject sequences have the same residues at the same positions. 158 | 4. `aln_length` is the alignment length. 159 | 5. `mismatches` is the number of mismatches. 160 | 6. `gap_openings` is the number of gap openings in the alignment. 161 | 7. `q_start` is the start of the alignment in the query sequence. 162 | 8. `q_end` is the end of the alignment in the query sequence. 163 | 9. `s_start` is the start of the alignment in the subject sequence. 164 | 10. `s_end` is the end of the alignment in the subject sequence. 165 | 11. `e_value` is the expect value (E-value) for the alignment. 166 | 12. `bit_score` is the bit-score of the alignment. 167 | 168 | All BLAST output formats above 4 (i.e. `--outfmt > 4`) use this tabular layout, formatted in different ways. 
For example, `-outfmt 10` gives the same information in a comma-separated values (`.csv`) file instead of a tab-separated values (`.tsv`) file. 170 | 171 | ## Video demonstration 172 | 173 | In this demonstration, `makeblastdb` is used to create a BLAST database from the file `S_cere_genomes.fna`. 174 | This FASTA nucleic acids (`.fna`) file was created by concatenating the following *Saccharomyces cerevisiae* genome assemblies, which were downloaded from NCBI: [GCA_003086655.1](https://www.ncbi.nlm.nih.gov/assembly/GCA_003086655.1), [GCA_003709285.1](https://www.ncbi.nlm.nih.gov/assembly/GCA_003709285.1) and [GCA_004328465.1](https://www.ncbi.nlm.nih.gov/assembly/GCA_004328465.1). 175 | 176 | The program `blastn` is then used to query `23S_rRNA_gene.fna` against this database. 177 | This file is a copy of the [*Scutellospora fulgida* isolate NC303A 25S ribosomal RNA gene](https://www.ncbi.nlm.nih.gov/nuccore/FJ461870.1?report=fasta) from NCBI. 178 | 179 | The program `tblastn` is also used to query `YPK1.faa` against this database multiple times. 180 | This FASTA amino acid (`.faa`) file is a copy of the [serine/threonine-protein kinase YPK1](https://www.uniprot.org/uniprot/P12688) from UniProt. 181 | This search is carried out multiple times with additional parameters: the flag `-evalue` is used to set an E-value threshold, and the flag `-max_hsps` is used to set a maximum number of High-scoring Segment Pairs (HSPs). 182 | 183 | The results from these BLAST searches are written to tab-separated values (`.tsv`) files. 184 | This output format is specified with the flag `-outfmt 6`. 
185 | 186 | [![asciicast](https://asciinema.org/a/327279.svg)](https://asciinema.org/a/327279?autoplay=1) 187 | 188 | ## See also 189 | 190 | - [File formats used in bioinformatics](file_formats.md) 191 | - [Introduction to the command line](cl_intro.md) 192 | - [conda](conda.md) 193 | - [NCBI's web application for BLAST]() 194 | 195 | ## References 196 | 197 | - [BLAST® Command Line Applications User Manual](https://www.ncbi.nlm.nih.gov/books/NBK279690/) 198 | - [BLAST Glossary](https://www.ncbi.nlm.nih.gov/books/NBK62051/) 199 | -------------------------------------------------------------------------------- /docs/bowtie.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Bowtie 4 | parent: 2. Program guides 5 | --- 6 | 7 | # Bowtie 8 | 9 | `bowtie` can be used to: 10 | - index reference FASTA nucleotide genomes/sequences 11 | - align FASTQ sequencing reads to those genomes/sequences 12 | 13 | If you want to align short reads (50bp or less), [bowtie is more suitable than bowtie2](bowtie2.md#differences-between-bowtie-and-bowtie2). 14 | 15 | ## Indexing a reference genome/sequence using `bowtie-build` 16 | 17 | Before aligning reads to a reference genome with `bowtie`, it must be indexed using 18 | `bowtie-build`. 19 | This command will create six files with the extensions `.1.ebwt`, `.2.ebwt`, `.3.ebwt`, `.4.ebwt`, `.rev.1.ebwt`, and `.rev.2.ebwt`. 20 | These six files together are the index. 21 | Once an index has been created, the original reference genome/sequence is no longer needed to align reads. 22 | Here's an example `bowtie-build` command: 23 | 24 | ``` 25 | $ bowtie-build reference_sequence.fasta index_name 26 | ``` 27 | 28 | In this command, the `reference_sequence.fasta` is the nucleotide FASTA sequence we want to index, and `index_name` is the name of the index. 
29 | There will be six files beginning with the `index_name` in the output directory: `index_name.1.ebwt`, `index_name.2.ebwt`, `index_name.3.ebwt`, `index_name.4.ebwt`, `index_name.rev.1.ebwt`, and `index_name.rev.2.ebwt`. 30 | There's no need to specify any of these files individually in subsequent `bowtie` commands, the `index_name` alone is enough to refer to the entire index. 31 | 32 | ## Aligning reads to an indexed genome/sequence using `bowtie` 33 | 34 | Now that the genome has been indexed, FASTQ sequencing reads can be aligned to it. 35 | This is done using the `bowtie` command. 36 | Here is an example `bowtie` command: 37 | 38 | ``` 39 | $ bowtie --no-unal --threads n --sam index_name -1 reads_1.fastq -2 reads_2.fastq output.sam 40 | ``` 41 | 42 | In this command... 43 | 44 | 1. **`--no-unal`** is an optional argument, meaning reads that do not align to the reference genome will not be written to `sam` output 45 | 2. **`--threads`** is the number (*n*) of processors/threads used 46 | 3. **`--sam`** specifies that the output should be written in the [SAM format](file_formats.md#sam) 47 | 4. **`index_name`** is the name of the genome index 48 | 5. **`-1`** is the file(s) containing mate 1 reads ([`reads_1.fastq`](file_formats.md#fastq)) 49 | 6. **`-2`** is the file(s) containing mate 2 reads ([`reads_2.fastq`](file_formats.md#fastq)) 50 | 7. **`output.sam`** is the output alignment in `sam` format 51 | 52 | ## Demonstration 53 | 54 | In this video, `bowtie-build` is used to index `S_cere_GCF_000146045.2_R64_genomic.fna`, which is a copy of the [*Saccharomyces cerevisiae* S288C genome from RefSeq](https://www.ncbi.nlm.nih.gov/assembly/GCF_000146045.2). 55 | The `bowtie` command is then used to align [*Saccharomyces cerevisiae* RNAseq reads](https://www.ncbi.nlm.nih.gov/sra/SRR11462797) to this bowtie index. 56 | 57 | [![asciicast](https://asciinema.org/a/316272.svg)](https://asciinema.org/a/316272?autoplay=1) 58 | 59 | ## Further reading 60 | 61 | 1. 
The `bowtie` manual: 62 | -------------------------------------------------------------------------------- /docs/bowtie2.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Bowtie2 4 | parent: 2. Program guides 5 | --- 6 | 7 | # Bowtie2 8 | 9 | From the manual: [*"Bowtie 2 is an ultrafast and memory-efficient tool for aligning sequencing reads to long reference sequences"*](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml). 10 | 11 | `bowtie2` can be used to: 12 | - index reference FASTA nucleotide genomes/sequences 13 | - align FASTQ sequencing reads to those genomes/sequences 14 | 15 | ## Differences between `bowtie` and `bowtie2` 16 | 17 | - `bowtie2` has no upper limit on read length 18 | - `bowtie2` can make gapped alignments 19 | - `bowtie2` is more flexible for paired-end alignment 20 | - `bowtie2` is faster and more memory efficient 21 | - `bowtie` is advantageous over `bowtie2` for relatively short sequencing reads (50bp or less) 22 | 23 | ## Indexing a reference genome/sequence using `bowtie2-build` 24 | 25 | Before aligning reads to a reference genome with `bowtie2`, it must be indexed using `bowtie2-build`. 26 | This command will create six files with the extensions `.1.bt2`, `.2.bt2`, `.3.bt2`, `.4.bt2`, `.rev.1.bt2`, and `.rev.2.bt2`. 27 | These six files together are the index. 28 | Once an index has been created, the original reference genome/sequence is no longer needed to align reads. 29 | Here's an example `bowtie2-build` command: 30 | 31 | ``` 32 | $ bowtie2-build reference_sequence.fasta index_name 33 | ``` 34 | 35 | In this command, the `reference_sequence.FASTA` is the nucleotide FASTA sequence we want to index, and `index_name` is the name of the index. 
36 | There will be six files beginning with the `index_name` in the output directory: `index_name.1.bt2`, `index_name.2.bt2`, `index_name.3.bt2`, `index_name.4.bt2`, `index_name.rev.1.bt2`, and `index_name.rev.2.bt2`. 37 | There's no need to specify any of these files individually, just the `index_name` alone is enough to refer to the entire index. 38 | 39 | ## Aligning reads to an indexed genome/sequence using `bowtie2` 40 | 41 | Now that the genome has been indexed, FASTQ sequencing reads can be aligned to it. 42 | This is done using the `bowtie2` command. 43 | Here's an example `bowtie2` command: 44 | 45 | ``` 46 | $ bowtie2 --no-unal -p n -x index_name -1 reads_1.fastq -2 reads_2.fastq -S output.sam 47 | ``` 48 | 49 | In this command... 50 | 51 | 1. **`--no-unal`** is an optional argument, meaning reads that do not align to the reference genome will not be written to `sam` output 52 | 2. **`-p`** is the number (*n*) of processors/threads used 53 | 3. **`-x`** is the genome index 54 | 4. **`-1`** is the file(s) containing mate 1 reads 55 | 5. **`-2`** is the file(s) containing mate 2 reads 56 | 6. **`-S`** is the output alignment in `sam` format 57 | 58 | ## Demonstration 59 | 60 | In this video, `bowtie2-build` is used to index `example_nucleotide_sequence.fasta`, and the command `bowtie2` is used to align reads to this bowtie2 index. 61 | 62 | [![asciicast](https://asciinema.org/a/306546.svg)](https://asciinema.org/a/306546?autoplay=1) 63 | 64 | ## Further reading 65 | 66 | 1. The `bowtie2` manual: 67 | -------------------------------------------------------------------------------- /docs/cl_solutions.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Command line exercise solutions 4 | nav_exclude: true 5 | --- 6 | 7 | # Command line exercise solutions 8 | 9 | The `pwd` commands in these solutions are added to clarify the working directories used. 
10 | 11 | **1.** Change the working directory from `bioinfo-notebook/` to `bioinfo-notebook/data/`. 12 | 13 | ```bash 14 | ronan@dell:~/bioinfo-notebook$ pwd 15 | /home/ronan/bioinfo-notebook 16 | ronan@dell:~/bioinfo-notebook$ cd data/ 17 | ronan@dell:~/bioinfo-notebook/data$ pwd 18 | /home/ronan/bioinfo-notebook/data 19 | ``` 20 | 21 | **2.** Change the working directory from `bioinfo-notebook/data` to `bioinfo-notebook/docs`, using `../` in your command. 22 | 23 | ```bash 24 | ronan@dell:~/bioinfo-notebook/data$ pwd 25 | /home/ronan/bioinfo-notebook/data 26 | ronan@dell:~/bioinfo-notebook/data$ cd ../docs/ 27 | ronan@dell:~/bioinfo-notebook/docs$ pwd 28 | /home/ronan/bioinfo-notebook/docs 29 | ``` 30 | 31 | **3.** List the files in the `bioinfo-notebook/docs/` directory. 32 | 33 | ```bash 34 | ronan@dell:~/bioinfo-notebook/docs$ pwd 35 | /home/ronan/bioinfo-notebook/docs 36 | ronan@dell:~/bioinfo-notebook/docs$ ls 37 | bowtie2.md file_formats.md 38 | bowtie.md htseq-count.md 39 | cl_intro.md linux_setup.md 40 | cl_solutions.md part1.md 41 | combining_featCount_tables.md part2.md 42 | conda.md part3.md 43 | fasterq-dump.md samtools.md 44 | fastq-dump.md to_do.md 45 | fastq-dump_to_featureCounts.md ubuntu_virtualbox.md 46 | featureCounts.md wsl.md 47 | ``` 48 | 49 | **4.** Select a file in the `bioinfo-notebook/docs/` directory, and display the first 6 lines of it using the `head` command. 50 | 51 | ```bash 52 | ronan@dell:~/bioinfo-notebook/docs$ pwd 53 | /home/ronan/bioinfo-notebook/docs 54 | ronan@dell:~/bioinfo-notebook/docs$ head cl_solutions.md 55 | --- 56 | layout: default 57 | title: Command line exercise solutions 58 | nav_exclude: true 59 | --- 60 | 61 | # Command line exercise solutions 62 | 63 | 1. Change the working directory from `bioinfo-notebook/` to `bioinfo-notebook/data/`. 64 | ``` 65 | 66 | **5.** Display the last 2 lines of all the files in the `bioinfo-notebook/docs/` directory, using the `tail` command. 
67 | 68 | ```bash 69 | ronan@dell:~/bioinfo-notebook/docs$ pwd 70 | /home/ronan/bioinfo-notebook/docs 71 | ronan@dell:~/bioinfo-notebook/docs$ tail -n 2 * 72 | ``` 73 | ``` 74 | ==> bowtie2.md <== 75 | 76 | 1. The `bowtie2` manual: 77 | 78 | ==> bowtie.md <== 79 | 80 | 1. The `bowtie` manual: 81 | 82 | ==> cl_intro.md <== 83 | - [File formats used in bioinformatics](file_formats.md) 84 | - [The DataCamp "Introduction to Shell" interactive course](https://www.datacamp.com/courses/introduction-to-shell-for-data-science) 85 | 86 | ==> cl_solutions.md <== 87 | 5. Display the last 2 lines of all the files in the `bioinfo-notebook/docs/` directory, using the `tail` command. 88 | 6. From the `bioinfo-notebook/docs/` directory, list the files in the `bioinfo-notebook/envs/` directory. 89 | 90 | ==> combining_featCount_tables.md <== 91 | 92 | - [fastq-dump_to_featureCounts.sh](fastq-dump_to_featureCounts.md) 93 | 94 | ==> conda.md <== 95 | 2. Conda packages: 96 | 3. Conda environments: 97 | 98 | ==> fasterq-dump.md <== 99 | 100 | 1. [How to use fasterq-dump from the sra-tools wiki on GitHub](https://github.com/ncbi/sra-tools/wiki/HowTo:-fasterq-dump) 101 | 102 | ==> fastq-dump.md <== 103 | 104 | 1. Rob Edward's notes on `fastq-dump`: 105 | 106 | ==> fastq-dump_to_featureCounts.md <== 107 | 108 | 1. [fastq-dump_to_featureCounts.sh on GitHub](https://github.com/rnnh/bioinfo-notebook/blob/master/scripts/fastq-dump_to_featureCounts.sh) 109 | 110 | ==> featureCounts.md <== 111 | 1. The `subread` user guide: 112 | 2. The `featureCounts` paper: 113 | 114 | ==> file_formats.md <== 115 | - [GTF2.2: A Gene Annotation Format (Revised Ensembl GTF)](http://mblab.wustl.edu/GTF22.html) 116 | - [GFF3 Specification](https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md) 117 | 118 | ==> htseq-count.md <== 119 | 120 | 1. 
The `htseq-count` manual: 121 | 122 | ==> linux_setup.md <== 123 | - [Using Ubuntu through a Virtual Machine](ubuntu_virtualbox.md) 124 | - [Windows Subsystem for Linux](wsl.md) 125 | 126 | ==> part1.md <== 127 | 128 | These are general guides for installing Ubuntu, using the command line, and the types of files used in bioinformatics. 129 | 130 | ==> part2.md <== 131 | 132 | These are guides to individual programs. 133 | 134 | ==> part3.md <== 135 | 136 | These are scripts that use the programs discussed in this project. 137 | 138 | ==> samtools.md <== 139 | - [Alignment formats](file_formats.md#alignment-formats) 140 | - The `samtools` manual: 141 | 142 | ==> to_do.md <== 143 | - Add page on `trimmomatic` 144 | - Entry on BED/bigWig 145 | 146 | ==> ubuntu_virtualbox.md <== 147 | - [What is a Virtual Machine?](https://azure.microsoft.com/en-us/overview/what-is-a-virtual-machine/) 148 | - [How to Install Ubuntu on VirtualBox](https://www.wikihow.com/Install-Ubuntu-on-VirtualBox) 149 | 150 | ==> wsl.md <== 151 | - [Using Ubuntu through a Virtual Machine](ubuntu_virtualbox.md) 152 | - [conda](conda.md) 153 | ``` 154 | 155 | **6.** From the `bioinfo-notebook/docs/` directory, list the files in the `bioinfo-notebook/envs/` directory. 156 | 157 | ```bash 158 | ronan@dell:~/bioinfo-notebook/docs$ pwd 159 | /home/ronan/bioinfo-notebook/docs 160 | ronan@dell:~/bioinfo-notebook/docs$ ls ../envs/ 161 | bioinfo-notebook.txt bioinfo-notebook.yml 162 | ``` 163 | -------------------------------------------------------------------------------- /docs/combining_featCount_tables.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Combining featCount tables.py 4 | parent: 3. Scripts 5 | --- 6 | 7 | 8 | # Combining featCount tables.py 9 | 10 | This is a Python script that creates a single CSV feature count table from the featureCounts output tables in the target directory. 
11 | This combined feature count table can be used for differential expression analysis (e.g. using DESeq2 or edgeR in R). 12 | 13 | ## Demonstration 14 | 15 | This is a video demonstration of [combining_featCount_tables.py](../scripts/combining_featCount_tables.py). 16 | 17 | [![asciicast](https://asciinema.org/a/311771.svg)](https://asciinema.org/a/311771?autoplay=1) 18 | 19 | In this video, `combining_featCount_tables.py` is used to combine the following [featureCounts](featureCounts.md) tables: 20 | 21 | ``` 22 | feature_counts_SRR8933506_S_cere_GCF_000146045.2_R64_genomic.fna.tsv 23 | feature_counts_SRR8933509_S_cere_GCF_000146045.2_R64_genomic.fna.tsv 24 | feature_counts_SRR8933510_S_cere_GCF_000146045.2_R64_genomic.fna.tsv 25 | feature_counts_SRR8933511_S_cere_GCF_000146045.2_R64_genomic.fna.tsv 26 | feature_counts_SRR8933512_S_cere_GCF_000146045.2_R64_genomic.fna.tsv 27 | feature_counts_SRR8933530_S_cere_GCF_000146045.2_R64_genomic.fna.tsv 28 | feature_counts_SRR8933531_S_cere_GCF_000146045.2_R64_genomic.fna.tsv 29 | feature_counts_SRR8933532_S_cere_GCF_000146045.2_R64_genomic.fna.tsv 30 | feature_counts_SRR8933533_S_cere_GCF_000146045.2_R64_genomic.fna.tsv 31 | feature_counts_SRR8933534_S_cere_GCF_000146045.2_R64_genomic.fna.tsv 32 | feature_counts_SRR8933535_S_cere_GCF_000146045.2_R64_genomic.fna.tsv 33 | feature_counts_SRR8933536_S_cere_GCF_000146045.2_R64_genomic.fna.tsv 34 | feature_counts_SRR8933537_S_cere_GCF_000146045.2_R64_genomic.fna.tsv 35 | feature_counts_SRR8933538_S_cere_GCF_000146045.2_R64_genomic.fna.tsv 36 | feature_counts_SRR8933539_S_cere_GCF_000146045.2_R64_genomic.fna.tsv 37 | ``` 38 | 39 | These featureCounts results were generated using the following [fastq-dump_to_featureCounts.sh](fastq-dump_to_featureCounts.md) command: 40 | 41 | ```bash 42 | $ bash ../scripts/fastq-dump_to_featureCounts.sh -a S_cere_GCF_000146045.2_R64_genomic.gtf -f S_cere_GCF_000146045.2_R64_genomic.fna --verbose -p 3 SRR8933506 SRR8933509 SRR8933510 SRR8933511
SRR8933512 SRR8933530 SRR8933531 SRR8933532 SRR8933533 SRR8933534 SRR8933535 SRR8933536 SRR8933537 SRR8933538 SRR8933539 43 | ``` 44 | 45 | In this command, the full genome sequence (`S_cere_GCF_000146045.2_R64_genomic.fna`) and genome annotation (`S_cere_GCF_000146045.2_R64_genomic.gtf`) for [*Saccharomyces cerevisiae* S288C](https://www.ncbi.nlm.nih.gov/assembly/GCF_000146045.2) are used. 46 | 47 | These featureCounts results were then combined using the following command: 48 | 49 | ```bash 50 | $ python ../scripts/combining_featCount_tables.py 51 | ``` 52 | 53 | Running this script combines all the featureCounts results in a directory into a single CSV file. 54 | If a custom name for this file is not given, it will be given a name using this scheme: `featCounts_{species}_{date}.csv`. 55 | 56 | ## Usage 57 | 58 | ``` 59 | usage: combining_featCount_tables.py [-h] [-d PATH] [-o CUSTOM_FILENAME] 60 | 61 | Combines the featureCounts output tables in the target directory. 62 | 63 | optional arguments: 64 | -h, --help show this help message and exit 65 | -d PATH, --directory PATH 66 | path to target directory. Default: current directory 67 | -o CUSTOM_FILENAME, --output CUSTOM_FILENAME 68 | output filename. Default: 69 | featCounts_{species}_{date}.csv 70 | ``` 71 | 72 | ## See also 73 | - [combining_featCount_tables.py on GitHub](https://github.com/rnnh/bioinfo-notebook/blob/master/scripts/combining_featCount_tables.py) 74 | - [fastq-dump_to_featureCounts.sh](fastq-dump_to_featureCounts.md) 75 | -------------------------------------------------------------------------------- /docs/conda.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Conda 4 | parent: 2. Program guides 5 | --- 6 | 7 | # Conda 8 | 9 | From the website, `conda` provides ["Package, dependency and environment management for any language"](https://docs.conda.io/en/latest/). 
10 | 11 | Conda is a package manager that allows specific versions of programs to be installed, alongside their dependencies. 12 | Different sets of programs can be installed to different [virtual environments](https://www.anaconda.com/moving-conda-environments/). 13 | A virtual environment is basically a set of programs. 14 | 15 | ## Installing `conda` 16 | 17 | Conda is part of [Anaconda](https://www.anaconda.com/distribution/), which is available for free. 18 | Conda is also available through [Miniconda](https://docs.conda.io/en/latest/miniconda.html), a free minimal installer for conda. 19 | 20 | Conda can be installed on a 64-bit Linux system with the following commands... 21 | 22 | ```bash 23 | # Downloading miniconda 24 | $ wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh 25 | # Installing miniconda 26 | $ bash miniconda.sh -b -p $HOME/miniconda 27 | # Updating conda 28 | $ conda update -q conda 29 | ``` 30 | 31 | ## Cloning and activating a `conda` environment 32 | 33 | Conda virtual environments can be shared, either as a `.yml` file or a `.txt` file. 34 | A `.yml` copy of a conda environment can be used to recreate that environment on another machine, regardless of the operating system platform used. 35 | A `.txt` copy of a conda environment is more explicit: it can be used to create an identical copy of a conda environment using the same operating system platform as the original machine. 36 | A conda virtual environment is used throughout this project: a [`.yml` copy](../envs/bioinfo-notebook.yml) and an [explicit `.txt` copy](../envs/bioinfo-notebook.txt) of this conda environment are provided. 37 | 38 | A conda environment can be activated using `$ conda activate name_of_environment`. 39 | Once activated, the programs installed in this environment are available. 40 | Conda can be deactivated using `$ conda deactivate`.
41 | 42 | The `conda` environment used throughout this project can be created from [bioinfo-notebook.txt](../envs/bioinfo-notebook.txt) and activated using the following commands... 43 | 44 | ```bash 45 | # Creating the bioinfo-notebook environment 46 | /bioinfo-notebook $ conda create --name bioinfo-notebook --file envs/bioinfo-notebook.txt 47 | # Activating the bioinfo-notebook environment 48 | $ conda activate bioinfo-notebook 49 | # Once activated, the environment name is at the start of the bash prompt 50 | (bioinfo-notebook) $ 51 | ``` 52 | 53 | ## Demonstration 54 | 55 | In this video demonstration, a conda virtual environment is created using [bioinfo-notebook.txt](../envs/bioinfo-notebook.txt). 56 | This virtual environment is then activated using `conda activate bioinfo-notebook`. 57 | Note that the name of the active conda environment is displayed in brackets at the start of the bash prompt: `(name of active environment) ... $`. 58 | 59 | [![asciicast](https://asciinema.org/a/305992.svg)](https://asciinema.org/a/305992?autoplay=1) 60 | 61 | ## Further reading 62 | 1. Downloading conda: 63 | 2. Conda packages: 64 | 3. Conda environments: 65 | -------------------------------------------------------------------------------- /docs/fasterq-dump.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Fasterq-dump 4 | parent: 2. Program guides 5 | --- 6 | 7 | # Fasterq-dump 8 | 9 | `fasterq-dump` is a tool for downloading sequencing reads from [NCBI's Sequence Read Archive (SRA)](https://www.ncbi.nlm.nih.gov/sra). 10 | These sequence reads will be downloaded as [FASTQ files](file_formats.md#fastq). 11 | `fasterq-dump` is a newer, streamlined alternative to [fastq-dump](fastq-dump.md); both of these programs are a part of [sra-tools](https://anaconda.org/bioconda/sra-tools). 
12 | 13 | ## `fasterq-dump` vs `fastq-dump` 14 | 15 | Here are a few of the differences between `fastq-dump` and `fasterq-dump`: 16 | 17 | 1. In `fastq-dump`, the flag `--split-3` is required to separate paired reads into left and right ends. This is the default setting in `fasterq-dump`. 18 | 2. The `fastq-dump` flag `--skip-technical` is no longer required to skip technical reads in `fasterq-dump`. Instead, the flag `--include-technical` is required to include technical reads when using `fasterq-dump`. 19 | 3. There is no `--gzip` or `--bzip2` flag in `fasterq-dump` to download compressed reads with `fasterq-dump`. However, FASTQ files downloaded using `fasterq-dump` can still be subsequently compressed. 20 | 21 | The following commands are equivalent, but will be executed faster using `fasterq-dump`: 22 | 23 | ``` 24 | $ fastq-dump SRR_ID --split-3 --skip-technical 25 | $ fasterq-dump SRR_ID 26 | ``` 27 | 28 | ## Downloading reads from the SRA using `fasterq-dump` 29 | 30 | In this example, we want to download FASTQ reads for a mate-pair library. 31 | 32 | ``` 33 | fasterq-dump --threads n --progress SRR_ID 34 | ``` 35 | 36 | In this command... 37 | 38 | 1. **`--threads`** specifies the number (*`n`*) of processors/threads to be used. 39 | 2. **`--progress`** is an optional argument that displays a progress bar when the reads are being downloaded. 40 | 3. **`SRR_ID`** is the ID of the run from the SRA to be downloaded. This ID begins with "SRR" and is followed by around seven digits (e.g. `SRR1234567`). 41 | 42 | ## Demonstration 43 | 44 | In this video, `fasterq-dump` is used to download [*Saccharomyces cerevisiae* RNAseq reads](https://www.ncbi.nlm.nih.gov/sra/SRR11462797) from the SRA. 45 | 46 | [![asciicast](https://asciinema.org/a/316273.svg)](https://asciinema.org/a/316273?autoplay=1) 47 | 48 | ## See also 49 | 50 | - [fastq-dump](fastq-dump.md) 51 | 52 | ## References 53 | 54 | 1.
[How to use fasterq-dump from the sra-tools wiki on GitHub](https://github.com/ncbi/sra-tools/wiki/HowTo:-fasterq-dump) 55 | -------------------------------------------------------------------------------- /docs/fastq-dump.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Fastq-dump 4 | parent: 2. Program guides 5 | --- 6 | 7 | # Fastq-dump 8 | 9 | `fastq-dump` is a tool for downloading sequencing reads from [NCBI's Sequence Read Archive (SRA)](https://www.ncbi.nlm.nih.gov/sra). 10 | These sequence reads will be downloaded as FASTQ files. 11 | How these FASTQ files are formatted depends on the `fastq-dump` options used. 12 | 13 | ## Downloading reads from the SRA using `fastq-dump` 14 | 15 | In this example, we want to download FASTQ reads for a mate-pair library. 16 | 17 | ``` 18 | $ fastq-dump --gzip --skip-technical --readids --read-filter pass --dumpbase --split-3 --clip --outdir path/to/reads/ SRR_ID 19 | ``` 20 | 21 | In this command... 22 | 23 | 1. **`--gzip`**: Compress output using gzip. Gzip archived reads can be read directly by [bowtie2](bowtie2.md). 24 | 2. **`--skip-technical`**: Dump only biological reads, skip the technical reads. 25 | 3. **`--readids`** or **`-I`**: Append read ID after spot ID as 'accession.spot.readid'. With this flag, one sequence gets appended the ID `.1` and the other `.2`. Without this option, pair-ended reads will have identical IDs. 26 | 4. **`--read-filter pass`**: Only returns reads that pass filtering (without `N`s). 27 | 5. **`--dumpbase`** or **`-B`**: Formats sequence using base space (default for other than SOLiD). Included to avoid colourspace (in which pairs of bases are represented by numbers). 28 | 6. **`--split-3`** separates the reads into left and right ends. If there is a left end without a matching right end, or a right end without a matching left end, they will be put in a single file. 29 | 7. 
**`--clip`** or **`-W`**: Some of the sequences in the SRA contain tags that need to be removed. This will remove those sequences. 30 | 8. **`--outdir`** or **`-O`**: *(Optional)* Output directory, default is current working directory. 31 | 9. **`SRR_ID`**: This is the ID of the run from the SRA to be downloaded. This ID begins with "SRR" and is followed by around seven digits (e.g. `SRR1234567`). 32 | 33 | Other options that can be used instead of `--split-3`: 34 | 35 | 1. **`--split-files`** splits the FASTQ reads into two files: one file for mate 1s (`..._1`), and another for mate 2s (`..._2`). This option will not put mateless pairs into a third file. 36 | 2. **`--split-spot`** splits the FASTQ reads into two (mate 1s and mate 2s) within one file. `--split-spot` gives you an 8-line fastq format where forward precedes reverse (see ). 37 | 38 | ## Demonstration 39 | 40 | In this demo, `fastq-dump` is used to download compressed FASTQ reads. 41 | 42 | [![asciicast](https://asciinema.org/a/306937.svg)](https://asciinema.org/a/306937?autoplay=1) 43 | 44 | ## Further reading 45 | 46 | 1. Rob Edward's notes on `fastq-dump`: 47 | -------------------------------------------------------------------------------- /docs/fastq-dump_to_featureCounts.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Fastq-dump to featureCounts 4 | parent: 3. Scripts 5 | --- 6 | 7 | # Fastq-dump to featureCounts.sh 8 | 9 | [fastq-dump_to_featureCounts.sh](../scripts/fastq-dump_to_featureCounts.sh) is a `bash` script that... 10 | 11 | 1. Downloads FASTQ reads from NCBI's SRA using [fastq-dump](fastq-dump.md) 12 | 2. Indexes a reference genome and aligns reads to that index using [bowtie2](bowtie2.md) 13 | 3. Converts the alignment file created by bowtie2 to BAM format and sorts it using [samtools](samtools.md) 14 | 4.
Assigns the read alignments to genes in a genome annotation file using [featureCounts](featureCounts.md) 15 | 16 | ## Demonstration 17 | 18 | This is a video demonstration of [fastq-dump_to_featureCounts.sh](../scripts/fastq-dump_to_featureCounts.sh). 19 | 20 | During this demonstration, the full genome sequence and genome annotation for [*Saccharomyces cerevisiae* S288C](https://www.ncbi.nlm.nih.gov/assembly/GCF_000146045.2) are used. The files [example_nucleotide_sequence.fasta](../data/example_nucleotide_sequence.fasta) and [example_genome_annotation.gtf](../data/example_genome_annotation.gtf) are fragments of the nucleotide sequence and annotation for this genome. [RNA-Seq reads for *Saccharomyces cerevisiae* (SRR8933512)](https://www.ncbi.nlm.nih.gov/sra/SRR8933512) are used as the example FASTQ files in this demonstration. 21 | 22 | [![asciicast](https://asciinema.org/a/308745.svg)](https://asciinema.org/a/308745?autoplay=1) 23 | 24 | ## Usage 25 | 26 | ``` 27 | fastq-dump_to_featureCounts.sh [options] -a|--annotation -f|--fasta 28 | 29 | This script downloads FASTQ reads from NCBI's SRA, aligns them to an annotated 30 | genome using bowtie2, and generates gene count table(s) using featureCounts. 31 | It can take a single SRR ID as an input, or multiple SRR IDs separated by 32 | spaces. 33 | 34 | Required arguments: 35 | -a | --annotation input genome annotation file 36 | -f | --fasta input FASTA file for annotated genome 37 | SRR ID(s) Sequence Read Archive Run ID(s) (SRR...) 38 | 39 | Optional arguments: 40 | -h | --help show this help text and exit 41 | -p | --processors number (n) of processors to use (default: 1) 42 | --fastq-dump use 'fastq-dump' instead of the 'fasterq-dump' 43 | --verbose make output of script more verbose 44 | --removetemp remove read and alignment files once they are 45 | no longer needed (minimises disk space needed) 46 | --log redirect terminal output to log file 47 | ``` 48 | 49 | ## See also 50 | 51 | 1. 
[fastq-dump_to_featureCounts.sh on GitHub](https://github.com/rnnh/bioinfo-notebook/blob/master/scripts/fastq-dump_to_featureCounts.sh) 52 | -------------------------------------------------------------------------------- /docs/featureCounts.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: FeatureCounts 4 | parent: 2. Program guides 5 | --- 6 | 7 | 8 | # FeatureCounts 9 | 10 | `featureCounts` is a program that counts how many reads map to genomic features, such as genes, exons, promoters and genomic bins. 11 | 12 | ## Counting how many reads align to each gene in a genome annotation using `featureCounts` 13 | 14 | `featureCounts` can be used to count how many reads align to genes as follows: 15 | 16 | ``` 17 | $ featureCounts -p -O -T n -a example_genome_annotation.gtf -o example_featureCounts_output.txt sorted_example_alignment.bam 18 | ``` 19 | 20 | In this command... 21 | 22 | 1. **`-p`** specifies that fragments (or templates) will be counted instead of reads. This is only applicable for paired-end reads. 23 | 2. **`-O`** assigns reads to all their overlapping meta-features. 24 | 3. **`-T`** specifies the number (*`n`*) of threads to be used. 25 | 4. **`-a`** is the genome annotation file (`example_genome_annotation.gtf`). 26 | 5. **`-o`** specifies the name of the output file, which includes the read counts (`example_featureCounts_output.txt`). 27 | 6. **`sorted_example_alignment.bam`** is an alignment file: in this file, the reads we want to count are aligned to the same genome as the annotation file. 28 | 29 | ### Demonstration 30 | 31 | In this video, `featureCounts` is used to assign reads in an alignment file (`sorted_example_alignment.bam`) to genes in a genome annotation file (`example_genome_annotation.gtf`). 32 | 33 | [![asciicast](https://asciinema.org/a/306584.svg)](https://asciinema.org/a/306584?autoplay=1) 34 | 35 | ## More important options for `featureCounts` 36 | 37 | 1.
**`-s`** specifies strand-specific read counting. `0` for unstranded reads, `1` for stranded reads and `2` for reversely stranded reads. This depends on the library used in the sequencing protocol. 38 | 39 | ## Further reading 40 | 41 | 1. The `subread` user guide: 42 | 2. The `featureCounts` paper: 43 | -------------------------------------------------------------------------------- /docs/genome_annotation_SwissProt_CDS.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Genome annotation script 4 | parent: 3. Scripts 5 | --- 6 | 7 | # Genome annotation SwissProt CDS.sh 8 | 9 | [genome annotation SwissProt CDS.sh](../scripts/genome_annotation_SwissProt_CDS.sh) is a bash script that annotates the coding sequences (CDS) in a given genome assembly. 10 | It uses [BLAST](blast.md) and [MGKit](https://github.com/frubino/mgkit), which are included in the `bioinfo-notebook` [conda environment](conda.md). 11 | 12 | ## Usage 13 | 14 | ``` 15 | genome_annotation_SwissProt_CDS.sh [-h|--help] [-d|--demo] [-i|--input] 16 | [-l|--log -p|--processors n -e|--email] 17 | 18 | A script to annotate proteins in a genome assembly, using BLASTx with 19 | UniProtKB/Swiss-Prot. 20 | 21 | When run with the arugment '-d' or '--demo' this script... 22 | 23 | 1. Downloads a Saccharomyces cerevisiae S288C genome assembly, and 24 | the UniProtKB/Swiss-Prot amino acid sequences. 25 | 2. Creates a BLAST database from the downloaded Swiss-Prot sequences, 26 | and searches the S. cerevisiae genome against it using BLASTx with an 27 | E-value threshold of 1e-100. 28 | 3. Filters the BLASTx results, removing results with less than 90% 29 | identity. 30 | 4. Creates a genome annotation GFF file from these BLASTx results. 31 | 5. Adds information to the genome annotation from UniProt (protein 32 | names, KeGG ortholog information, EC numbers, etc.) 
33 | 34 | The end result ('S_cere.gff') is an annotation of the coding sequences (CDS) 35 | in the S. cerevisiae genome that are described in UniProtKB/Swiss-Prot. 36 | 37 | This script can also be run with the argument '-i' or '--input', which is used 38 | to specify a FASTA nucleotide file (.fasta or .fna) to annotate, instead of 39 | the demo sequence. The end result is also an annotation of the CDS in the input 40 | sequence based on UniProtKB/Swiss-Prot, called '.gff'. 41 | 42 | This script should be called from the 'bioinfo-notebook/' directory.The 43 | programs required for this script are in the 'bioinfo-notebook' conda 44 | environment (bioinfo-notebook/envs/bioinfo-notebook.yml or 45 | bioinfo-notebook/envs/bioinfo-notebook.txt). 46 | If the input file is not in the 'bioinfo-notebook/data/' directory, the full 47 | file path should be given. 48 | 49 | arguments: 50 | -h | --help show this help text and exit 51 | -i | --input name of input FASTA nucleotide file to annotate 52 | -d | --demo run the script with demonstration inputs 53 | 54 | optional arguments: 55 | -l | --log redirect terminal output to a log file 56 | -p | --processors set the number (n) of processors to use 57 | (default: 1) 58 | -e | --email contact email for UniProt queries 59 | ``` 60 | 61 | ## See also 62 | 63 | - [genome_annotation_SwissProt_CDS.sh on GitHub](https://github.com/rnnh/bioinfo-notebook/blob/master/scripts/genome_annotation_SwissProt_CDS.sh) 64 | - [BLAST](blast.md) 65 | - [MGKit](https://github.com/frubino/mgkit) 66 | - [Conda](conda.md) 67 | -------------------------------------------------------------------------------- /docs/htseq-count.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Htseq-count 4 | parent: 2. 
Program guides 5 | --- 6 | 7 | 8 | # Htseq-count 9 | 10 | Given a file with aligned sequencing reads and a list of genomic features, `htseq-count` can be used to count how many reads map to each feature. 11 | 12 | ## Aligning reads to a genome annotation using `htseq-count` 13 | 14 | `htseq-count` can be used to align reads to a genome annotation as follows: 15 | 16 | ``` 17 | $ htseq-count --format bam sorted_alignment_file.bam genome_annotation > output_file.txt 18 | ``` 19 | 20 | In this command... 21 | 22 | 1. **`--format`** or **`-f`** is the format of the input data. Possible values are `sam` (for text SAM files) and `bam` (for binary BAM files). Default is `sam`. A `bam` file is used in this example. 23 | 2. **`--order`** specifies whether the alignments have been sorted by name (`name`) or coordinates/position (`pos`). 24 | 3. **`sorted_alignment_file.bam`** is a `bam` format alignment file, sorted by name. 25 | 4. **`genome_annotation`** is the genome annotation file the reads in the `alignment_file` are aligned to (`.gtf` or `.gff`). 26 | 5. **`> output_file.txt`** redirects the output (`STDOUT`) to `output_file.txt`. 27 | 28 | ### Demonstration 29 | 30 | In this video, `htseq-count` is used to count how many reads in an alignment file (`sorted_example_alignment.bam`) match the genes in a genome annotation (`example_genome_annotation.gtf`). 31 | 32 | [![asciicast](https://asciinema.org/a/306597.svg)](https://asciinema.org/a/306597?autoplay=1) 33 | 34 | ## The `htseq-count` output file 35 | 36 | The program outputs a table with counts for each feature, followed by the special counters, which count reads that were not counted for any feature for various reasons. 37 | The names of the special counters all start with a double underscore, to facilitate filtering (**Note:** The double underscore was absent up to version 0.5.4). 38 | The special counters are: 39 | 40 | 1.
**`__no_feature`**: reads (or read pairs) which could not be assigned to any feature (set S as described above was empty). 41 | 2. **`__ambiguous`**: reads (or read pairs) which could have been assigned to more than one feature and hence were not counted for any of these, unless the --nonunique all option was used (set S had more than one element). 42 | 3. **`__too_low_aQual`**: reads (or read pairs) which were skipped due to the optional minimal alignment quality flag. 43 | 4. **`__not_aligned`**: reads (or read pairs) in the SAM/BAM file without an alignment. 44 | 5. **`__alignment_not_unique`**: reads (or read pairs) with more than one reported alignment. 45 | 46 | ## Further reading 47 | 48 | 1. The `htseq-count` manual: 49 | -------------------------------------------------------------------------------- /docs/linux_setup.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Linux setup script 4 | parent: 3. Scripts 5 | --- 6 | 7 | # Linux setup script 8 | 9 | [linux_setup.sh](../scripts/linux_setup.sh) is a `bash` shell script that... 10 | 11 | 1. Downloads and installs [Miniconda3](conda.md) 12 | 2. Installs the `bioinfo-notebook` [virtual environment using conda](conda.md#cloning-and-activating-a-conda-environment) 13 | 14 | This will use around 2.3 GB of hard disk space in total. 15 | 16 | If you are using a Linux system that does not have Anaconda/Miniconda installed, this script will set up everything you need to follow the guides on this website. 17 | If you are using a freshly installed [Ubuntu virtual machine](ubuntu_virtualbox.md) or [Ubuntu through Windows Subsystem for Linux](wsl.md), this script is the ideal way to set up your new system. 18 | 19 | ## Demonstration 20 | 21 | This is a video demonstration of [linux_setup.sh](../scripts/linux_setup.sh). 
22 | 23 | In this demonstration, the [bioinfo-notebook GitHub repository](https://github.com/rnnh/bioinfo-notebook) (or "repo") is cloned into the home directory of the Linux system (Ubuntu). 24 | This means that all the files for this project will be downloaded from GitHub into the `~/bioinfo-notebook/` directory. 25 | A GitHub repo can be cloned using the command `$ git clone` followed by the URL of the target repo (which can be found on GitHub using the "Clone or download" button). 26 | The Linux setup script is then run from this cloned GitHub repo. 27 | 28 | [![asciicast](https://asciinema.org/a/314853.svg)](https://asciinema.org/a/314853?autoplay=1) 29 | 30 | ## Usage 31 | 32 | ``` 33 | This script downloads and installs Miniconda3, and uses conda to install 34 | the 'bioinfo-notebook' virtual environment. 35 | 36 | Before running this script... 37 | 38 | 1. Please run the following command: 39 | $ sudo apt-get update 40 | This will ensure that the software installed will be up-to-date. 41 | 42 | 2. Please ensure that the 'bioinfo-notebook/' directory is in your 43 | home directory (~). The path to this directory should look like this: 44 | $HOME/bioinfo-notebook 45 | 46 | The 'bash' command is used to run this script: 47 | $ bash $0 48 | 49 | Optional arguments: 50 | -h | --help show this help text and exit 51 | ``` 52 | 53 | ## See also 54 | 55 | - [linux_setup.sh on GitHub](https://github.com/rnnh/bioinfo-notebook/blob/master/scripts/linux_setup.sh) 56 | - [Conda](conda.md) 57 | - [Cloning and activating a conda environment](conda.md#cloning-and-activating-a-conda-environment) 58 | - [Using Ubuntu through a Virtual Machine](ubuntu_virtualbox.md) 59 | - [Windows Subsystem for Linux](wsl.md) 60 | -------------------------------------------------------------------------------- /docs/orthofinder.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: OrthoFinder 4 | parent: 2. 
Program guides 5 | --- 6 | 7 | # OrthoFinder 8 | 9 | OrthoFinder is [a program for phylogenetic orthology inference](https://davidemms.github.io/). 10 | It can be installed using the [orthofinder.yml](../envs/orthofinder.yml) virtual environment using [conda](conda.md). 11 | 12 | ## Running `OrthoFinder` to find orthologs between sets of FASTA amino acid sequences 13 | 14 | `OrthoFinder` can be used to find orthologs between sets of FASTA amino acid files as follows: 15 | 16 | ```bash 17 | $ orthofinder -t n -S diamond -f path/to/fasta/files/ 18 | ``` 19 | 20 | In this command... 21 | 22 | 1. **`-t`** sets the number of threads/processors to use (*n*). 23 | 2. **`-S`** is used to select the search tool OrthoFinder uses. Setting it to [`diamond` is far faster than the default BLAST method](https://github.com/davidemms/OrthoFinder/releases/tag/v2.2.7). 24 | 3. **`-f`** is used to select the directory of [FASTA amino acid sequences](file_formats.md#fasta) files you want to compare. 25 | 26 | OrthoFinder will create a `Results` directory (ending with the current month and day, e.g. `Results_Sep16/`) in the target directory specified with **`-f`**. 27 | This directory will contain summary statistics of orthologs found between the FASTA files, as well as putative gene duplication events, and phylogenetic trees of the detected orthogroups. 28 | 29 | ## See also 30 | 31 | - [conda](conda.md) 32 | - [File formats used in bioinformatics](file_formats.md) 33 | 34 | ## Further reading 35 | 36 | - [OrthoFinder tutorials](https://davidemms.github.io/menu/tutorials.html) 37 | -------------------------------------------------------------------------------- /docs/part1.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: 1. General guides 4 | nav_order: 2 5 | description: "These are general guides for installing Ubuntu, using the command line, and the types of files used in bioinformatics." 
6 | has_children: true 7 | has_toc: True 8 | --- 9 | 10 | # 1. General guides 11 | 12 | These are general guides for installing Ubuntu, using the command line, and the types of files used in bioinformatics. 13 | -------------------------------------------------------------------------------- /docs/part2.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: 2. Program guides 4 | nav_order: 3 5 | description: "These are guides to individual programs." 6 | has_children: true 7 | has_toc: True 8 | --- 9 | 10 | # 2. Program guides 11 | 12 | These are brief guides to individual programs. 13 | They are not comprehensive, but instead aim to introduce the essential features of each program. 14 | -------------------------------------------------------------------------------- /docs/part3.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: 3. Scripts 4 | nav_order: 4 5 | description: "These are scripts that use the programs and file formats discussed in this project." 6 | has_children: true 7 | has_toc: True 8 | --- 9 | 10 | # 3. Scripts 11 | 12 | These are scripts that use the programs and file formats discussed in this project. 13 | -------------------------------------------------------------------------------- /docs/report_an_issue.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Report an Issue 4 | nav_order: 4 5 | description: "Report an Issue" 6 | --- 7 | 8 | # Report an Issue 9 | 10 | [If there are any errors or mistakes, please let me know.](https://github.com/rnnh/bioinfo-notebook/issues) 11 | -------------------------------------------------------------------------------- /docs/samtools.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: SAMtools 4 | parent: 2. 
Program guides 5 | --- 6 | 7 | # SAMtools 8 | 9 | SAMtools is a set of utilities that can manipulate alignment formats. 10 | It imports from and exports to the [SAM](file_formats.md#sam), [BAM](file_formats.md#bam) & [CRAM](file_formats.md#cram); does sorting, merging & indexing; and allows reads in any region to be retrieved swiftly. 11 | 12 | ## Converting a `sam` alignment file to a sorted, indexed `bam` file using `samtools` 13 | 14 | Sequence Alignment Map (SAM/`.sam`) is a text-based file format for sequence alignments. 15 | Its binary equivalent is Binary Alignment Map (BAM/`.bam`), which stores the same data as a compressed binary file. 16 | A binary file for a sequence alignment is preferable over a text file, as binary files are faster to work with. 17 | A SAM alignment file (`example_alignment.sam`) can be converted to a BAM alignment using `samtools view`. 18 | 19 | ``` 20 | $ samtools view -@ n -Sb -o example_alignment.bam example_alignment.sam 21 | ``` 22 | 23 | In this command... 24 | 25 | 1. **`-@`** sets the number (*`n`*) of threads/CPUs to be used. This flag is optional and can be used with other `samtools` commands. 26 | 2. **`-Sb`** specifies that the input is in SAM format (`S`) and the output will be in BAM format (`b`). 27 | 3. **`-o`** sets the name of the output file (`example_alignment.bam`). 28 | 4. **`example_alignment.sam`** is the name of the input file. 29 | 30 | Now that the example alignment is in BAM format, we can sort it using `samtools sort`. 31 | Sorting this alignment will allow us to create an index. 32 | 33 | ``` 34 | $ samtools sort -O bam -o sorted_example_alignment.bam example_alignment.bam 35 | ``` 36 | 37 | In this command... 38 | 39 | 1. **`-O`** specifies the output format (`bam`, `sam`, or `cram`). 40 | 2. **`-o`** sets the name of the output file (`sorted_example_alignment.bam`). 41 | 3. **`example_alignment.bam`** is the name of the input file.
42 | 43 | This sorted BAM alignment file can now be indexed using `samtools index`. 44 | Indexing allows fast random access to this alignment, enabling the information in the alignment file to be processed faster. 45 | 46 | ``` 47 | $ samtools index sorted_example_alignment.bam 48 | ``` 49 | 50 | In this command... 51 | 52 | 1. **`sorted_example_alignment.bam`** is the name of the input file. 53 | 54 | ### Demonstration 1 55 | 56 | In this video, `samtools` is used to convert `example_alignment.sam` into a BAM file, sort that BAM file, and index it. 57 | 58 | [![asciicast](https://asciinema.org/a/U1Flwg3EljOfI1Sx77h8PvuNf.svg)](https://asciinema.org/a/U1Flwg3EljOfI1Sx77h8PvuNf?autoplay=1) 59 | 60 | ## Simulating short reads using `wgsim` 61 | 62 | `wgsim` is a SAMtools program that can simulate short sequencing reads from a reference genome. 63 | This is useful for creating FASTQ files to practice with. 64 | 65 | ``` 66 | $ wgsim example_nucleotide_sequence.fasta example_reads_1.fastq example_reads_2.fastq 67 | ``` 68 | 69 | In this command... 70 | 71 | 1. **`example_nucleotide_sequence.fasta`** is the reference genome input. 72 | 2. **`example_reads_1.fastq`** and **`example_reads_2.fastq`** are the names of the simulated read output files. 73 | 74 | ### Demonstration 2 75 | 76 | In this video, `wgsim` is used to simulate reads from `example_nucleotide_sequence.fasta`. 77 | 78 | [![asciicast](https://asciinema.org/a/m89gXtx4cKRnKpI6amWj3BEAH.svg)](https://asciinema.org/a/m89gXtx4cKRnKpI6amWj3BEAH?autoplay=1) 79 | 80 | ## Indexing a FASTA file using `samtools faidx` 81 | 82 | SAMtools can be used to index a FASTA file as follows... 83 | 84 | ```bash 85 | $ samtools faidx file.fasta 86 | ``` 87 | 88 | After running this command, `file.fasta` can now be used by [bcftools](bcftools.md). 
89 | 90 | ## See also 91 | 92 | - [Alignment formats](file_formats.md#alignment-formats) 93 | - The `samtools` manual: 94 | -------------------------------------------------------------------------------- /docs/sgRNAcas9.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: sgRNAcas9 4 | parent: 2. Program guides 5 | --- 6 | 7 | # sgRNAcas9 8 | 9 | sgRNAcas9 is [a package for designing CRISPR sgRNA and evaluating potential off-target cleavage sites](https://doi.org/10.1371/journal.pone.0100448). 10 | 11 | ## Running sgRNAcas9 12 | 13 | 1. Install the [conda](conda.md) virtual environment for [sgRNAcas9](../envs/sgRNAcas9.yml). 14 | 2. Download [the GUI version of sgRNAcas9 from SourceForge](https://sourceforge.net/projects/sgrnacas9/). 15 | 3. Activate the sgRNAcas9 virtual environment. 16 | 4. In the directory for sgRNAcas9, run the following command to launch the sgRNAcas9 graphical user interface (GUI): 17 | 18 | ```bash 19 | (sgRNAcas9) ~/sgRNAcas9_V3.0_GUI$ java -jar sgRNAcas9.jar 20 | ``` 21 | 22 | ## Using sgRNAcas9 23 | 24 | In the sgRNAcas9 GUI... 25 | 26 | - Select the [FASTA nucleic acid](file_formats.md#fasta) file of the target sequences in the "Target sequences(FASTA):" dialog box. 27 | - Select the [FASTA nucleic acid](file_formats.md#fasta) file of the genome you want to design the guide RNAs for in the "Genome sequence(FASTA):" dialog box. 28 | - Click "RUN" to run the program. 29 | 30 | sgRNAcas9 will create a `report` directory in the current working directory. 31 | This directory contains its results. 32 | The most important file in this directory is `sgRNAcas9_report.xls`. 33 | This Excel file contains reported guide RNA sequences for CRISPR with quality scores, and counts of potential off-target sites. 
34 | 35 | ## References 36 | 37 | - [sgRNAcas9 paper](https://doi.org/10.1371/journal.pone.0100448) 38 | - [sgRNAcas9 website](http://biootools.com/software.html) 39 | - [sgRNAcas9 on SourceForge](https://sourceforge.net/projects/sgrnacas9/) 40 | -------------------------------------------------------------------------------- /docs/snp_calling.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: SNP calling script 4 | parent: 3. Scripts 5 | --- 6 | 7 | # SNP calling script 8 | 9 | [snp_calling.sh](../scripts/snp_calling.sh) is a `bash` shell script that downloads [FASTQ](file_formats.md) sequencing reads using [fastq-dump](fastq-dump.md), aligns them to a genome using [bowtie2](bowtie2.md), and writes variants (SNPs and indels) to a variant call format (VCF) file. 10 | 11 | ## Usage 12 | 13 | ``` 14 | snp_calling.sh [-h|--help] [-1|--one -2|--two -r|--reference] 15 | [-d|--demo] [-o|--output -l|--log -p|--processors n] 16 | 17 | This script aligns sequencing reads to a reference genome, and finds genetic 18 | variants (SNPs/indels) based on this alignment, which are written to a variant 19 | call format (VCF) file. 20 | 21 | Calling this script with the argument '-d' or '--demo' will run this script 22 | using Saccharomyces cerevisiae FASTQ sequencing reads and a Saccharomyces 23 | cerevisiae reference genome, which will be downloaded from NCBI. 24 | 25 | This script should be called from the 'bioinfo-notebook/' directory.The 26 | programs required for this script are in the 'bioinfo-notebook' conda 27 | environment (bioinfo-notebook/envs/bioinfo-notebook.yml or 28 | bioinfo-notebook/envs/bioinfo-notebook.txt). 29 | If the input files are not in the 'bioinfo-notebook/data/' directory, the full 
31 | 32 | 33 | arguments: 34 | -h | --help show this help text and exit 35 | -1 | --one forward reads to align with reference sequence 36 | (FASTQ: .fastq or .fastq.gz) 37 | -2 | --two reverse reads to align with reference sequence 38 | (FASTQ: .fastq or .fastq.gz) 39 | -r | --reference reference sequence to align reads against 40 | (FASTA nucleotide file: .fna) 41 | -d | --demo run the script with demonstration inputs 42 | 43 | optional arguments: 44 | -o | --output optional: name of final output file 45 | (default: 'reference_seq_vs_reads_var.vcf', or 46 | 'S_cere_DRR237290_var.vcf' if demo is used). 47 | -l | --log redirect terminal output to a log file in the 48 | directory bioinfo-notebook/results/ 49 | -p | --processors optional: set the number (n) of processors to 50 | use (default: 1) 51 | ``` 52 | 53 | ## See also 54 | 55 | - [snp_calling.sh on GitHub](https://github.com/rnnh/bioinfo-notebook/blob/master/scripts/snp_calling.sh) 56 | - [File formats used in bioinformatics](file_formats.md) 57 | - [samtools](samtools.md) 58 | - [fastq-dump](fastq-dump.md) 59 | - [bowtie2](bowtie2.md) 60 | -------------------------------------------------------------------------------- /docs/ubuntu_virtualbox.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Using Ubuntu through a Virtual Machine 4 | parent: 1. General guides 5 | nav_order: 3 6 | --- 7 | 8 | 9 | # Using Ubuntu through a Virtual Machine 10 | 11 | *Ubuntu* is a Linux operating system that is widely used for bioinformatics. 12 | If you have not used a Linux system before, an Ubuntu virtual machine is an ideal way to try the programs documented on this website. 13 | 14 | A *virtual machine* is a computer file, typically called an image, that behaves like an actual computer. 15 | It acts as a computer within a computer. 16 | Virtual machines run in a window, much like any other program running in a window on your computer. 
17 | The virtual machine is sequestered from the rest of the system, meaning that the software inside a virtual machine can not tamper with the computer itself. 18 | This produces an ideal environment for testing other operating systems, and running software or applications on operating systems they were not originally intended for. 19 | 20 | An Ubuntu virtual machine can be created using *VirtualBox*, and an Ubuntu *disk image*. 21 | VirtualBox is a program that can be used to create, manage, and access virtual machines. 22 | A disk image is a file that acts like a compact disc, or another storage device. 23 | VirtualBox and the Ubuntu disk image are freely available online. 24 | 25 | ## Contents 26 | 27 | - [Files required to set up an Ubuntu virtual machine](#files-required-to-set-up-an-ubuntu-virtual-machine) 28 | - [Direct links to download required files](#direct-links-to-download-required-files) 29 | - [How to create an Ubuntu virtual machine using VirtualBox](#how-to-create-an-ubuntu-virtual-machine-using-virtualbox) 30 | - [Increasing the screen resolution of the Ubuntu virtual machine](#increasing-the-screen-resolution-of-the-ubuntu-virtual-machine) 31 | - [See also](#see-also) 32 | - [References](#references) 33 | 34 | ## Files required to set up an Ubuntu virtual machine 35 | 36 | To set up an Ubuntu virtual machine, you will need an Ubuntu disk image, a file to install VirtualBox, and the VirtualBox Extension Package. 37 | This requires around 13 GB of free hard drive space on your computer in total. 38 | The Ubuntu disk image is around 2 GB in size, and may take a while to download depending on your internet connection. 39 | The file required to install VirtualBox is around 108 or 123 MB in size, depending on the platform of your computer (i.e. Windows or Mac). 40 | 41 | ### Direct links to download required files 42 | 43 | 1. 
[The Ubuntu disk image (filename: `ubuntu-18.04.4-desktop-amd64.iso`)](http://releases.ubuntu.com/18.04.4/ubuntu-18.04.4-desktop-amd64.iso) 44 | 2. [VirtualBox installer for Windows](https://download.virtualbox.org/virtualbox/6.1.4/VirtualBox-6.1.4-136177-Win.exe) 45 | 3. [VirtualBox installer for Mac](https://download.virtualbox.org/virtualbox/6.1.4/VirtualBox-6.1.4-136177-OSX.dmg) 46 | 4. [VirtualBox Extension Pack (all platforms)](https://download.virtualbox.org/virtualbox/6.1.4/Oracle_VM_VirtualBox_Extension_Pack-6.1.4.vbox-extpack) 47 | 48 | If the above links do not work, they may have expired. 49 | In this case, the above files can be found on the [VirtualBox website](https://www.virtualbox.org/wiki/Downloads) and the [Ubuntu website](https://ubuntu.com/download/desktop). 50 | 51 | ## How to create an Ubuntu virtual machine using VirtualBox 52 | 53 | 1. Download the [VirtualBox installer](#direct-links-to-download-required-files) for your computer (either Windows or Mac). 54 | 2. Once the VirtualBox installer is downloaded, open it and follow the on-screen instructions to install the VirtualBox program. 55 | 3. **Windows only:** If you get a "Windows Security" prompt asking *"Would you like to install this device software?"* for driver software from *"Publisher: Oracle Corporation"*, select "Install". 56 | 4. **Mac only:** If you get a *"This package will run a program to determine if the software can be installed"* prompt while installing VirtualBox, select "Continue". You may also be asked to enter your user password while installing VirtualBox on a Mac. 57 | 5. Once installed, open the VirtualBox program. 58 | 6. In VirtualBox, click on "New" (the blue badge). This will open a menu to create a new virtual machine. 59 | 7. In the "Name" field of the "Name and operating system" window, type "ubuntu". VirtualBox will automatically set the type and version for this virtual machine as "Linux" and "Ubuntu". 60 | 8. 
Select "Next" to proceed to the "Memory size" section. 61 | 9. In this section, you can set the amount of Random Access Memory (RAM) that the virtual machine can use. A suggested amount of RAM will automatically be selected when you get to this page, but you can increase the amount of RAM allocated using the slider on this page. 62 | 10. **Note:** If you use the slider to increase the amount of RAM allocated on the "Memory Size" page, keep the slider in the green zone. Setting the slider in the orange or red zone (>50% of your computer's available RAM) will negatively affect the performance of the virtual machine. 63 | 11. Select "Next" to proceed to the "Hard disk" page. 64 | 12. Select "Create a virtual hard disk now", and then select "Create". 65 | 13. On the "Hard disk file type" page, select "VDI (VirtualBox Disk Image)", and then select "Next". 66 | 14. On the "Store on physical hard disk", select "Dynamically allocated", and then select "Next" to proceed to the "File location and size" page. 67 | 15. On this page, you can change the location and size of the virtual hard disk. There is no need to adjust the size of the virtual hard disk, but take note of its location (the folder/directory it will be created in). Select "Create". 68 | 16. In the left side of the VirtualBox main menu, double-click the name of the virtual machine you just created ("ubuntu"). 69 | 17. This will bring up the "Select start-up disk" window. In this window, select the folder icon to open the "Optical Disk Selector" menu. 70 | 18. In this menu, select "Add", which will open a window titled "Please choose a virtual optical disk file". 71 | 19. In this window, go to the folder into which the Ubuntu disk image downloaded (e.g. "Downloads"), and click the [Ubuntu disk image (filename: `ubuntu-18.04.4-desktop-amd64.iso`)](#direct-links-to-download-required-files) to select it, and then select "Open". 72 | 20. This will bring you back to the "Optical Disk Selector" window. 
Select the Ubuntu disk image you selected in the previous window, and click on "Choose". 73 | 21. This will bring you back to the "Select start-up disk" window. The Ubuntu disk image should be selected in the drop down menu (this read "Empty" before the Ubuntu disk image was added). Select "Start" to start the virtual machine. 74 | 22. The Ubuntu virtual machine is now running in its own window. It may take a few minutes to start up the first time. 75 | 23. On the "Welcome" screen in Ubuntu, select "Install Ubuntu". 76 | 24. In the "Keyboard layout" section, select your keyboard layout, and then select "Continue". This will bring you to the "Updates and other software" window. 77 | 25. In this window, in the section "What apps would you like to install to start with?", select "Minimal installation". 78 | 26. In the "Other options" section, select "Download updates while installing Ubuntu", and leave "Install third-party software..." unselected. Select "Continue" to proceed to the "Installation type" window. 79 | 27. In this window, select "Erase disk and install Ubuntu". As this is a virtual machine, in this instance "disk" refers to the virtual disk image (`.vdi`) file created earlier (see steps 12 to 15). Select "Install now". 80 | 28. A window titled "Write the changes to disks?" will appear. In this window, select "Continue". 81 | 29. This will bring you to the "Where are you?" window. In this window, enter your location (which is needed to set the system clock) and select "Continue". 82 | 30. Fill in the requested details in the "Who are you?" window: your name, your computer's name, your username (both of which will be filled in automatically when you enter your name), and your password. Make sure you remember your password, you will need it to install programs in your Ubuntu virtual machine. Select "Continue" to proceed. 83 | 31. At this point, Ubuntu will begin installing on the virtual disk image created earlier (the `.vdi` file). 
This will take a few minutes. 84 | 32. Once the installation is complete, select "Restart Now" from the "Installation complete" dialog window. 85 | 33. When asked "Please remove the installation media and press ENTER", press Enter (a.k.a. Return). 86 | 34. The virtual machine will then restart, and the Ubuntu login page will load. On this page, select the user you created during the installation, and enter your password to log in. 87 | 35. Once you have logged in, you have finished setting up your Ubuntu virtual machine. Click through the "What's new in Ubuntu" window for a brief introduction to Ubuntu. 88 | 36. When you want to close your Ubuntu virtual machine, close the window it is running in to bring up the "Close Virtual Machine" window, select "Power off the machine" and click "OK". This is the equivalent of shutting down the machine. Alternatively, you can select "Power off" within the Ubuntu virtual machine. 89 | 90 | Once you have finished installing the Ubuntu virtual machine, you can delete the Ubuntu disk image (filename: `ubuntu-18.04.4-desktop-amd64.iso`), and the VirtualBox installer. 91 | 92 | ## Increasing the screen resolution of the Ubuntu virtual machine 93 | 94 | At this point, the Ubuntu virtual machine takes up only a small portion of the VirtualBox window it runs in. 95 | To increase the screen resolution of the Ubuntu virtual machine, you will need to download the [VirtualBox Extension Package](#direct-links-to-download-required-files) and follow the steps below. 96 | 97 | 1. Once downloaded, double click the VirtualBox Extension Pack (file extension `.vbox-extpack`). If you have installed the VirtualBox program, it will open this file. 98 | 2. VirtualBox will open with a window notifying that an extension pack is about to be installed. In this window, select "Install" to proceed with the extension pack installation. 99 | 3. 
Scroll to the bottom of the Terms and Conditions window that opens, and select "I Agree" to install the extension pack. 100 | 4. Open the Ubuntu virtual machine in VirtualBox. 101 | 5. In the menu bar of the VirtualBox window in which Ubuntu is running, select the "Devices" menu, and select "Insert Guest Additions CD image...". 102 | 6. A notification will appear in the Ubuntu virtual machine: '"VBox_GAs_6.1.4" contains software intended to be automatically started. Would you like to run it?". In this window, select "Run", and enter your Ubuntu password to install the VirtualBox Guest Additions on the Ubuntu virtual machine. 103 | 7. A terminal window will open showing the VirtualBox Guest Additions installation progress. Once the installation has finished, press Return (Enter) to close this window. 104 | 8. Close the Ubuntu virtual machine by closing the window it is running in, and selecting "Power off the machine" from the "Close Virtual Machine" window. 105 | 9. Open the Ubuntu virtual machine in VirtualBox. 106 | 10. In the menu bar of the window in which Ubuntu is running, select the "View" menu, and confirm that "Auto-resize Guest Display" is enabled. 107 | 108 | ## See also 109 | 110 | - [Introduction to the command line](cl_intro.md) 111 | - [Windows Subsystem for Linux](wsl.md) 112 | - [The Ubuntu Website](https://ubuntu.com/) 113 | - [The VirtualBox Website](https://www.virtualbox.org/) 114 | 115 | ## References 116 | 117 | - [What is a Virtual Machine?](https://azure.microsoft.com/en-us/overview/what-is-a-virtual-machine/) 118 | - [How to Install Ubuntu on VirtualBox](https://www.wikihow.com/Install-Ubuntu-on-VirtualBox) 119 | -------------------------------------------------------------------------------- /docs/wsl.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Windows Subsystem for Linux 4 | parent: 1. 
General guides 5 | nav_order: 2 6 | --- 7 | 8 | # Windows Subsystem for Linux 9 | 10 | Windows Subsystem for Linux (WSL) is a feature of Windows 10. 11 | When enabled, WSL allows Linux systems (e.g. Ubuntu) to be used as Windows applications. 12 | These Linux systems can be downloaded directly from the Microsoft Store. 13 | The bioinfo-notebook [conda](conda.md) environment can be installed in an Ubuntu system running using WSL. 14 | 15 | ## Installing Ubuntu on Windows 10 using WSL 16 | 17 | Before you begin, make sure you have around 1.20 GB of free disk space. 18 | 19 | ### Enable WSL 20 | 21 | *Note:* Enabling the WSL feature will take a few minutes, and you will need to restart your computer for it to take effect. 22 | 23 | 1. In the search box on the taskbar, type "control panel", and then select Control Panel. 24 | 2. In the Control Panel, select "Programs". 25 | 3. Under Programs and Features, select "Turn Windows features on or off". 26 | 4. If asked "Do you want this app to make changes to your device?", select "Yes". 27 | 5. From the list of Windows features, tick the box next to "Windows Subsystem for Linux" to enable WSL, and click OK. 28 | 29 | ### Download Ubuntu from the Microsoft Store 30 | 31 | 1. In the search box on the taskbar, type "microsoft store", and select Microsoft Store. 32 | 2. In the Microsoft Store, search for "Ubuntu". 33 | 3. Select the Ubuntu app. 34 | 4. On the app page, select "Get" to download Ubuntu. 35 | 5. If asked to sign in with a Microsoft account, select "No, thanks". 36 | 37 | After enabling WSL and downloading Ubuntu from the Microsoft Store, Ubuntu can be used like a regular Windows application. 38 | 39 | ### Running Ubuntu for the first time 40 | 41 | 1. In the search box on the taskbar, type "Ubuntu", and select the Ubuntu app to launch it. It will take a few minutes to install the first time it runs. 42 | 2. When prompted, enter a UNIX username- this does not need to be the same as your Windows account name. 
43 | 3. You will need to set a UNIX password. This is only used for the Ubuntu app, it does not need to be the same as your Windows password. Make sure you remember your UNIX password, as you will need it for installing new programs in Ubuntu. 44 | 45 | Once your UNIX password has been updated successfully, you will see the `bash` command prompt in the Ubuntu window: 46 | 47 | ``` 48 | (Your UNIX username)@(Your computer's alias):~$ _ 49 | ``` 50 | 51 | In this command prompt, the tilde character (`~`) indicates that you are currently in your home directory. 52 | The dollar sign (`$`) indicates that this command line uses the `bash` shell language. 53 | 54 | ## See also 55 | 56 | - [Introduction to the command line](cl_intro.md) 57 | - [Using Ubuntu through a Virtual Machine](ubuntu_virtualbox.md) 58 | - [conda](conda.md) 59 | -------------------------------------------------------------------------------- /envs/augustus.yml: -------------------------------------------------------------------------------- 1 | name: augustus 2 | channels: 3 | - bioconda 4 | - conda-forge 5 | - cf-staging 6 | - defaults 7 | dependencies: 8 | - _libgcc_mutex=0.1=conda_forge 9 | - _openmp_mutex=4.5=0_gnu 10 | - augustus=3.3.3=pl526hce533f5_0 11 | - biopython=1.77=py38h1e0a361_0 12 | - boost=1.70.0=py38h9de70de_1 13 | - boost-cpp=1.70.0=h7b93d67_3 14 | - bzip2=1.0.8=h516909a_2 15 | - ca-certificates=2020.6.20=hecda079_0 16 | - certifi=2020.6.20=py38h32f6830_0 17 | - curl=7.71.1=he644dc0_0 18 | - gsl=2.5=h294904e_1 19 | - htslib=1.9=h4da6232_3 20 | - icu=67.1=he1b5a44_0 21 | - krb5=1.17.1=hfafb76e_1 22 | - ld_impl_linux-64=2.34=h53a641e_5 23 | - libblas=3.8.0=11_openblas 24 | - libcblas=3.8.0=11_openblas 25 | - libcurl=7.71.1=hcdd3856_0 26 | - libdeflate=1.2=h516909a_1 27 | - libedit=3.1.20191231=h46ee950_1 28 | - libffi=3.2.1=he1b5a44_1007 29 | - libgcc-ng=9.2.0=h24d8f2e_2 30 | - libgfortran-ng=7.5.0=hdf63c60_9 31 | - libgomp=9.2.0=h24d8f2e_2 32 | - liblapack=3.8.0=11_openblas 33 | - 
libopenblas=0.3.6=h6e990d7_6 34 | - libssh2=1.9.0=hab1572f_3 35 | - libstdcxx-ng=9.2.0=hdf63c60_2 36 | - lp_solve=5.5.2.5=h14c3975_1001 37 | - lz4-c=1.9.2=he1b5a44_1 38 | - metis=5.1.0=he1b5a44_1005 39 | - ncurses=6.1=hf484d3e_1002 40 | - numpy=1.18.5=py38h8854b6b_0 41 | - openblas=0.3.6=h6e990d7_6 42 | - openssl=1.1.1g=h516909a_0 43 | - perl=5.26.2=h516909a_1006 44 | - perl-apache-test=1.40=pl526_1 45 | - perl-app-cpanminus=1.7044=pl526_1 46 | - perl-base=2.23=pl526_1 47 | - perl-carp=1.38=pl526_3 48 | - perl-class-load=0.25=pl526_0 49 | - perl-class-load-xs=0.10=pl526h6bb024c_2 50 | - perl-class-method-modifiers=2.12=pl526_0 51 | - perl-constant=1.33=pl526_1 52 | - perl-cpan-meta=2.150010=pl526_0 53 | - perl-cpan-meta-requirements=2.140=pl526_0 54 | - perl-cpan-meta-yaml=0.018=pl526_0 55 | - perl-data-dumper=2.173=pl526_0 56 | - perl-data-optlist=0.110=pl526_2 57 | - perl-dbi=1.642=pl526_0 58 | - perl-devel-globaldestruction=0.14=pl526_0 59 | - perl-devel-overloadinfo=0.005=pl526_0 60 | - perl-devel-stacktrace=2.04=pl526_0 61 | - perl-dist-checkconflicts=0.11=pl526_2 62 | - perl-encode=2.88=pl526_1 63 | - perl-eval-closure=0.14=pl526h6bb024c_4 64 | - perl-exporter=5.72=pl526_1 65 | - perl-extutils-cbuilder=0.280230=pl526_1 66 | - perl-extutils-makemaker=7.36=pl526_1 67 | - perl-extutils-manifest=1.72=pl526_0 68 | - perl-extutils-parsexs=3.35=pl526_0 69 | - perl-file-path=2.16=pl526_0 70 | - perl-file-temp=0.2304=pl526_2 71 | - perl-file-which=1.23=pl526_0 72 | - perl-getopt-long=2.50=pl526_1 73 | - perl-ipc-cmd=1.02=pl526_0 74 | - perl-json-pp=4.04=pl526_0 75 | - perl-locale-maketext-simple=0.21=pl526_2 76 | - perl-module-build=0.4224=pl526_3 77 | - perl-module-corelist=5.20190524=pl526_0 78 | - perl-module-implementation=0.09=pl526_2 79 | - perl-module-load=0.32=pl526_1 80 | - perl-module-load-conditional=0.68=pl526_2 81 | - perl-module-metadata=1.000036=pl526_0 82 | - perl-module-runtime=0.016=pl526_1 83 | - perl-module-runtime-conflicts=0.003=pl526_0 84 | - 
perl-moo=2.003004=pl526_0 85 | - perl-moose=2.2011=pl526hf484d3e_1 86 | - perl-mro-compat=0.13=pl526_0 87 | - perl-package-deprecationmanager=0.17=pl526_0 88 | - perl-package-stash=0.38=pl526hf484d3e_1 89 | - perl-package-stash-xs=0.28=pl526hf484d3e_1 90 | - perl-parallel-forkmanager=2.02=pl526_0 91 | - perl-params-check=0.38=pl526_1 92 | - perl-params-util=1.07=pl526h6bb024c_4 93 | - perl-parent=0.236=pl526_1 94 | - perl-pathtools=3.75=pl526h14c3975_1 95 | - perl-perl-ostype=1.010=pl526_1 96 | - perl-role-tiny=2.000008=pl526_0 97 | - perl-scalar-list-utils=1.52=pl526h516909a_0 98 | - perl-storable=3.15=pl526h14c3975_0 99 | - perl-sub-exporter=0.987=pl526_2 100 | - perl-sub-exporter-progressive=0.001013=pl526_0 101 | - perl-sub-identify=0.14=pl526h14c3975_0 102 | - perl-sub-install=0.928=pl526_2 103 | - perl-sub-name=0.21=pl526_1 104 | - perl-sub-quote=2.006003=pl526_1 105 | - perl-text-abbrev=1.02=pl526_0 106 | - perl-text-parsewords=3.30=pl526_0 107 | - perl-try-tiny=0.30=pl526_1 108 | - perl-version=0.9924=pl526_0 109 | - perl-xsloader=0.24=pl526_0 110 | - perl-yaml=1.29=pl526_0 111 | - pip=20.1.1=py_1 112 | - python=3.8.3=cpython_he5300dc_0 113 | - python_abi=3.8=1_cp38 114 | - readline=8.0=h46ee950_1 115 | - setuptools=49.1.0=py38h32f6830_0 116 | - sqlite=3.32.3=hcee41ef_1 117 | - suitesparse=4.5.6=h717dc36_1204 118 | - tbb=2020.1=hc9558a2_0 119 | - tk=8.6.10=hed695b0_0 120 | - wheel=0.34.2=py_1 121 | - xz=5.2.5=h516909a_1 122 | - zlib=1.2.11=h516909a_1006 123 | - zstd=1.4.4=h6597ccf_3 124 | -------------------------------------------------------------------------------- /envs/bioinfo-notebook.txt: -------------------------------------------------------------------------------- 1 | # This file may be used to create an environment using: 2 | # $ conda create --name --file 3 | # platform: linux-64 4 | @EXPLICIT 5 | https://repo.anaconda.com/pkgs/main/linux-64/_libgcc_mutex-0.1-main.conda 6 | https://repo.anaconda.com/pkgs/main/linux-64/blas-1.0-mkl.conda 7 | 
https://repo.anaconda.com/pkgs/main/linux-64/ca-certificates-2020.10.14-0.conda 8 | https://repo.anaconda.com/pkgs/main/linux-64/intel-openmp-2020.1-217.conda 9 | https://repo.anaconda.com/pkgs/main/linux-64/jpeg-9b-h024ee3a_2.conda 10 | https://repo.anaconda.com/pkgs/main/linux-64/ld_impl_linux-64-2.33.1-h53a641e_7.conda 11 | https://repo.anaconda.com/pkgs/main/linux-64/libgcc-ng-9.1.0-hdf63c60_0.conda 12 | https://repo.anaconda.com/pkgs/main/linux-64/libgfortran-ng-7.3.0-hdf63c60_0.conda 13 | https://repo.anaconda.com/pkgs/main/linux-64/libstdcxx-ng-9.1.0-hdf63c60_0.conda 14 | https://repo.anaconda.com/pkgs/main/linux-64/bzip2-1.0.8-h7b6447c_0.conda 15 | https://repo.anaconda.com/pkgs/main/linux-64/expat-2.2.9-he6710b0_2.conda 16 | https://repo.anaconda.com/pkgs/main/linux-64/icu-58.2-he6710b0_3.conda 17 | https://conda.anaconda.org/bioconda/linux-64/libdeflate-1.0-h14c3975_1.tar.bz2 18 | https://repo.anaconda.com/pkgs/main/linux-64/libffi-3.3-he6710b0_2.conda 19 | https://repo.anaconda.com/pkgs/main/linux-64/libiconv-1.15-h63c8f33_5.conda 20 | https://repo.anaconda.com/pkgs/main/linux-64/libuuid-1.0.3-h1bed415_2.conda 21 | https://repo.anaconda.com/pkgs/main/linux-64/libxcb-1.14-h7b6447c_0.conda 22 | https://repo.anaconda.com/pkgs/main/linux-64/lz4-c-1.9.2-he6710b0_1.conda 23 | https://repo.anaconda.com/pkgs/main/linux-64/lzo-2.10-h7b6447c_2.conda 24 | https://repo.anaconda.com/pkgs/main/linux-64/mkl-2020.1-217.conda 25 | https://repo.anaconda.com/pkgs/main/linux-64/ncurses-6.2-he6710b0_1.conda 26 | https://repo.anaconda.com/pkgs/main/linux-64/openssl-1.1.1h-h7b6447c_0.conda 27 | https://repo.anaconda.com/pkgs/main/linux-64/pcre-8.44-he6710b0_0.conda 28 | https://repo.anaconda.com/pkgs/main/linux-64/perl-5.26.2-h14c3975_0.conda 29 | https://repo.anaconda.com/pkgs/main/linux-64/snappy-1.1.8-he6710b0_0.conda 30 | https://repo.anaconda.com/pkgs/main/linux-64/tbb-2020.0-hfd86e86_0.conda 31 | https://repo.anaconda.com/pkgs/main/linux-64/xz-5.2.5-h7b6447c_0.conda 32 | 
https://repo.anaconda.com/pkgs/main/linux-64/zlib-1.2.11-h7b6447c_3.conda 33 | https://repo.anaconda.com/pkgs/main/linux-64/blosc-1.19.0-hd408876_0.conda 34 | https://repo.anaconda.com/pkgs/main/linux-64/glib-2.65.0-h3eb4bd4_0.conda 35 | https://repo.anaconda.com/pkgs/main/linux-64/gstreamer-1.14.0-hb31296c_0.conda 36 | https://repo.anaconda.com/pkgs/main/linux-64/hdf5-1.10.4-hb1b8bf9_0.conda 37 | https://repo.anaconda.com/pkgs/main/linux-64/libedit-3.1.20191231-h14c3975_1.conda 38 | https://repo.anaconda.com/pkgs/main/linux-64/libpng-1.6.37-hbc83047_0.conda 39 | https://repo.anaconda.com/pkgs/main/linux-64/libssh2-1.9.0-h1ba5d50_1.conda 40 | https://repo.anaconda.com/pkgs/main/linux-64/libxml2-2.9.10-he19cac6_1.conda 41 | https://conda.anaconda.org/bioconda/linux-64/perl-app-cpanminus-1.7044-pl526_1.tar.bz2 42 | https://conda.anaconda.org/bioconda/linux-64/perl-base-2.23-pl526_1.tar.bz2 43 | https://conda.anaconda.org/bioconda/linux-64/perl-common-sense-3.74-pl526_2.tar.bz2 44 | https://conda.anaconda.org/bioconda/linux-64/perl-compress-raw-bzip2-2.087-pl526he1b5a44_0.tar.bz2 45 | https://conda.anaconda.org/bioconda/linux-64/perl-compress-raw-zlib-2.087-pl526hc9558a2_0.tar.bz2 46 | https://conda.anaconda.org/bioconda/linux-64/perl-constant-1.33-pl526_1.tar.bz2 47 | https://conda.anaconda.org/bioconda/linux-64/perl-data-dumper-2.173-pl526_0.tar.bz2 48 | https://conda.anaconda.org/bioconda/linux-64/perl-digest-hmac-1.03-pl526_3.tar.bz2 49 | https://conda.anaconda.org/bioconda/linux-64/perl-digest-md5-2.55-pl526_0.tar.bz2 50 | https://conda.anaconda.org/bioconda/linux-64/perl-exporter-5.72-pl526_1.tar.bz2 51 | https://conda.anaconda.org/bioconda/linux-64/perl-exporter-tiny-1.002001-pl526_0.tar.bz2 52 | https://conda.anaconda.org/bioconda/linux-64/perl-extutils-makemaker-7.36-pl526_1.tar.bz2 53 | https://conda.anaconda.org/bioconda/linux-64/perl-html-tagset-3.20-pl526_3.tar.bz2 54 | https://conda.anaconda.org/bioconda/linux-64/perl-io-html-1.001-pl526_2.tar.bz2 55 | 
https://conda.anaconda.org/bioconda/linux-64/perl-io-zlib-1.10-pl526_2.tar.bz2 56 | https://conda.anaconda.org/bioconda/linux-64/perl-mozilla-ca-20180117-pl526_1.tar.bz2 57 | https://conda.anaconda.org/bioconda/linux-64/perl-parent-0.236-pl526_1.tar.bz2 58 | https://conda.anaconda.org/bioconda/linux-64/perl-scalar-list-utils-1.52-pl526h516909a_0.tar.bz2 59 | https://conda.anaconda.org/bioconda/linux-64/perl-socket-2.027-pl526_1.tar.bz2 60 | https://conda.anaconda.org/bioconda/linux-64/perl-try-tiny-0.30-pl526_1.tar.bz2 61 | https://conda.anaconda.org/bioconda/linux-64/perl-xml-parser-2.44-pl526h4e0c4b3_7.tar.bz2 62 | https://conda.anaconda.org/bioconda/linux-64/perl-xml-sax-base-1.09-pl526_0.tar.bz2 63 | https://conda.anaconda.org/bioconda/linux-64/perl-xsloader-0.24-pl526_0.tar.bz2 64 | https://repo.anaconda.com/pkgs/main/linux-64/readline-8.0-h7b6447c_0.conda 65 | https://conda.anaconda.org/bioconda/linux-64/subread-2.0.0-hed695b0_0.tar.bz2 66 | https://repo.anaconda.com/pkgs/main/linux-64/tk-8.6.10-hbc83047_0.conda 67 | https://repo.anaconda.com/pkgs/main/linux-64/zstd-1.4.5-h9ceee32_0.conda 68 | https://repo.anaconda.com/pkgs/main/linux-64/dbus-1.13.16-hb2f20db_0.conda 69 | https://repo.anaconda.com/pkgs/main/linux-64/freetype-2.10.2-h5ab3b9f_0.conda 70 | https://repo.anaconda.com/pkgs/main/linux-64/gst-plugins-base-1.14.0-hbbd80ab_1.conda 71 | https://repo.anaconda.com/pkgs/main/linux-64/krb5-1.18.2-h173b8e3_0.conda 72 | https://conda.anaconda.org/bioconda/linux-64/ncbi-ngs-sdk-2.10.0-hdf6179e_0.tar.bz2 73 | https://conda.anaconda.org/bioconda/linux-64/perl-carp-1.38-pl526_3.tar.bz2 74 | https://conda.anaconda.org/bioconda/linux-64/perl-encode-2.88-pl526_1.tar.bz2 75 | https://conda.anaconda.org/bioconda/linux-64/perl-file-path-2.16-pl526_0.tar.bz2 76 | https://conda.anaconda.org/bioconda/linux-64/perl-html-parser-3.72-pl526h6bb024c_5.tar.bz2 77 | https://conda.anaconda.org/bioconda/linux-64/perl-io-compress-2.087-pl526he1b5a44_0.tar.bz2 78 | 
https://conda.anaconda.org/bioconda/linux-64/perl-list-moreutils-xs-0.428-pl526_0.tar.bz2 79 | https://conda.anaconda.org/bioconda/linux-64/perl-mime-base64-3.15-pl526_1.tar.bz2 80 | https://conda.anaconda.org/bioconda/linux-64/perl-ntlm-1.09-pl526_4.tar.bz2 81 | https://conda.anaconda.org/bioconda/linux-64/perl-storable-3.15-pl526h14c3975_0.tar.bz2 82 | https://conda.anaconda.org/bioconda/linux-64/perl-test-requiresinternet-0.05-pl526_0.tar.bz2 83 | https://conda.anaconda.org/bioconda/linux-64/perl-types-serialiser-1.0-pl526_2.tar.bz2 84 | https://conda.anaconda.org/bioconda/linux-64/perl-xml-namespacesupport-1.12-pl526_0.tar.bz2 85 | https://repo.anaconda.com/pkgs/main/linux-64/sqlite-3.32.3-h62c20be_0.conda 86 | https://repo.anaconda.com/pkgs/main/linux-64/fontconfig-2.13.0-h9420a91_0.conda 87 | https://repo.anaconda.com/pkgs/main/linux-64/libcurl-7.71.1-h20c2e04_1.conda 88 | https://conda.anaconda.org/bioconda/linux-64/perl-business-isbn-data-20140910.003-pl526_0.tar.bz2 89 | https://conda.anaconda.org/bioconda/linux-64/perl-encode-locale-1.05-pl526_6.tar.bz2 90 | https://conda.anaconda.org/bioconda/linux-64/perl-file-temp-0.2304-pl526_2.tar.bz2 91 | https://conda.anaconda.org/bioconda/linux-64/perl-html-tree-5.07-pl526_1.tar.bz2 92 | https://conda.anaconda.org/bioconda/linux-64/perl-json-xs-2.34-pl526h6bb024c_3.tar.bz2 93 | https://conda.anaconda.org/bioconda/linux-64/perl-list-moreutils-0.428-pl526_1.tar.bz2 94 | https://conda.anaconda.org/bioconda/linux-64/perl-lwp-mediatypes-6.04-pl526_0.tar.bz2 95 | https://conda.anaconda.org/bioconda/linux-64/perl-net-ssleay-1.88-pl526h90d6eec_0.tar.bz2 96 | https://conda.anaconda.org/bioconda/linux-64/perl-pathtools-3.75-pl526h14c3975_1.tar.bz2 97 | https://conda.anaconda.org/bioconda/linux-64/perl-time-local-1.28-pl526_1.tar.bz2 98 | https://repo.anaconda.com/pkgs/main/linux-64/python-3.7.7-hcff3b4d_5.conda 99 | https://conda.anaconda.org/conda-forge/linux-64/asciinema-2.0.2-py37_1000.tar.bz2 100 | 
https://conda.anaconda.org/bioconda/linux-64/bowtie-1.2.3-py37hc9558a2_0.tar.bz2 101 | https://conda.anaconda.org/bioconda/linux-64/bowtie2-2.3.5.1-py37he513fc3_0.tar.bz2 102 | https://repo.anaconda.com/pkgs/main/noarch/certifi-2020.6.20-pyhd3eb1b0_3.conda 103 | https://repo.anaconda.com/pkgs/main/linux-64/chardet-3.0.4-py37_1003.conda 104 | https://repo.anaconda.com/pkgs/main/noarch/click-7.1.2-py_0.conda 105 | https://repo.anaconda.com/pkgs/main/linux-64/curl-7.71.1-hbc83047_1.conda 106 | https://repo.anaconda.com/pkgs/main/noarch/decorator-4.4.2-py_0.conda 107 | https://repo.anaconda.com/pkgs/main/linux-64/future-0.18.2-py37_1.conda 108 | https://repo.anaconda.com/pkgs/main/noarch/idna-2.10-py_0.conda 109 | https://repo.anaconda.com/pkgs/main/linux-64/kiwisolver-1.2.0-py37hfd86e86_0.conda 110 | https://repo.anaconda.com/pkgs/main/noarch/mock-4.0.2-py_0.conda 111 | https://repo.anaconda.com/pkgs/main/linux-64/msgpack-python-1.0.0-py37hfd86e86_1.conda 112 | https://conda.anaconda.org/bioconda/linux-64/perl-archive-tar-2.32-pl526_0.tar.bz2 113 | https://conda.anaconda.org/bioconda/linux-64/perl-business-isbn-3.004-pl526_0.tar.bz2 114 | https://conda.anaconda.org/bioconda/linux-64/perl-http-date-6.02-pl526_3.tar.bz2 115 | https://conda.anaconda.org/bioconda/linux-64/perl-io-socket-ssl-2.066-pl526_0.tar.bz2 116 | https://conda.anaconda.org/bioconda/linux-64/perl-json-4.02-pl526_0.tar.bz2 117 | https://conda.anaconda.org/bioconda/noarch/perl-xml-sax-1.02-pl526_0.tar.bz2 118 | https://repo.anaconda.com/pkgs/main/noarch/pycparser-2.20-py_2.conda 119 | https://repo.anaconda.com/pkgs/main/linux-64/pymongo-3.11.0-py37he6710b0_0.conda 120 | https://repo.anaconda.com/pkgs/main/noarch/pyparsing-2.4.7-py_0.conda 121 | https://repo.anaconda.com/pkgs/main/linux-64/pysocks-1.7.1-py37_1.conda 122 | https://repo.anaconda.com/pkgs/main/noarch/pytz-2020.1-py_0.conda 123 | https://repo.anaconda.com/pkgs/main/linux-64/qt-5.9.7-h5867ecd_1.conda 124 | 
https://conda.anaconda.org/bioconda/noarch/semidbm-0.5.1-pyh864c0ab_3.tar.bz2 125 | https://repo.anaconda.com/pkgs/main/linux-64/sip-4.19.8-py37hf484d3e_0.conda 126 | https://repo.anaconda.com/pkgs/main/noarch/six-1.15.0-py_0.conda 127 | https://conda.anaconda.org/bioconda/linux-64/spades-3.13.0-0.tar.bz2 128 | https://repo.anaconda.com/pkgs/main/linux-64/tornado-6.0.4-py37h7b6447c_1.conda 129 | https://repo.anaconda.com/pkgs/main/noarch/tqdm-4.48.2-py_0.conda 130 | https://conda.anaconda.org/bioconda/linux-64/bcftools-1.9-ha228f0b_4.tar.bz2 131 | https://repo.anaconda.com/pkgs/main/linux-64/cffi-1.14.1-py37he30daa8_0.conda 132 | https://repo.anaconda.com/pkgs/main/linux-64/cycler-0.10.0-py37_0.conda 133 | https://repo.anaconda.com/pkgs/main/linux-64/mkl-service-2.3.0-py37he904b0f_0.conda 134 | https://conda.anaconda.org/bioconda/linux-64/perl-file-listing-6.04-pl526_1.tar.bz2 135 | https://conda.anaconda.org/bioconda/linux-64/perl-uri-1.76-pl526_0.tar.bz2 136 | https://conda.anaconda.org/bioconda/linux-64/perl-xml-libxml-2.0132-pl526h7ec2d77_1.tar.bz2 137 | https://conda.anaconda.org/bioconda/linux-64/perl-xml-sax-expat-0.51-pl526_3.tar.bz2 138 | https://repo.anaconda.com/pkgs/main/linux-64/pyqt-5.9.2-py37h05f1152_2.conda 139 | https://conda.anaconda.org/bioconda/linux-64/pysam-0.15.3-py37hda2845c_1.tar.bz2 140 | https://repo.anaconda.com/pkgs/main/noarch/python-dateutil-2.8.1-py_0.tar.bz2 141 | https://conda.anaconda.org/bioconda/linux-64/samtools-1.6-h244ad75_5.tar.bz2 142 | https://repo.anaconda.com/pkgs/main/linux-64/setuptools-49.4.0-py37_0.conda 143 | https://repo.anaconda.com/pkgs/main/linux-64/brotlipy-0.7.0-py37h7b6447c_1000.conda 144 | https://repo.anaconda.com/pkgs/main/linux-64/cryptography-2.9.2-py37h1ba5d50_0.conda 145 | https://repo.anaconda.com/pkgs/main/noarch/networkx-2.4-py_1.conda 146 | https://repo.anaconda.com/pkgs/main/linux-64/numpy-base-1.19.2-py37hfa32c7d_0.conda 147 | 
https://conda.anaconda.org/bioconda/linux-64/perl-http-message-6.18-pl526_0.tar.bz2 148 | https://conda.anaconda.org/bioconda/noarch/perl-net-http-6.19-pl526_0.tar.bz2 149 | https://conda.anaconda.org/bioconda/linux-64/perl-www-robotrules-6.02-pl526_3.tar.bz2 150 | https://conda.anaconda.org/bioconda/linux-64/perl-xml-simple-2.25-pl526_1.tar.bz2 151 | https://conda.anaconda.org/bioconda/linux-64/sra-tools-2.10.0-pl526he1b5a44_0.tar.bz2 152 | https://repo.anaconda.com/pkgs/main/linux-64/wheel-0.34.2-py37_0.conda 153 | https://conda.anaconda.org/bioconda/linux-64/perl-http-cookies-6.04-pl526_0.tar.bz2 154 | https://conda.anaconda.org/bioconda/linux-64/perl-http-daemon-6.01-pl526_1.tar.bz2 155 | https://conda.anaconda.org/bioconda/linux-64/perl-http-negotiate-6.01-pl526_3.tar.bz2 156 | https://repo.anaconda.com/pkgs/main/linux-64/pip-20.2.2-py37_0.conda 157 | https://repo.anaconda.com/pkgs/main/noarch/pyopenssl-19.1.0-py_1.conda 158 | https://conda.anaconda.org/bioconda/noarch/perl-libwww-perl-6.39-pl526_0.tar.bz2 159 | https://repo.anaconda.com/pkgs/main/noarch/urllib3-1.25.10-py_0.conda 160 | https://conda.anaconda.org/bioconda/linux-64/perl-lwp-protocol-https-6.07-pl526_4.tar.bz2 161 | https://repo.anaconda.com/pkgs/main/noarch/requests-2.24.0-py_0.conda 162 | https://conda.anaconda.org/bioconda/linux-64/entrez-direct-13.3-pl526h375a9b1_0.tar.bz2 163 | https://conda.anaconda.org/bioconda/linux-64/blast-2.9.0-pl526h3066fca_4.tar.bz2 164 | https://repo.anaconda.com/pkgs/main/linux-64/matplotlib-3.2.2-0.conda 165 | https://conda.anaconda.org/bioconda/linux-64/htseq-0.11.2-py37h637b7d7_1.tar.bz2 166 | https://repo.anaconda.com/pkgs/main/linux-64/matplotlib-base-3.2.2-py37hef1b27d_0.conda 167 | https://repo.anaconda.com/pkgs/main/linux-64/mkl_fft-1.1.0-py37h23d657b_0.conda 168 | https://repo.anaconda.com/pkgs/main/linux-64/mkl_random-1.1.1-py37h0573a6f_0.conda 169 | https://repo.anaconda.com/pkgs/main/linux-64/numpy-1.19.2-py37h54aff64_0.conda 170 | 
https://repo.anaconda.com/pkgs/main/linux-64/numexpr-2.7.1-py37h423224d_0.conda 171 | https://repo.anaconda.com/pkgs/main/linux-64/pandas-1.1.0-py37he6710b0_0.conda 172 | https://repo.anaconda.com/pkgs/main/linux-64/scipy-1.5.0-py37h0b6359f_0.conda 173 | https://repo.anaconda.com/pkgs/main/linux-64/patsy-0.5.1-py37_0.conda 174 | https://repo.anaconda.com/pkgs/main/linux-64/pytables-3.6.1-py37h71ec239_0.conda 175 | https://repo.anaconda.com/pkgs/main/linux-64/statsmodels-0.11.1-py37h7b6447c_0.conda 176 | https://conda.anaconda.org/bioconda/linux-64/mgkit-0.4.2-py37h516909a_0.tar.bz2 177 | -------------------------------------------------------------------------------- /envs/bioinfo-notebook.yml: -------------------------------------------------------------------------------- 1 | name: bioinfo-notebook 2 | channels: 3 | - bioconda 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - _libgcc_mutex=0.1=main 8 | - asciinema=2.0.2=py37_1000 9 | - bcftools=1.9=ha228f0b_4 10 | - blas=1.0=mkl 11 | - blast=2.9.0=pl526h3066fca_4 12 | - blosc=1.19.0=hd408876_0 13 | - bowtie=1.2.3=py37hc9558a2_0 14 | - bowtie2=2.3.5.1=py37he513fc3_0 15 | - brotlipy=0.7.0=py37h7b6447c_1000 16 | - bzip2=1.0.8=h7b6447c_0 17 | - ca-certificates=2020.10.14=0 18 | - certifi=2020.6.20=pyhd3eb1b0_3 19 | - cffi=1.14.1=py37he30daa8_0 20 | - chardet=3.0.4=py37_1003 21 | - click=7.1.2=py_0 22 | - cryptography=2.9.2=py37h1ba5d50_0 23 | - curl=7.71.1=hbc83047_1 24 | - cycler=0.10.0=py37_0 25 | - dbus=1.13.16=hb2f20db_0 26 | - decorator=4.4.2=py_0 27 | - entrez-direct=13.3=pl526h375a9b1_0 28 | - expat=2.2.9=he6710b0_2 29 | - fontconfig=2.13.0=h9420a91_0 30 | - freetype=2.10.2=h5ab3b9f_0 31 | - future=0.18.2=py37_1 32 | - glib=2.65.0=h3eb4bd4_0 33 | - gst-plugins-base=1.14.0=hbbd80ab_1 34 | - gstreamer=1.14.0=hb31296c_0 35 | - hdf5=1.10.4=hb1b8bf9_0 36 | - htseq=0.11.2=py37h637b7d7_1 37 | - icu=58.2=he6710b0_3 38 | - idna=2.10=py_0 39 | - intel-openmp=2020.1=217 40 | - jpeg=9b=h024ee3a_2 41 | - 
kiwisolver=1.2.0=py37hfd86e86_0 42 | - krb5=1.18.2=h173b8e3_0 43 | - ld_impl_linux-64=2.33.1=h53a641e_7 44 | - libcurl=7.71.1=h20c2e04_1 45 | - libdeflate=1.0=h14c3975_1 46 | - libedit=3.1.20191231=h14c3975_1 47 | - libffi=3.3=he6710b0_2 48 | - libgcc-ng=9.1.0=hdf63c60_0 49 | - libgfortran-ng=7.3.0=hdf63c60_0 50 | - libiconv=1.15=h63c8f33_5 51 | - libpng=1.6.37=hbc83047_0 52 | - libssh2=1.9.0=h1ba5d50_1 53 | - libstdcxx-ng=9.1.0=hdf63c60_0 54 | - libuuid=1.0.3=h1bed415_2 55 | - libxcb=1.14=h7b6447c_0 56 | - libxml2=2.9.10=he19cac6_1 57 | - lz4-c=1.9.2=he6710b0_1 58 | - lzo=2.10=h7b6447c_2 59 | - matplotlib=3.2.2=0 60 | - matplotlib-base=3.2.2=py37hef1b27d_0 61 | - mgkit=0.4.2=py37h516909a_0 62 | - mkl=2020.1=217 63 | - mkl-service=2.3.0=py37he904b0f_0 64 | - mkl_fft=1.1.0=py37h23d657b_0 65 | - mkl_random=1.1.1=py37h0573a6f_0 66 | - mock=4.0.2=py_0 67 | - msgpack-python=1.0.0=py37hfd86e86_1 68 | - ncbi-ngs-sdk=2.10.0=hdf6179e_0 69 | - ncurses=6.2=he6710b0_1 70 | - networkx=2.4=py_1 71 | - numexpr=2.7.1=py37h423224d_0 72 | - numpy=1.19.2=py37h54aff64_0 73 | - numpy-base=1.19.2=py37hfa32c7d_0 74 | - openssl=1.1.1h=h7b6447c_0 75 | - pandas=1.1.0=py37he6710b0_0 76 | - patsy=0.5.1=py37_0 77 | - pcre=8.44=he6710b0_0 78 | - perl=5.26.2=h14c3975_0 79 | - perl-app-cpanminus=1.7044=pl526_1 80 | - perl-archive-tar=2.32=pl526_0 81 | - perl-base=2.23=pl526_1 82 | - perl-business-isbn=3.004=pl526_0 83 | - perl-business-isbn-data=20140910.003=pl526_0 84 | - perl-carp=1.38=pl526_3 85 | - perl-common-sense=3.74=pl526_2 86 | - perl-compress-raw-bzip2=2.087=pl526he1b5a44_0 87 | - perl-compress-raw-zlib=2.087=pl526hc9558a2_0 88 | - perl-constant=1.33=pl526_1 89 | - perl-data-dumper=2.173=pl526_0 90 | - perl-digest-hmac=1.03=pl526_3 91 | - perl-digest-md5=2.55=pl526_0 92 | - perl-encode=2.88=pl526_1 93 | - perl-encode-locale=1.05=pl526_6 94 | - perl-exporter=5.72=pl526_1 95 | - perl-exporter-tiny=1.002001=pl526_0 96 | - perl-extutils-makemaker=7.36=pl526_1 97 | - 
perl-file-listing=6.04=pl526_1 98 | - perl-file-path=2.16=pl526_0 99 | - perl-file-temp=0.2304=pl526_2 100 | - perl-html-parser=3.72=pl526h6bb024c_5 101 | - perl-html-tagset=3.20=pl526_3 102 | - perl-html-tree=5.07=pl526_1 103 | - perl-http-cookies=6.04=pl526_0 104 | - perl-http-daemon=6.01=pl526_1 105 | - perl-http-date=6.02=pl526_3 106 | - perl-http-message=6.18=pl526_0 107 | - perl-http-negotiate=6.01=pl526_3 108 | - perl-io-compress=2.087=pl526he1b5a44_0 109 | - perl-io-html=1.001=pl526_2 110 | - perl-io-socket-ssl=2.066=pl526_0 111 | - perl-io-zlib=1.10=pl526_2 112 | - perl-json=4.02=pl526_0 113 | - perl-json-xs=2.34=pl526h6bb024c_3 114 | - perl-libwww-perl=6.39=pl526_0 115 | - perl-list-moreutils=0.428=pl526_1 116 | - perl-list-moreutils-xs=0.428=pl526_0 117 | - perl-lwp-mediatypes=6.04=pl526_0 118 | - perl-lwp-protocol-https=6.07=pl526_4 119 | - perl-mime-base64=3.15=pl526_1 120 | - perl-mozilla-ca=20180117=pl526_1 121 | - perl-net-http=6.19=pl526_0 122 | - perl-net-ssleay=1.88=pl526h90d6eec_0 123 | - perl-ntlm=1.09=pl526_4 124 | - perl-parent=0.236=pl526_1 125 | - perl-pathtools=3.75=pl526h14c3975_1 126 | - perl-scalar-list-utils=1.52=pl526h516909a_0 127 | - perl-socket=2.027=pl526_1 128 | - perl-storable=3.15=pl526h14c3975_0 129 | - perl-test-requiresinternet=0.05=pl526_0 130 | - perl-time-local=1.28=pl526_1 131 | - perl-try-tiny=0.30=pl526_1 132 | - perl-types-serialiser=1.0=pl526_2 133 | - perl-uri=1.76=pl526_0 134 | - perl-www-robotrules=6.02=pl526_3 135 | - perl-xml-libxml=2.0132=pl526h7ec2d77_1 136 | - perl-xml-namespacesupport=1.12=pl526_0 137 | - perl-xml-parser=2.44=pl526h4e0c4b3_7 138 | - perl-xml-sax=1.02=pl526_0 139 | - perl-xml-sax-base=1.09=pl526_0 140 | - perl-xml-sax-expat=0.51=pl526_3 141 | - perl-xml-simple=2.25=pl526_1 142 | - perl-xsloader=0.24=pl526_0 143 | - pip=20.2.2=py37_0 144 | - pycparser=2.20=py_2 145 | - pymongo=3.11.0=py37he6710b0_0 146 | - pyopenssl=19.1.0=py_1 147 | - pyparsing=2.4.7=py_0 148 | - pyqt=5.9.2=py37h05f1152_2 149 
| - pysam=0.15.3=py37hda2845c_1 150 | - pysocks=1.7.1=py37_1 151 | - pytables=3.6.1=py37h71ec239_0 152 | - python=3.7.7=hcff3b4d_5 153 | - python-dateutil=2.8.1=py_0 154 | - pytz=2020.1=py_0 155 | - qt=5.9.7=h5867ecd_1 156 | - readline=8.0=h7b6447c_0 157 | - requests=2.24.0=py_0 158 | - samtools=1.6=h244ad75_5 159 | - scipy=1.5.0=py37h0b6359f_0 160 | - semidbm=0.5.1=pyh864c0ab_3 161 | - setuptools=49.4.0=py37_0 162 | - sip=4.19.8=py37hf484d3e_0 163 | - six=1.15.0=py_0 164 | - snappy=1.1.8=he6710b0_0 165 | - spades=3.13.0=0 166 | - sqlite=3.32.3=h62c20be_0 167 | - sra-tools=2.10.0=pl526he1b5a44_0 168 | - statsmodels=0.11.1=py37h7b6447c_0 169 | - subread=2.0.0=hed695b0_0 170 | - tbb=2020.0=hfd86e86_0 171 | - tk=8.6.10=hbc83047_0 172 | - tornado=6.0.4=py37h7b6447c_1 173 | - tqdm=4.48.2=py_0 174 | - urllib3=1.25.10=py_0 175 | - wheel=0.34.2=py37_0 176 | - xz=5.2.5=h7b6447c_0 177 | - zlib=1.2.11=h7b6447c_3 178 | - zstd=1.4.5=h9ceee32_0 179 | -------------------------------------------------------------------------------- /envs/orthofinder.yml: -------------------------------------------------------------------------------- 1 | name: orthofinder 2 | channels: 3 | - bioconda 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - _libgcc_mutex=0.1=conda_forge 8 | - _openmp_mutex=4.5=1_gnu 9 | - blast=2.10.1=pl526he19e7b1_1 10 | - boost-cpp=1.70.0=h7b93d67_3 11 | - bzip2=1.0.8=h516909a_3 12 | - c-ares=1.16.1=h516909a_3 13 | - ca-certificates=2020.6.20=hecda079_0 14 | - certifi=2019.11.28=py27h8c360ce_1 15 | - curl=7.71.1=he644dc0_5 16 | - diamond=2.0.4=h56fc30b_0 17 | - dlcpar=1.0=py_2 18 | - entrez-direct=13.8=pl526h375a9b1_0 19 | - expat=2.2.9=he1b5a44_2 20 | - fastme=2.1.5=0 21 | - fasttree=2.1.10=h516909a_4 22 | - gawk=5.1.0=h516909a_0 23 | - gettext=0.19.8.1=hc5be6a0_1002 24 | - icu=67.1=he1b5a44_0 25 | - iqtree=2.0.3=h176a8bc_0 26 | - krb5=1.17.1=hfafb76e_3 27 | - ld_impl_linux-64=2.35=h769bd43_9 28 | - libblas=3.8.0=17_openblas 29 | - libcblas=3.8.0=17_openblas 30 
| - libcurl=7.71.1=hcdd3856_5 31 | - libedit=3.1.20191231=he28a2e2_2 32 | - libev=4.33=h516909a_1 33 | - libffi=3.2.1=he1b5a44_1007 34 | - libgcc=7.2.0=h69d50b8_2 35 | - libgcc-ng=9.3.0=h24d8f2e_16 36 | - libgfortran-ng=7.5.0=hdf63c60_16 37 | - libgomp=9.3.0=h24d8f2e_16 38 | - libidn2=2.3.0=h516909a_0 39 | - liblapack=3.8.0=17_openblas 40 | - libnghttp2=1.41.0=h8cfc5f6_2 41 | - libopenblas=0.3.10=pthreads_hb3c22a3_4 42 | - libssh2=1.9.0=hab1572f_5 43 | - libstdcxx-ng=9.3.0=hdf63c60_16 44 | - libunistring=0.9.10=h14c3975_0 45 | - llvm-meta=7.0.0=0 46 | - lz4-c=1.9.2=he1b5a44_3 47 | - mafft=7.471=h516909a_0 48 | - mcl=14.137=pl526h516909a_5 49 | - mmseqs2=12.113e3=h2d02072_0 50 | - muscle=3.8.1551=hc9558a2_5 51 | - ncurses=6.2=he1b5a44_1 52 | - numpy=1.16.5=py27h95a1406_0 53 | - openmp=7.0.0=h2d50403_0 54 | - openssl=1.1.1g=h516909a_1 55 | - orthofinder=2.2.7=0 56 | - pcre=8.44=he1b5a44_0 57 | - perl=5.26.2=h516909a_1006 58 | - perl-app-cpanminus=1.7044=pl526_1 59 | - perl-archive-tar=2.32=pl526_0 60 | - perl-base=2.23=pl526_1 61 | - perl-business-isbn=3.004=pl526_0 62 | - perl-business-isbn-data=20140910.003=pl526_0 63 | - perl-carp=1.38=pl526_3 64 | - perl-common-sense=3.74=pl526_2 65 | - perl-compress-raw-bzip2=2.087=pl526he1b5a44_0 66 | - perl-compress-raw-zlib=2.087=pl526hc9558a2_0 67 | - perl-constant=1.33=pl526_1 68 | - perl-data-dumper=2.173=pl526_0 69 | - perl-digest-hmac=1.03=pl526_3 70 | - perl-digest-md5=2.55=pl526_0 71 | - perl-encode=2.88=pl526_1 72 | - perl-encode-locale=1.05=pl526_6 73 | - perl-exporter=5.72=pl526_1 74 | - perl-exporter-tiny=1.002001=pl526_0 75 | - perl-extutils-makemaker=7.36=pl526_1 76 | - perl-file-listing=6.04=pl526_1 77 | - perl-file-path=2.16=pl526_0 78 | - perl-file-temp=0.2304=pl526_2 79 | - perl-html-parser=3.72=pl526h6bb024c_5 80 | - perl-html-tagset=3.20=pl526_3 81 | - perl-html-tree=5.07=pl526_1 82 | - perl-http-cookies=6.04=pl526_0 83 | - perl-http-daemon=6.01=pl526_1 84 | - perl-http-date=6.02=pl526_3 85 | - 
perl-http-message=6.18=pl526_0 86 | - perl-http-negotiate=6.01=pl526_3 87 | - perl-io-compress=2.087=pl526he1b5a44_0 88 | - perl-io-html=1.001=pl526_2 89 | - perl-io-socket-ssl=2.066=pl526_0 90 | - perl-io-zlib=1.10=pl526_2 91 | - perl-json=4.02=pl526_0 92 | - perl-json-xs=2.34=pl526h6bb024c_3 93 | - perl-libwww-perl=6.39=pl526_0 94 | - perl-list-moreutils=0.428=pl526_1 95 | - perl-list-moreutils-xs=0.428=pl526_0 96 | - perl-lwp-mediatypes=6.04=pl526_0 97 | - perl-lwp-protocol-https=6.07=pl526_4 98 | - perl-mime-base64=3.15=pl526_1 99 | - perl-mozilla-ca=20180117=pl526_1 100 | - perl-net-http=6.19=pl526_0 101 | - perl-net-ssleay=1.88=pl526h90d6eec_0 102 | - perl-ntlm=1.09=pl526_4 103 | - perl-parent=0.236=pl526_1 104 | - perl-pathtools=3.75=pl526h14c3975_1 105 | - perl-scalar-list-utils=1.52=pl526h516909a_0 106 | - perl-socket=2.027=pl526_1 107 | - perl-storable=3.15=pl526h14c3975_0 108 | - perl-test-requiresinternet=0.05=pl526_0 109 | - perl-time-local=1.28=pl526_1 110 | - perl-try-tiny=0.30=pl526_1 111 | - perl-types-serialiser=1.0=pl526_2 112 | - perl-uri=1.76=pl526_0 113 | - perl-www-robotrules=6.02=pl526_3 114 | - perl-xml-namespacesupport=1.12=pl526_0 115 | - perl-xml-parser=2.44_01=pl526ha1d75be_1002 116 | - perl-xml-sax=1.02=pl526_0 117 | - perl-xml-sax-base=1.09=pl526_0 118 | - perl-xml-sax-expat=0.51=pl526_3 119 | - perl-xml-simple=2.25=pl526_1 120 | - perl-xsloader=0.24=pl526_0 121 | - pip=20.1.1=pyh9f0ad1d_0 122 | - python=2.7.15=h5a48372_1011_cpython 123 | - python_abi=2.7=1_cp27mu 124 | - raxml=8.2.12=h516909a_2 125 | - readline=8.0=he28a2e2_2 126 | - scipy=1.2.1=py27h921218d_2 127 | - setuptools=44.0.0=py27_0 128 | - sqlite=3.33.0=h4cf870e_0 129 | - tk=8.6.10=hed695b0_0 130 | - wget=1.20.1=h22169c7_0 131 | - wheel=0.35.1=pyh9f0ad1d_0 132 | - xz=5.2.5=h516909a_1 133 | - zlib=1.2.11=h516909a_1009 134 | - zstd=1.4.5=h6597ccf_2 135 | -------------------------------------------------------------------------------- /envs/sgRNAcas9.yml: 
-------------------------------------------------------------------------------- 1 | name: sgRNAcas9 2 | channels: 3 | - bioconda 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - _libgcc_mutex=0.1=conda_forge 8 | - _openmp_mutex=4.5=1_gnu 9 | - alsa-lib=1.2.3=h516909a_0 10 | - ca-certificates=2020.6.20=hecda079_0 11 | - cairo=1.16.0=h3fc0475_1005 12 | - certifi=2020.6.20=py38h32f6830_0 13 | - fontconfig=2.13.1=h1056068_1002 14 | - freetype=2.10.2=he06d7ca_0 15 | - gettext=0.19.8.1=hc5be6a0_1002 16 | - giflib=5.2.1=h516909a_2 17 | - glib=2.66.1=h680cd38_0 18 | - graphite2=1.3.13=he1b5a44_1001 19 | - harfbuzz=2.7.2=hee91db6_0 20 | - icu=67.1=he1b5a44_0 21 | - jpeg=9d=h516909a_0 22 | - lcms2=2.11=hbd6801e_0 23 | - ld_impl_linux-64=2.35=h769bd43_9 24 | - libffi=3.2.1=he1b5a44_1007 25 | - libgcc-ng=9.3.0=h5dbcf3e_17 26 | - libgomp=9.3.0=h5dbcf3e_17 27 | - libiconv=1.16=h516909a_0 28 | - libpng=1.6.37=hed695b0_2 29 | - libstdcxx-ng=9.3.0=h2ae2ef3_17 30 | - libtiff=4.1.0=hc7e4089_6 31 | - libuuid=2.32.1=h14c3975_1000 32 | - libwebp-base=1.1.0=h516909a_3 33 | - libxcb=1.13=h14c3975_1002 34 | - libxml2=2.9.10=h68273f3_2 35 | - lz4-c=1.9.2=he1b5a44_3 36 | - ncurses=6.2=he1b5a44_1 37 | - openjdk=11.0.8=hacce0ff_0 38 | - openssl=1.1.1h=h516909a_0 39 | - pcre=8.44=he1b5a44_0 40 | - perl=5.30.3=h516909a_1 41 | - pip=20.2.3=py_0 42 | - pixman=0.38.0=h516909a_1003 43 | - pthread-stubs=0.4=h14c3975_1001 44 | - python=3.8.5=h1103e12_9_cpython 45 | - python_abi=3.8=1_cp38 46 | - readline=8.0=he28a2e2_2 47 | - seqmap=1.0.13=hc9558a2_1 48 | - setuptools=49.6.0=py38h32f6830_1 49 | - sqlite=3.33.0=h4cf870e_0 50 | - tk=8.6.10=hed695b0_0 51 | - wheel=0.35.1=pyh9f0ad1d_0 52 | - xorg-fixesproto=5.0=h14c3975_1002 53 | - xorg-inputproto=2.3.2=h14c3975_1002 54 | - xorg-kbproto=1.0.7=h14c3975_1002 55 | - xorg-libice=1.0.10=h516909a_0 56 | - xorg-libsm=1.2.3=h84519dc_1000 57 | - xorg-libx11=1.6.12=h516909a_0 58 | - xorg-libxau=1.0.9=h14c3975_0 59 | - xorg-libxdmcp=1.1.3=h516909a_0 60 | - 
xorg-libxext=1.3.4=h516909a_0 61 | - xorg-libxfixes=5.0.3=h516909a_1004 62 | - xorg-libxi=1.7.10=h516909a_0 63 | - xorg-libxrender=0.9.10=h516909a_1002 64 | - xorg-libxtst=1.2.3=h516909a_1002 65 | - xorg-recordproto=1.14.2=h516909a_1002 66 | - xorg-renderproto=0.11.1=h14c3975_1002 67 | - xorg-xextproto=7.3.0=h14c3975_1002 68 | - xorg-xproto=7.0.31=h14c3975_1007 69 | - xz=5.2.5=h516909a_1 70 | - zlib=1.2.11=h516909a_1009 71 | - zstd=1.4.5=h6597ccf_2 72 | -------------------------------------------------------------------------------- /scripts/DE_analysis_edgeR_script.R: -------------------------------------------------------------------------------- 1 | # https://github.com/rnnh/bioinfo-notebook.git 2 | 3 | # Loading required libraries 4 | library(limma) 5 | library(edgeR) 6 | 7 | # Changing working directory 8 | # Setting the working directory to the directory which contains this script 9 | if (exists("RStudio.Version")){ 10 | setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) 11 | } else { 12 | setwd(getSrcDirectory()[1]) 13 | } 14 | 15 | # Reading in the feature count file as "counts.df" 16 | counts.df <- read.csv("../data/featCounts_S_cere_20200331.csv") 17 | 18 | # Printing the start of the counts.df object in R... 19 | head(counts.df) 20 | 21 | # Using the "Geneid" column to set the rownames 22 | rownames(counts.df) <- counts.df$Geneid 23 | 24 | # Removing the "Geneid" column 25 | counts.df$Geneid <- NULL 26 | 27 | # Printing the start of the counts.df object in R... 28 | head(counts.df) 29 | 30 | # Reading in the design table as "design.df" 31 | design.df <- read.csv("../data/design_table.csv", fileEncoding="UTF-8-BOM") 32 | 33 | # Printing the start of the design.df object in R... 
print(design.df)

# Subsetting gene counts according to experimental condition; the SRR run
# accessions for each condition come from data/design_table.csv
counts_standard.df <- counts.df[,c("SRR8933535", "SRR8933536", "SRR8933537")]
counts_anaerobic.df <- counts.df[,c("SRR8933506", "SRR8933511", "SRR8933512")]
counts_high_temp.df <- counts.df[,c("SRR8933532", "SRR8933533", "SRR8933534")]
counts_low_pH.df <- counts.df[,c("SRR8933530", "SRR8933531", "SRR8933539")]
counts_pressure.df <- counts.df[,c("SRR8933509", "SRR8933510", "SRR8933538")]

# Printing the structure of the gene counts set and subsets
str(counts.df)
str(counts_standard.df)
str(counts_anaerobic.df)
str(counts_high_temp.df)
str(counts_low_pH.df)
str(counts_pressure.df)

# Defining function "RSD.test()"
RSD.test <- function(dataframe){
  # Tests whether the relative standard deviation (RSD = sd/mean) is less
  # than or equal to one for each row in a data frame of numeric counts.
  # The result is added as a new factor column called "RSD.test" with
  # levels c(FALSE, TRUE): TRUE means the row has RSD <= 1 (or an
  # undefined RSD, e.g. an all-zero row, which is counted as passing);
  # FALSE means the row has RSD > 1.
  #
  # Argument:
  #   dataframe: a data frame with one numeric sample column per replicate.
  # Returns:
  #   The input data frame with the added "RSD.test" factor column.
  RSD_tests <- logical(nrow(dataframe))  # one TRUE/FALSE per row
  for (row_index in 1:nrow(dataframe)){
    row <- as.numeric(dataframe[row_index,])
    RSD <- sd(row) / mean(row)
    # NA RSDs (zero mean) are treated as passing, matching the original logic
    RSD_tests[row_index] <- RSD <= 1 || is.na(RSD)
  }
  # Build the factor with explicit levels rather than relabelling via
  # levels()<-: the previous positional relabelling mislabelled the single
  # remaining level whenever every row passed (or every row failed). A
  # logical accumulator also avoids seeding the results with the numeric
  # values of column 1, as dataframe[,1] did.
  dataframe$RSD.test <- factor(RSD_tests, levels = c(FALSE, TRUE))
  return(dataframe)
}

# Applying RSD.test() to gene count subsets
counts_standard.df <- RSD.test(counts_standard.df)
counts_anaerobic.df <- RSD.test(counts_anaerobic.df)
counts_high_temp.df <- RSD.test(counts_high_temp.df)
counts_low_pH.df <- RSD.test(counts_low_pH.df)
counts_pressure.df <- RSD.test(counts_pressure.df)

# Printing the structure of the gene counts subsets
str(counts_standard.df)
str(counts_anaerobic.df)
str(counts_high_temp.df)
str(counts_low_pH.df)
str(counts_pressure.df)

# Creating list of genes which failed the RSD test in any condition
RSD_failed_genes <- rownames(counts_standard.df[
  which(counts_standard.df$RSD.test == FALSE),])
RSD_failed_genes <- append(RSD_failed_genes, rownames(counts_anaerobic.df[
  which(counts_anaerobic.df$RSD.test == FALSE),]))
RSD_failed_genes <- append(RSD_failed_genes, rownames(counts_high_temp.df[
  which(counts_high_temp.df$RSD.test == FALSE),]))
RSD_failed_genes <- append(RSD_failed_genes, rownames(counts_low_pH.df[
  which(counts_low_pH.df$RSD.test == FALSE),]))
RSD_failed_genes <- append(RSD_failed_genes, rownames(counts_pressure.df[
  which(counts_pressure.df$RSD.test == FALSE),]))
RSD_failed_genes <- unique(RSD_failed_genes)
length(RSD_failed_genes)

# Filtering gene counts: keep only genes that did not fail in any condition
filtered_counts.df <- counts.df[
  which(!rownames(counts.df) %in% RSD_failed_genes),]

# Printing the structure of the filtered gene counts
str(filtered_counts.df)

# Checking that gene counts were correctly filtered
nrow(counts.df) - length(RSD_failed_genes) ==
nrow(filtered_counts.df)

# Dropping objects that are no longer needed from the environment
rm(counts_anaerobic.df, counts_high_temp.df, counts_low_pH.df,
   counts_pressure.df, counts_standard.df, counts.df, RSD_failed_genes)

# Building an edgeR DGEList from the RSD-filtered gene counts, keeping the
# gene names alongside the count matrix
counts.DGEList <- DGEList(counts = filtered_counts.df,
                          genes = rownames(filtered_counts.df))

# Displaying the design table for reference
print(design.df)

# Sanity check: sample columns and design-table runs should line up 1:1
summary(colnames(filtered_counts.df) == design.df$run)

# Attaching the experimental condition of each sample as its group
counts.DGEList$samples$group <- as.factor(design.df$condition)

# Displaying the DGEList object
counts.DGEList

# Its dimensions: number of genes x number of samples
dim(counts.DGEList)

# Logical vector marking genes with enough reads to be kept
counts.keep <- filterByExpr(counts.DGEList)
summary(counts.keep)

# Discarding lowly expressed genes and recomputing library sizes
counts.DGEList <- counts.DGEList[counts.keep, , keep.lib.sizes = FALSE]
dim(counts.DGEList)

# The retained gene count should equal the number of TRUEs in counts.keep
length(which(counts.keep)) == dim(counts.DGEList)[1]

# counts.keep has served its purpose
rm(counts.keep)

# Library normalisation factors before normalisation (all 1 by default)...
counts.DGEList$samples$norm.factors

# ...and after calculating and applying them
counts.DGEList <- calcNormFactors(counts.DGEList)
counts.DGEList$samples$norm.factors

# Estimating common and tagwise dispersions against the condition design
condition_ <- design.df$condition
counts.DGEList <- estimateDisp(counts.DGEList,
                               design = model.matrix(~condition_))

counts.DGEList

condition_

# Pairwise exact tests: each stress condition versus the standard condition
std_anaerobic.DGEExact <- exactTest(counts.DGEList,
                                    pair = c("standard", "anaerobic"))
std_salt.DGEExact <- exactTest(counts.DGEList,
                               pair = c("standard", "osmotic_pressure"))
std_temp.DGEExact <- exactTest(counts.DGEList,
                               pair = c("standard", "high_temp"))
std_pH.DGEExact <- exactTest(counts.DGEList,
                             pair = c("standard", "low_pH"))

# Pulling the most differentially expressed genes out of each exact test
std_anaerobic.topTags <- topTags(std_anaerobic.DGEExact)
std_salt.topTags <- topTags(std_salt.DGEExact)
std_temp.topTags <- topTags(std_temp.DGEExact)
std_pH.topTags <- topTags(std_pH.DGEExact)

# Displaying the most differentially expressed genes
std_anaerobic.topTags
std_salt.topTags
std_temp.topTags
std_pH.topTags

# Recording session information for reproducibility
sessionInfo()
--------------------------------------------------------------------------------
/scripts/UniProt_downloader.sh:
--------------------------------------------------------------------------------
#! /bin/bash
# https://github.com/rnnh/bioinfo-notebook.git

# Help/usage text
usage="$(basename "$0") [-h|--help] [-p|--processors n -o|--output] -i|--input \n
\n
This script takes a list of UniProt primary accession numbers (*.list), and \n
downloads the corresponding protein sequences from UniProt as a FASTA amino \n
acid (.faa) file.\n
\n
This list can be generated by searching UniProtKB for a desired term (e.g. 
\n
'taxonomy:147537' for the Saccharomycotina subphylum), selecting 'Download' \n
and 'Format: List' to download the accession numbers of the corresponding \n
results.\n
\n
arguments: \n
\t -h | --help\t\t show this help text and exit \n
\t -i | --input\t\t the list of UniProt proteins to download \n
\t -p | --processors\t optional: set the number (n) of processors to \n
\t\t\t\t use (default: 1) \n
\t -o | --output\t\t optional: name of the output .faa file \n
\t\t\t\t (default: uniprot_{date}.faa) \n
"

# Defaults: one download process, date-stamped output file name
PROCESSORS=1
OUTPUT=uniprot_$(date +%Y%m%d).faa

# Iterating through the input arguments with a while loop
while (( "$#" )); do
	case "$1" in
		-h|--help)
			# "$usage" is double-quoted so its embedded whitespace and
			# backslash-escapes reach echo intact (ShellCheck SC2086)
			echo -e "$usage"
			exit
			;;
		-i|--input)
			INPUT="$2"
			shift 2
			;;
		-p|--processors)
			PROCESSORS="$2"
			shift 2
			;;
		-o|--output)
			OUTPUT="$2"
			shift 2
			;;
		--) # end argument parsing
			shift
			break
			;;
		-*|--*) # unsupported flags
			echo -e "ERROR: $1 is an invalid option. \n" >&2
			echo -e "$usage"
			exit 1
			;;
	esac
done

# An input list is mandatory
if test -z "$INPUT";
then
	echo -e "ERROR: No input file given. \n" >&2
	echo -e "$usage"
	exit 1
fi

echo "$(date +%Y/%m/%d\ %H:%M) Downloading UniProt sequences..."

# One curl per accession, up to $PROCESSORS in parallel; expansions are
# quoted so file names with spaces or glob characters are handled safely
cat "$INPUT" | \
xargs -n 1 -P "$PROCESSORS" -I % curl -s https://www.uniprot.org/uniprot/%.fasta \
>> "$OUTPUT"

echo "$(date +%Y/%m/%d\ %H:%M) Script finished."
73 | -------------------------------------------------------------------------------- /scripts/annotated_snps_filter.R: -------------------------------------------------------------------------------- 1 | # https://github.com/rnnh/bioinfo-notebook.git 2 | 3 | # Aim ========================================================================== 4 | 5 | # This script cross-references annotated SNP files created using 6 | # annotating_snps.R. It takes two files created using this script, and returns 7 | # unique SNPs for each file. If a SNP in File 1 is not found at the same 8 | # position on the same sequence as File 2, it is returned as a unique SNP, and 9 | # vice versa. These unique SNPs are then written to new .tsv files. 10 | 11 | # Selecting files ============================================================== 12 | 13 | # - Assign the name of the first annotated SNP file to be filtered to 14 | # 'annotated_SNP_file_1' 15 | # - Assign the name of the second annotated SNP file to be filtered to 16 | # 'annotated_SNP_file_2' 17 | # - These files should be in the `~/bioinfo-notebook/data/` directory. 18 | # - Optional: the name of the output files can be assigned on lines 109 and 19 | # 115 respectively. 20 | 21 | annotated_SNP_file_1 <- "<.tsv File name here>" 22 | annotated_SNP_file_2 <- "<.tsv File name here>" 23 | 24 | # Setup ======================================================================== 25 | 26 | # Setting the working directory 27 | setwd("~/bioinfo-notebook/data") 28 | 29 | annotated_SNP_file_1 <- read.table( 30 | annotated_SNP_file_1, 31 | stringsAsFactors = FALSE, header = TRUE) 32 | 33 | annotated_SNP_file_2 <- read.table( 34 | annotated_SNP_file_2, 35 | stringsAsFactors = FALSE, header = TRUE) 36 | 37 | # Finding rows in common between annotated SNP data frames ===================== 38 | 39 | # This needs to be carried out multiple times because the number of rows in 40 | # each annotate SNP file differ. 
Two files may have a SNP in common, but it may 41 | # not occur at the same row number 42 | 43 | # Loops in this section are structured as follows: 44 | # For every row index in a given data frame... 45 | # Get the row using the row index 46 | # If the SNP position for a given row index is in the other data frame... 47 | # Get the indices of the matching rows 48 | # For each index in the indices of matching row... 49 | # If the sequence names are the same for the matching rows... 50 | # Add that index to the matching row values 51 | # Keep only the unique indices 52 | 53 | # Creating empty integer values for the matching SNPs 54 | file_1_SNPs_common_with_file_2 <- integer() 55 | file_2_SNPs_common_with_file_1 <- integer() 56 | 57 | # Rows in common between file 1 and file 2 58 | for (index in 1:nrow(annotated_SNP_file_2)){ 59 | row = annotated_SNP_file_2[index, ] 60 | if (row$POS %in% annotated_SNP_file_1$POS){ 61 | matching_row_indices = which(annotated_SNP_file_1$POS == row$POS) 62 | for (mr_index in matching_row_indices){ 63 | if (annotated_SNP_file_1$sequence[mr_index] == row$sequence){ 64 | file_1_SNPs_common_with_file_2 <- c(file_1_SNPs_common_with_file_2, 65 | mr_index) 66 | file_1_SNPs_common_with_file_2 <- unique(file_1_SNPs_common_with_file_2) 67 | } 68 | } 69 | } 70 | } 71 | 72 | # Rows in common between file 2 and file 1 73 | for (index in 1:nrow(annotated_SNP_file_1)){ 74 | row = annotated_SNP_file_1[index, ] 75 | if (row$POS %in% annotated_SNP_file_2$POS){ 76 | matching_row_indices = which(annotated_SNP_file_2$POS == row$POS) 77 | for (mr_index in matching_row_indices){ 78 | if (annotated_SNP_file_2$sequence[mr_index] == row$sequence){ 79 | file_2_SNPs_common_with_file_1 <- c(file_2_SNPs_common_with_file_1, 80 | mr_index) 81 | file_2_SNPs_common_with_file_1 <- unique(file_2_SNPs_common_with_file_1) 82 | } 83 | } 84 | } 85 | } 86 | 87 | # Filtering SNPs in common between annotated SNP data frames =================== 88 | 89 | # The matching row values 
produced by the loops in the previous section are
90 | # used to subset each data frame: this is done by selecting non-matching rows
91 | 
92 | # Logical non-membership subsetting is used instead of negative indexing:
92 | # 'df[-integer(0), ]' would wrongly return ZERO rows when no SNPs are in
92 | # common between the two files, whereas this form correctly keeps all rows
92 | annotated_SNP_file_1_unique.df <- annotated_SNP_file_1[!seq_len(nrow(annotated_SNP_file_1)) %in% file_1_SNPs_common_with_file_2, ]
93 | annotated_SNP_file_2_unique.df <- annotated_SNP_file_2[!seq_len(nrow(annotated_SNP_file_2)) %in% file_2_SNPs_common_with_file_1, ]
94 | 
95 | # Checking that the correct number of rows were filtered =======================
96 | 
97 | # If the correct number of rows were filtered, the following statements should
98 | # all return TRUE
99 | 
100 | nrow(annotated_SNP_file_2) == nrow(annotated_SNP_file_2_unique.df) +
101 | length(file_2_SNPs_common_with_file_1)
102 | 
103 | nrow(annotated_SNP_file_1) == nrow(annotated_SNP_file_1_unique.df) +
104 | length(file_1_SNPs_common_with_file_2)
105 | 
106 | # Writing data frames to tab-separated values (.tsv) files =====================
107 | 
108 | # NOTE(review): the original passed 'file = c(annotated_SNP_file_1, "_filtered.tsv",'
108 | # which had unbalanced parentheses (a syntax error) and used c() on a variable
108 | # that the 'Setup' section overwrote with a data frame; fixed output file
108 | # names are used instead
108 | write.table(annotated_SNP_file_1_unique.df,
109 |             file = "annotated_SNP_file_1_filtered.tsv",
110 |             fileEncoding = "UTF-8",
111 |             sep = "\t",
112 |             row.names = FALSE)
113 | 
114 | write.table(annotated_SNP_file_2_unique.df,
115 |             file = "annotated_SNP_file_2_filtered.tsv",
116 |             fileEncoding = "UTF-8",
117 |             sep = "\t",
118 |             row.names = FALSE)
119 | 
120 | # Exiting ======================================================================
121 | quit(save = "no")
122 | 
--------------------------------------------------------------------------------
/scripts/annotating_snps.R:
--------------------------------------------------------------------------------
1 | # https://github.com/rnnh/bioinfo-notebook.git
2 | 
3 | # Aim ==========================================================================
4 | 
5 | # The aim of this script is to cross-reference annotations of genome assemblies
6 | # with VCF files containing SNPs of sequencing reads aligned against those
7 | # genome assemblies.
If a SNP falls within- or upstream of- an annotated 8 | # genome feature (start codon, stop codon, CDS, etc.), the script will return 9 | # that feature along with the SNP. 10 | 11 | # Selecting files and parameters =============================================== 12 | 13 | # - The VCF and GFF files to be cross-referenced are specified in this 14 | # section. For this script to work, these files need to use the same 15 | # sequence names: e.g. if the first sequence in the VCF is called "chrI", 16 | # there should be a corresponding sequence called "chrI" in the GFF file. 17 | # - The VCF and GFF files should be in the directory 18 | # '~/bioinfo-notebook/data/'. 19 | # - The number of lines in the VCF file header should be specified in the 20 | # 'VCF_header.int' variable. This is the number of lines that begin with '#' 21 | # in the VCF file. 22 | # - The variable 'upstream.int' is used to determine how far upstream from an 23 | # annotated feature a SNP can be. This can be set to 0 if you do not want 24 | # upstream SNPs to be considered. Setting it to 1000 will mean that SNPs 25 | # up to 1,000 bases/1kb upstream from a feature will be annotated. 26 | # - The variable 'output_name' is used to specify the name of the output file, 27 | # which should end in '.tsv' as it will be a tab-separated values text file. 
28 | 29 | GFF_file <- "<.gff File name here>" 30 | VCF_file <- "<.vcf File name here>" 31 | VCF_header.int <- as.integer("") 32 | upstream.int <- as.integer("% 80 | select(-ID, -FORMAT, -FILTER) %>% 81 | filter(POS >= (start - upstream) & 82 | POS <= end) 83 | 84 | # Removing redundant data frames 85 | rm(genome_annotation.df, SNPs.df) 86 | 87 | # Ordering filtered data frame of SNPs with annotations ======================== 88 | attach(SNPs_with_annotations.df) 89 | SNPs_with_annotations.df <- SNPs_with_annotations.df[order(sequence, start, end), ] 90 | detach(SNPs_with_annotations.df) 91 | 92 | # Exporting SNPs with annotations to tab-separated value (.tsv) file =========== 93 | write.table(SNPs_with_annotations.df, 94 | file = output_name, 95 | fileEncoding = "UTF-8", 96 | sep = "\t", 97 | row.names = FALSE) 98 | 99 | # Exiting ====================================================================== 100 | quit(save = "no") 101 | -------------------------------------------------------------------------------- /scripts/combining_featCount_tables.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # https://github.com/rnnh/bioinfo-notebook.git 3 | # -*- coding: utf-8 -*- 4 | """ 5 | Created on Wed Mar 18 12:08:41 2020 6 | 7 | @author: ronan 8 | 9 | This script creates a single CSV feature count table from the featureCounts 10 | output tables in the target directory. 11 | 12 | This combined feature count table can be used for differential expression 13 | analysis (e.g. using DESeq2 or edgeR in R). 
14 | """ 15 | 16 | # Loading required libraries 17 | from time import gmtime, strftime 18 | import pandas as pd 19 | import argparse 20 | import sys 21 | import os 22 | 23 | # Parsing command line arguments 24 | parser = argparse.ArgumentParser( 25 | description = "Combines the featureCounts output tables in the target \ 26 | directory.") 27 | 28 | # -d PATH -o CUSTOM_FILENAME 29 | parser.add_argument("-d", "--directory", dest = "path", 30 | help = "path to target directory. \ 31 | Default: current directory") 32 | parser.add_argument("-o", "--output", dest ="custom_filename", 33 | help = "output filename.\ 34 | Default: featCounts_{species}_{date}.csv") 35 | 36 | args = parser.parse_args() 37 | 38 | # Changing to the target directory 39 | if args.path is not None: 40 | path = args.path 41 | else: 42 | path = os.getcwd() 43 | os.chdir(path) 44 | 45 | # Creating variables 46 | fixed_headers = ["Geneid", "Chromosome", "Start", "End", "Strand", "Length"] 47 | target_file_prefix = "feature_counts_" 48 | date = strftime("%Y%m%d", gmtime()) 49 | counts_table = pd.DataFrame() 50 | output_filename = str() 51 | target_file_count = 0 52 | species_name = str() 53 | srr = str() 54 | 55 | # Iterating through files in target directory, combining feature counts 56 | # into one DataFrame object ("counts_table") 57 | for filename in os.listdir(): 58 | if filename.startswith(target_file_prefix): 59 | target_file_count = target_file_count + 1 60 | filename_list = filename.split("_") 61 | srr = filename_list[2] 62 | species_name = filename_list[3] + "_" + filename_list[4] 63 | featCounts_df = pd.read_csv(filename, sep = "\t", 64 | lineterminator = '\n', skiprows = 1, 65 | header = 0) 66 | featCounts_headers = fixed_headers.copy() 67 | featCounts_headers += [srr] 68 | featCounts_df.columns = featCounts_headers 69 | gene_ids = featCounts_df["Geneid"] 70 | counts = featCounts_df[srr] 71 | # Add the gene IDs and counts to the counts_table DataFrame as columns 72 | # if it's empty; 
otherwise add the counts only 73 | if counts_table.empty: 74 | counts_table = pd.concat([gene_ids, counts], axis = 1, 75 | sort = False) 76 | else: 77 | counts_table = pd.concat([counts_table, counts], axis = 1, 78 | sort = False) 79 | del featCounts_headers 80 | 81 | if target_file_count == 0: 82 | # Exiting script if there are no target files in the target directory 83 | print("ERROR: There are no featureCount files in the target directory. \n") 84 | parser.print_help(sys.stderr) 85 | exit 86 | else: 87 | # Exporting counts_table DataFrame as a CSV file 88 | if args.custom_filename is not None: 89 | output_filename = args.custom_filename 90 | else: 91 | output_filename = "featCounts_" + species_name + "_" + date + ".csv" 92 | counts_table.to_csv(output_filename, index = False) 93 | -------------------------------------------------------------------------------- /scripts/fastq-dump_to_featureCounts.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | # https://github.com/rnnh/bioinfo-notebook.git 3 | 4 | # Help/usage text 5 | usage="$(basename "$0") [options] -a|--annotation \ 6 | -f|--fasta \n 7 | \n 8 | This script downloads FASTQ reads from NCBI's SRA, aligns them to an annotated \n 9 | genome using bowtie2, and generates gene count table(s) using featureCounts.\n 10 | It can take a single SRR ID as an input, or multiple SRR IDs separated by\n 11 | spaces.\n 12 | \n 13 | Required arguments: \n 14 | \t -a | --annotation\t input genome annotation file \n 15 | \t -f | --fasta\t\t input FASTA file for annotated genome \n 16 | \t SRR ID(s)\t\t Sequence Read Archive Run ID(s) (SRR...) 
\n 17 | \n 18 | Optional arguments: \n 19 | \t -h | --help\t\t show this help text and exit \n 20 | \t -p | --processors\t number (n) of processors to use (default: 1) \n 21 | \t --fastq-dump\t\t use 'fastq-dump' instead of the 'fasterq-dump'\n 22 | \t --verbose\t\t make output of script more verbose\n 23 | \t --removetemp\t\t remove read and alignment files once they are\n 24 | \t \t\t\t no longer needed (minimises disk space needed) \n 25 | \t --log\t\t\t redirect terminal output to log file 26 | " 27 | 28 | # Setting FASTQDUMP to 0 29 | # This will be changed to "1" if --fastq-dump is given as an argument, 30 | # resulting in fastq-dump being used instead of the default fasterq-dump 31 | FASTQDUMP=0 32 | 33 | # Setting VERBOSE to 0 34 | # This will be changed to "1" if --verbose is given as an argument, 35 | # resulting in more verbose script output 36 | VERBOSE=0 37 | 38 | # Setting REMOVETEMP to 0 39 | # This will be changed to "1" if --removetemp is given as an argument, 40 | # resulting in *.fastq, *.fastq.gz, *.sam, *.bam and *.tsv.summary, being 41 | # removed once they are no longer needed to create a featureCounts table 42 | REMOVETEMP=0 43 | 44 | # Setting LOG to 0 45 | # This will be changed to "1" if --log is given as an argument, 46 | # resulting in the terminal output from this script being redirected to a log 47 | # file 48 | LOG=0 49 | 50 | # Setting default number of PROCESSORS to use 51 | PROCESSORS=1 52 | 53 | # Creating an empty variable for SRRs to be downloaded and aligned to genome 54 | SRRs="" 55 | 56 | # Print usage instructions if script is called without any arguments 57 | if [ "$1" = "" ] ; then 58 | echo -e "ERROR: please provide input files. 
\n" 59 | echo -e $usage 60 | exit 1 61 | fi 62 | 63 | # Iterating through the input arguments with a while loop 64 | while (( "$#" )); do 65 | case "$1" in 66 | -h|--help) 67 | echo -e $usage 68 | exit 69 | ;; 70 | -a|--annotation) 71 | ANNOTATION=$2 72 | shift 2 73 | ;; 74 | -f|--fasta) 75 | FASTA=$2 76 | shift 2 77 | ;; 78 | -p|--processors) 79 | PROCESSORS=$2 80 | shift 2 81 | ;; 82 | --fastq-dump) 83 | FASTQDUMP=1 84 | shift 85 | ;; 86 | --verbose) 87 | VERBOSE=1 88 | shift 89 | ;; 90 | --removetemp) 91 | REMOVETEMP=1 92 | shift 93 | ;; 94 | --log) 95 | LOG=1 96 | shift 97 | ;; 98 | --) # end argument parsing 99 | shift 100 | break 101 | ;; 102 | -*|--*) # unsupported flags 103 | echo -e "ERROR: $1 is an invalid option. \n" >&2 104 | echo -e $usage 105 | exit 1 106 | ;; 107 | *) # preserve SRR ID(s) as positional arguments 108 | SRRs="$SRRs $1" 109 | shift 110 | ;; 111 | esac 112 | done 113 | 114 | if [ $LOG -eq "1" ] 115 | then 116 | # Redirecting terminal output to log file 117 | exec 3>&1 4>&2 118 | trap 'exec 2>&4 1>&3' 0 1 2 3 119 | exec 1>fd_to_fC_$(date +%Y%m%d_%H%M%S).log 2>&1 120 | fi 121 | 122 | # Beginning the main body of the script 123 | # The sleep commands ("sleep 1s", "sleep 2s") slow down the script to make 124 | # the output more readable in real-time 125 | 126 | echo -e ~~~~~~~~~~~~~ F A S T Q - D U M P t o F E A T U R E C O U N T S ~~~~~~~~~~~~~ 127 | echo Script started: $(date) 128 | 129 | # Loop through the input SRR IDs 130 | for SRR in $SRRs 131 | do 132 | printf "\n" 133 | echo ================================================================================ 134 | echo SRR ID: $SRR 135 | sleep 1s 136 | echo Genome annotation: $ANNOTATION 137 | sleep 1s 138 | echo Genome multi-FASTA file: $FASTA 139 | echo ================================================================================ 140 | sleep 1s 141 | 142 | if [ $VERBOSE -eq "1" ] 143 | then 144 | printf "\n" 145 | echo Listing files in directory ... 
146 | sleep 1s 147 | ls 148 | sleep 2s 149 | fi 150 | 151 | 152 | if [ $FASTQDUMP -eq "1" ] 153 | then 154 | if [ $VERBOSE -eq "1" ] 155 | then 156 | echo Downloading compressed FASTQ reads using fastq-dump... 157 | fi 158 | until fastq-dump --gzip --skip-technical --readids --read-filter pass \ 159 | --dumpbase --split-3 --clip $SRR; do 160 | echo fastq-dump failed, retrying in 10 seconds... 161 | sleep 10s 162 | done 163 | else 164 | if [ $VERBOSE -eq "1" ] 165 | then 166 | echo Downloading FASTQ reads using fasterq-dump... 167 | fi 168 | if [ $LOG -eq "0" ] 169 | then 170 | until fasterq-dump --progress --threads $PROCESSORS $SRR; do 171 | echo fasterq-dump failed, retrying in 10 seconds... 172 | rm -r fasterq.tmp.* 173 | sleep 10s 174 | done 175 | else 176 | until fasterq-dump --threads $PROCESSORS $SRR; do 177 | echo fasterq-dump failed, retrying in 10 seconds... 178 | rm -r fasterq.tmp.* 179 | sleep 10s 180 | done 181 | fi 182 | fi 183 | 184 | if [ $VERBOSE -eq "1" ] 185 | then 186 | sleep 1s 187 | echo Listing files in directory after downloading reads... 188 | sleep 1s 189 | ls 190 | sleep 2s 191 | fi 192 | 193 | # Checking if bowtie2 index of FASTA file exists before creating bowtie2 index 194 | # If bowtie2_$FASTA.1.bt2 (one of the bowtie2 index files) does not exist... 195 | if [ ! -f bowtie2_$FASTA.1.bt2 ] 196 | # ...then create the bowtie2_$FASTA index 197 | then 198 | if [ $VERBOSE -eq "1" ] 199 | then 200 | echo Indexing genome FASTA file using bowtie2-build... 201 | sleep 2s 202 | fi 203 | bowtie2-build $FASTA bowtie2_$FASTA 204 | if [ $VERBOSE -eq "1" ] 205 | then 206 | sleep 1s 207 | echo Listing files in directory after running bowtie2-build... 
208 | sleep 1s
209 | ls
210 | sleep 2s
211 | fi
212 | # Otherwise, print a message confirming that it exists
213 | else
214 | if [ $VERBOSE -eq "1" ]
215 | then
216 | echo The bowtie2 index bowtie2_$FASTA exists
217 | sleep 1s
218 | fi
219 | fi
220 | 
221 | if [ $VERBOSE -eq "1" ]
222 | then
223 | echo Aligning reads to reference genome using bowtie2...
224 | sleep 2s
225 | fi
226 | 
227 | # Checking if fastq-dump or fasterq-dump was used, as this will result
228 | # in different filenames
229 | if [ $FASTQDUMP -eq "1" ]
230 | then
231 | bowtie2 -p $PROCESSORS --no-unal -x bowtie2_$FASTA \
232 | -1 $SRR\_pass_1.fastq.gz -2 $SRR\_pass_2.fastq.gz \
233 | -S $SRR\_$FASTA.sam
234 | else
235 | bowtie2 -p $PROCESSORS --no-unal -x bowtie2_$FASTA \
236 | -1 $SRR\_1.fastq -2 $SRR\_2.fastq \
237 | -S $SRR\_$FASTA.sam
238 | fi
239 | 
240 | # NOTE(review): the original tested the never-defined variable $REMOVEREADS
240 | # and was missing the space before ']' (a test syntax error at runtime);
240 | # $REMOVETEMP is the flag actually set by the --removetemp option, and
240 | # removing the reads here once alignment is done matches its documented
240 | # purpose of minimising disk space
240 | if [ $REMOVETEMP -eq "1" ]
241 | then
242 | echo Removing .fastq reads...
243 | rm *.fastq *.fastq.gz
244 | fi
245 | 
246 | if [ $VERBOSE -eq "1" ]
247 | then
248 | sleep 1s
249 | echo Listing files in directory after running bowtie2...
250 | sleep 1s
251 | ls
252 | sleep 2s
253 | 
254 | echo Converting alignment from SAM to BAM format using samtools view...
255 | sleep 2s
256 | fi
257 | samtools view -@ $PROCESSORS -Sb $SRR\_$FASTA.sam \
258 | > $SRR\_$FASTA.bam
259 | 
260 | if [ $VERBOSE -eq "1" ]
261 | then
262 | sleep 1s
263 | echo Listing files in directory after running samtools view...
264 | sleep 1s
265 | ls
266 | sleep 2s
267 | 
268 | echo Sorting the BAM file using samtools sort...
269 | sleep 2s
270 | fi
271 | samtools sort -@ $PROCESSORS $SRR\_$FASTA.bam \
272 | -o sorted_$SRR\_$FASTA.bam
273 | 
274 | if [ $VERBOSE -eq "1" ]
275 | then
276 | sleep 1s
277 | echo Listing files in directory after running samtools sort...
278 | sleep 1s
279 | ls
280 | sleep 2s
281 | 
282 | echo Generating count table using featureCounts...
283 | sleep 2s 284 | fi 285 | featureCounts -p -s 2 -T $PROCESSORS -a $ANNOTATION \ 286 | -o feature_counts_$SRR\_$FASTA.tsv \ 287 | sorted_$SRR\_$FASTA.bam 288 | 289 | if [ $VERBOSE -eq "1" ] 290 | then 291 | sleep 1s 292 | echo Listing files in directory after running featureCounts... 293 | sleep 1s 294 | ls 295 | sleep 2s 296 | 297 | echo Results written to feature_counts_$SRR\_$FASTA.tsv 298 | sleep 2s 299 | 300 | echo Head of feature_counts_$SRR\_$FASTA.tsv 301 | sleep 2s 302 | head feature_counts_$SRR\_$FASTA.tsv 303 | sleep 2s 304 | 305 | echo Tail of feature_counts_$SRR\_$FASTA.tsv 306 | sleep 2s 307 | tail feature_counts_$SRR\_$FASTA.tsv 308 | sleep 2s 309 | fi 310 | 311 | 312 | if [ $REMOVETEMP -eq "1" ] 313 | then 314 | echo Removing temporary files... 315 | if [ $FASTQDUMP -eq "1" ] 316 | then 317 | rm *.fastq.gz *.sam *.bam *.tsv.summary 318 | else 319 | rm *.fastq *.sam *.bam *.tsv.summary 320 | fi 321 | fi 322 | 323 | done 324 | 325 | echo Script finished: $(date) 326 | -------------------------------------------------------------------------------- /scripts/genome_annotation_SwissProt_CDS.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | # https://github.com/rnnh/bioinfo-notebook.git 3 | 4 | # Help/usage text 5 | usage="$(basename "$0") [-h|--help] [-d|--demo] [-i|--input] \n 6 | [-l|--log -p|--processors n -e|--email] \n 7 | \n 8 | A script to annotate proteins in a genome assembly, using BLASTx with\n 9 | UniProtKB/Swiss-Prot.\n 10 | \n 11 | When run with the arugment '-d' or '--demo' this script...\n 12 | \n 13 | \t 1. Downloads a Saccharomyces cerevisiae S288C genome assembly, and \n 14 | \t the UniProtKB/Swiss-Prot amino acid sequences. \n 15 | \t 2. Creates a BLAST database from the downloaded Swiss-Prot sequences,\n 16 | \t and searches the S. cerevisiae genome against it using BLASTx with an\n 17 | \t E-value threshold of 1e-100. \n 18 | \t 3. 
Filters the BLASTx results, removing results with less than 90%\n 19 | \t identity.\n 20 | \t 4. Creates a genome annotation GFF file from these BLASTx results.\n 21 | \t 5. Adds information to the genome annotation from UniProt (protein\n 22 | \t names, KeGG ortholog information, EC numbers, etc.) \n 23 | \n 24 | The end result ('S_cere.gff') is an annotation of the coding sequences (CDS) \n 25 | in the S. cerevisiae genome that are described in UniProtKB/Swiss-Prot. \n 26 | \n 27 | This script can also be run with the argument '-i' or '--input', which is used\n 28 | to specify a FASTA nucleotide file (.fasta or .fna) to annotate, instead of\n 29 | the demo sequence. The end result is also an annotation of the CDS in the input\n 30 | sequence based on UniProtKB/Swiss-Prot, called '.gff'.\n 31 | \n 32 | This script should be called from the 'bioinfo-notebook/' directory.The \n 33 | programs required for this script are in the 'bioinfo-notebook' conda \n 34 | environment (bioinfo-notebook/envs/bioinfo-notebook.yml or \n 35 | bioinfo-notebook/envs/bioinfo-notebook.txt). 
\n 36 | If the input file is not in the 'bioinfo-notebook/data/' directory, the full \n 37 | file path should be given.\n 38 | \n 39 | arguments: \n 40 | \t -h | --help\t\t show this help text and exit \n 41 | \t -i | --input\t\t name of input FASTA nucleotide file to annotate \n 42 | \t -d | --demo\t\t run the script with demonstration inputs\n 43 | \n 44 | optional arguments:\n 45 | \t -l | --log\t\t redirect terminal output to a log file \n 46 | \t -p | --processors\t set the number (n) of processors to use\n 47 | \t\t\t\t (default: 1) \n 48 | \t -e | --email\t\t contact email for UniProt queries 49 | " 50 | 51 | MAKELOG=false 52 | PROCESSORS=1 53 | EMAIL="none" 54 | DEMO=false 55 | INPUT="" 56 | 57 | # Iterating through the input arguments with a while loop 58 | while (( "$#" )); do 59 | case "$1" in 60 | -h|--help) 61 | echo -e $usage 62 | exit 63 | ;; 64 | -i|--input) 65 | INPUT=$2 66 | shift 2 67 | ;; 68 | -d|--demo) 69 | DEMO=true 70 | shift 1 71 | ;; 72 | -l|--log) 73 | MAKELOG=true 74 | shift 1 75 | ;; 76 | -p|--processors) 77 | PROCESSORS=$2 78 | shift 2 79 | ;; 80 | -e|--email) 81 | EMAIL=$2 82 | shift 2 83 | ;; 84 | --) # end argument parsing 85 | shift 86 | break 87 | ;; 88 | -*|--*) # unsupported flags 89 | echo -e "ERROR: $1 is an invalid option. \n" >&2 90 | echo -e $usage 91 | exit 1 92 | ;; 93 | esac 94 | done 95 | 96 | cd data 97 | 98 | if $MAKELOG ; then 99 | # Creating results directory, if it does not already exist 100 | if [ ! -d ../results ]; then 101 | mkdir ../results 102 | fi 103 | # CREATING LOG FILE 104 | # Terminal output directed to the file 'genome_annotation_[date]_[time].log' 105 | exec 3>&1 4>&2 106 | trap 'exec 2>&4 1>&3' 0 1 2 3 107 | exec 1>../results/genome_annotation_$(date +%Y%m%d_%H%M).log 2>&1 108 | fi 109 | 110 | echo "$(date +%Y/%m/%d\ %H:%M) Beginning genome annotation script." 111 | 112 | if $DEMO ; then 113 | echo Downloading genome FASTA file... 
114 | curl -s -o S_cere.fna.gz ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/146\
115 | /045/GCF_000146045.2_R64/GCF_000146045.2_R64_genomic.fna.gz
116 | 
117 | echo Decompressing genome FASTA file...
118 | gunzip S_cere.fna.gz
119 | 
120 | fi
121 | 
122 | echo Downloading Swiss-Prot sequences...
123 | # NOTE(review): 'curl -o' writes straight to the output file and produces no
123 | # stdout; the original piped this into 'xargs -n 1 -P $PROCESSORS' with no
123 | # command, which did nothing useful and could mask curl's exit status, so
123 | # the stray pipe has been removed
123 | curl -s -o uniprot_sprot.fasta.gz ftp://ftp.uniprot.org/pub/databases/uniprot/\
124 | current_release/knowledgebase/complete/uniprot_sprot.fasta.gz
125 | 
126 | 
127 | echo Decompressing Swiss-Prot sequences...
128 | gunzip uniprot_sprot.fasta.gz
129 | 
130 | echo Creating BLAST database...
131 | makeblastdb -dbtype prot -in uniprot_sprot.fasta -out SwissProt
132 | 
133 | echo Removing Swiss-Prot sequences...
134 | rm -v uniprot_sprot.fasta
135 | 
136 | if $DEMO ; then
137 | echo Searching genome FASTA file against Swiss-Prot with BLASTx...
138 | blastx -num_threads $PROCESSORS -evalue 1e-100 -query S_cere.fna \
139 | -db SwissProt -outfmt 6 -out blastx_SwissProt_S_cere_unfiltered.tsv
140 | 
141 | echo Removing Swiss-Prot database...
142 | rm -v SwissProt*
143 | 
144 | echo Filtering BLASTx results with percentage identity less than 90% with awk...
145 | awk '{ if ($3 >= 90) { print } }' blastx_SwissProt_S_cere_unfiltered.tsv \
146 | > blastx_SwissProt_S_cere.tsv
147 | 
148 | echo Removing unfiltered BLASTx results...
149 | rm -v blastx_SwissProt_S_cere_unfiltered.tsv
150 | 
151 | echo Creating genome annotation GFF file from BLASTx results...
152 | blast2gff uniprot --fasta-file S_cere.fna blastx_SwissProt_S_cere.tsv \
153 | S_cere_without_UniProt_info.gff
154 | 
155 | echo Adding information to genome annotation from UniProt...
156 | until add-gff-info uniprot --email $EMAIL --protein-names --enzymes \
157 | --kegg_orthologs --eggnog --taxon-id S_cere_without_UniProt_info.gff \
158 | S_cere.gff; do
159 | echo add-gff-info failed, retrying in 10 seconds...
160 | rm -v S_cere.gff 161 | sleep 10s 162 | done 163 | 164 | echo Removing copy of genome annotation without added UniProt info... 165 | rm -v S_cere_without_UniProt_info.gff 166 | 167 | echo First line of finished genome annotation... 168 | head -n 1 S_cere.gff 169 | fi 170 | 171 | if [ ! -z $INPUT ]; then 172 | echo Searching genome FASTA file against Swiss-Prot with BLASTx... 173 | blastx -num_threads $PROCESSORS -evalue 1e-100 -query $INPUT \ 174 | -db SwissProt -outfmt 6 -out blastx_SwissProt_$INPUT\_unfiltered.tsv 175 | 176 | echo Removing Swiss-Prot database... 177 | rm -v SwissProt* 178 | 179 | echo Filtering BLASTx results with percentage identity less than 90% with awk... 180 | awk '{ if ($3 >= 90) { print } }' blastx_SwissProt_$INPUT\_unfiltered.tsv \ 181 | > blastx_SwissProt_$INPUT\.tsv 182 | 183 | echo Removing unfiltered BLASTx results... 184 | rm -v blastx_SwissProt_$INPUT\_unfiltered.tsv 185 | 186 | echo Creating genome annotation GFF file from BLASTx results... 187 | blast2gff uniprot --fasta-file $INPUT blastx_SwissProt_$INPUT\.tsv \ 188 | $INPUT\_without_UniProt_info.gff 189 | 190 | echo Adding information to genome annotation from UniProt... 191 | until add-gff-info uniprot --email $EMAIL --protein-names --enzymes \ 192 | --kegg_orthologs --eggnog --taxon-id $INPUT\_without_UniProt_info.gff \ 193 | $INPUT.gff; do 194 | echo add-gff-info failed, retrying in 10 seconds... 195 | rm -v $INPUT.gff 196 | sleep 10s 197 | done 198 | 199 | echo Removing copy of genome annotation without added UniProt info... 200 | rm -v $INPUT\_without_UniProt_info.gff 201 | 202 | echo First line of finished genome annotation... 203 | head -n 1 $INPUT.gff 204 | fi 205 | 206 | echo "$(date +%Y/%m/%d\ %H:%M) Script finished." 207 | -------------------------------------------------------------------------------- /scripts/linux_setup.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash/ 2 | # https://github.com/rnnh/bioinfo-notebook.git 3 | 4 | # Help/usage text 5 | usage="$(basename "$0") \n 6 | \n 7 | This script downloads and installs Miniconda3, and uses conda to install \n 8 | the 'bioinfo-notebook' virtual environment. \n 9 | \n 10 | Before running this script... \n 11 | \n 12 | \t 1. Please run the following command: \n 13 | \t \t \$ sudo apt-get update \n 14 | \t This will ensure that the software installed will be up-to-date. \n 15 | \n 16 | \t 2. Please ensure that the 'bioinfo-notebook/' directory is in your \n 17 | \t home directory (~). The path to this directory should look like this: \n 18 | \t \t $HOME/bioinfo-notebook \n 19 | \n 20 | The 'bash' command is used to run this script: \n 21 | \t \$ bash $0 \n 22 | \n 23 | Optional arguments: \n 24 | \t -h | --help\t show this help text and exit \n 25 | " 26 | 27 | # Iterating through the input arguments with a while loop 28 | while (( "$#" )); do 29 | case "$1" in 30 | -h|--help) 31 | echo -e $usage 32 | exit 0 33 | ;; 34 | esac 35 | done 36 | 37 | # Changing directory to the home directory ("~" or "$HOME") 38 | cd ~ 39 | 40 | echo Checking if the bioinfo-notebook environment is already installed... 41 | sleep 2s # Slows down script to make terminal output more readable 42 | if [ -d ~/miniconda/envs/bioinfo-notebook ]; then 43 | echo The bioinfo-notebook environment already exists, exiting script. 44 | exit 0 45 | fi 46 | 47 | echo Checking if bioinfo-notebook/ is in the home directory... 48 | sleep 2s # Slows down script to make terminal output more readable 49 | # If bioinfo-notebook/ is not in the home directory... 50 | if [ ! 
-d ~/bioinfo-notebook/ ]; 51 | then 52 | echo ERROR: bioinfo-notebook/ is not in the home directory 53 | echo The home directory is $HOME 54 | echo Please move the bioinfo-notebook/ directory to the home directory, 55 | echo or create a copy of bioinfo-notebook/ in $HOME 56 | exit 1 57 | fi 58 | 59 | echo Downloading Miniconda3 installation script... 60 | sleep 2s # Slows down script to make terminal output more readable 61 | # If the Linux system is 64-bit... 62 | if [ "$(uname -m)" == "x86_64" ]; 63 | then 64 | # Download the script to install the 64-bit version of miniconda 65 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh \ 66 | -O miniconda.sh 67 | # If the Linux system is not 64-bit... 68 | else 69 | # Download the script to install the 32-bit version of miniconda 70 | wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86.sh \ 71 | -O miniconda.sh 72 | fi 73 | 74 | echo Installing Miniconda3... 75 | sleep 2s # Slows down script to make terminal output more readable 76 | bash miniconda.sh -b -p $HOME/miniconda 77 | 78 | echo Miniconda3 installed, removing installation script... 79 | rm -f miniconda.sh 80 | 81 | echo Setting up Miniconda3... 82 | sleep 2s # Slows down script to make terminal output more readable 83 | source "$HOME/miniconda/etc/profile.d/conda.sh" 84 | hash -r 85 | conda config --set always_yes yes --set changeps1 yes \ 86 | --set auto_activate_base false 87 | conda update -q conda 88 | conda init 89 | 90 | echo Displaying information about current conda installation... 91 | sleep 2s # Slows down script to make terminal output more readable 92 | conda info -a 93 | 94 | echo Creating the bioinfo-notebook virtual environment using conda... 95 | sleep 2s # Slows down script to make terminal output more readable 96 | # If the Linux system is 64-bit... 
# If the Linux system is 64-bit...
if [ "$(uname -m)" == "x86_64" ];
then
	# Create the virtual environment using the explicit spec list
	conda create --name bioinfo-notebook \
		--file ~/bioinfo-notebook/envs/bioinfo-notebook.txt
# If the Linux system is not 64-bit...
else
	# Create the virtual environment using an "environment".yml file
	conda env create -f ~/bioinfo-notebook/envs/bioinfo-notebook.yml
fi

echo Removing unused packages and caches using conda...
sleep 2s # Slows down script to make terminal output more readable
conda clean --all --yes

# The strings are quoted so that "\n" reaches 'echo -e' as an escape sequence
# instead of being stripped to a literal "n" by the shell
echo -e "Script finished!\n"

echo -e "Please restart your Linux system for these changes to take effect.\n"

echo The bioinfo-notebook environment can be activated using the command...
echo -e "\t \$ conda activate bioinfo-notebook"
echo A conda virtual environment can be deactivated using the command...
echo -e "\t \$ conda deactivate"
--------------------------------------------------------------------------------
/scripts/snp_calling.sh:
--------------------------------------------------------------------------------
#! /bin/bash
# https://github.com/rnnh/bioinfo-notebook.git
#
# Aligns paired-end FASTQ reads to a reference genome with bowtie2, converts
# and sorts the alignment with SAMtools, and calls/filters variants with
# BCFtools, producing a VCF file. With -d/--demo, example S. cerevisiae data
# is downloaded from NCBI and run through the same pipeline.

# Help/usage text
usage="$(basename "$0") [-h|--help] [-1|--one -2|--two -r|--reference] \n
[-d|--demo] [-o|--output -l|--log -p|--processors n] \n
\n
This script aligns sequencing reads to a reference genome, and finds genetic \n
variants (SNPs/indels) based on this alignment, which are written to a variant\n
call format (VCF) file.\n
\n
Calling this script with the argument '-d' or '--demo' will run this script \n
using Saccharomyces cerevisiae FASTQ sequencing reads and a Saccharomyces \n
cerevisiae reference genome, which will be downloaded from NCBI. \n
\n
This script should be called from the 'bioinfo-notebook/' directory. The \n
programs required for this script are in the 'bioinfo-notebook' conda \n
environment (bioinfo-notebook/envs/bioinfo-notebook.yml or \n
bioinfo-notebook/envs/bioinfo-notebook.txt). \n
If the input files are not in the 'bioinfo-notebook/data/' directory, the full \n
file paths should be given.\n\n
\n
arguments: \n
\t -h | --help\t\t show this help text and exit \n
\t -1 | --one\t\t forward reads to align with reference sequence \n
\t\t\t\t (FASTQ: .fastq or .fastq.gz) \n
\t -2 | --two\t\t reverse reads to align with reference sequence \n
\t\t\t\t (FASTQ: .fastq or .fastq.gz) \n
\t -r | --reference\t reference sequence to align reads against \n
\t\t\t\t (FASTA nucleotide file: .fna) \n
\t -d | --demo\t\t run the script with demonstration inputs\n
\n
optional arguments: \n
\t -o | --output\t\t optional: name of final output file \n
\t\t\t\t (default: 'reference_seq_vs_reads_var.vcf', or \n
\t\t\t\t 'S_cere_DRR237290_var.vcf' if demo is used). \n
\t -l | --log\t\t redirect terminal output to a log file in the \n
\t\t\t\t directory bioinfo-notebook/results/ \n
\t -p | --processors\t optional: set the number (n) of processors to \n
\t\t\t\t use (default: 1) \n
"

# Default parameter values
MAKELOG=false
PROCESSORS=1
DEMO=false
ONE=""
TWO=""
REFERENCE=""
OUTPUT=""

# Iterating through the input arguments with a while loop
while (( "$#" )); do
	case "$1" in
		-h|--help)
			# "$usage" is quoted so the embedded \n and \t escape
			# sequences reach 'echo -e' intact, without word
			# splitting or glob expansion
			echo -e "$usage"
			exit
			;;
		-1|--one)
			ONE=$2
			shift 2
			;;
		-2|--two)
			TWO=$2
			shift 2
			;;
		-r|--reference)
			REFERENCE=$2
			shift 2
			;;
		-o|--output)
			OUTPUT=$2
			shift 2
			;;
		-d|--demo)
			DEMO=true
			shift 1
			;;
		-l|--log)
			MAKELOG=true
			shift 1
			;;
		-p|--processors)
			PROCESSORS=$2
			shift 2
			;;
		--) # end argument parsing
			shift
			break
			;;
		-*|--*) # unsupported flags
			echo -e "ERROR: $1 is an invalid option. \n" >&2
			echo -e "$usage"
			exit 1
			;;
		*) # positional arguments are not used by this script;
		   # without this default case an unrecognised argument
		   # would never be shifted and the loop would spin forever
			echo -e "ERROR: $1 is an invalid argument. \n" >&2
			echo -e "$usage"
			exit 1
			;;
	esac
done

# Unless the demo is being run, reads and a reference sequence are required
if ! $DEMO ; then
	if [ -z "$ONE" ] || [ -z "$TWO" ] || [ -z "$REFERENCE" ]; then
		echo -e "ERROR: -1/--one, -2/--two and -r/--reference are required unless -d/--demo is used. \n" >&2
		echo -e "$usage"
		exit 1
	fi
fi

cd ~/bioinfo-notebook/data/

if $MAKELOG ; then
	# Creating results directory, if it does not already exist
	if [ ! -d ../results ]; then
		mkdir ../results
	fi
	# CREATING LOG FILE
	# Terminal output directed to the file 'snp_calling_[date]_[time].log'
	# The original stdout/stderr are saved on FDs 3 and 4 and restored by
	# the trap when the script exits or is interrupted
	exec 3>&1 4>&2
	trap 'exec 2>&4 1>&3' 0 1 2 3
	exec 1>../results/snp_calling_$(date +%Y%m%d_%H%M).log 2>&1
fi

echo "$(date +%Y/%m/%d\ %H:%M) Beginning SNP calling script."

if $DEMO ; then
	echo Downloading reads...
	# SRA downloads can fail intermittently, so retry until success
	until fastq-dump --gzip --skip-technical --readids --read-filter pass \
		--dumpbase --split-files --clip DRR237290; do
		echo fastq-dump failed, retrying in 10 seconds...
		sleep 10s
	done

	echo Downloading reference sequence...
	curl -s --remote-name --remote-time ftp://ftp.ncbi.nlm.nih.gov/genomes/all/\
GCF/000/146/045/GCF_000146045.2_R64/GCF_000146045.2_R64_genomic.fna.gz

	echo Decompressing reference sequence...
	gunzip GCF_000146045.2_R64_genomic.fna.gz

	echo Indexing reference sequence for bowtie2...
	bowtie2-build GCF_000146045.2_R64_genomic.fna S_cere_ref_seq

	echo Aligning reads to the reference genome...
	bowtie2 --no-unal -p $PROCESSORS -x S_cere_ref_seq \
		-1 DRR237290_pass_1.fastq.gz -2 DRR237290_pass_2.fastq.gz \
		-S S_cere_DRR237290_alignment.sam

	echo Converting SAM alignment to sorted BAM alignment...
	samtools view -@ $PROCESSORS -Sb \
		-o S_cere_DRR237290_alignment_unsorted.bam S_cere_DRR237290_alignment.sam

	samtools sort -@ $PROCESSORS -O bam -l 9 -o S_cere_DRR237290_alignment.bam \
		S_cere_DRR237290_alignment_unsorted.bam

	echo Removing redundant alignment files...
	rm -v S_cere_DRR237290_alignment.sam S_cere_DRR237290_alignment_unsorted.bam

	echo Indexing reference sequence for SAMtools...
	samtools faidx GCF_000146045.2_R64_genomic.fna

	echo Generating genotype variant likelihoods with BCFtools...
	bcftools mpileup --max-depth 10000 --threads $PROCESSORS \
		-f GCF_000146045.2_R64_genomic.fna \
		-o S_cere_DRR237290_full.bcf S_cere_DRR237290_alignment.bam

	echo Variant calling with BCFtools...
	bcftools call -O b --threads $PROCESSORS -vc --ploidy 1 -p 0.05 \
		-o S_cere_DRR237290_var_unfiltered.bcf S_cere_DRR237290_full.bcf

	echo Removing redundant BCF file...
	rm -v S_cere_DRR237290_full.bcf

	# "$OUTPUT" is quoted so '[ -z ]' cannot break on spaces/empty expansion
	if [ -z "$OUTPUT" ]; then
		echo Variant filtering with BCFtools filter...
		bcftools filter --threads $PROCESSORS -i '%QUAL>=20' -O v \
			-o S_cere_DRR237290_var.vcf S_cere_DRR237290_var_unfiltered.bcf

		echo Head of VCF file...
		head S_cere_DRR237290_var.vcf

		echo Tail of VCF file...
		tail S_cere_DRR237290_var.vcf
	else
		echo Variant filtering with BCFtools filter...
		bcftools filter --threads $PROCESSORS -i '%QUAL>=20' -O v \
			-o $OUTPUT.vcf S_cere_DRR237290_var_unfiltered.bcf

		echo Head of VCF file...
		head $OUTPUT.vcf

		echo Tail of VCF file...
		tail $OUTPUT.vcf
	fi

	echo "$(date +%Y/%m/%d\ %H:%M) Script finished."
	# Exit here so the demo run does not fall through to the generic
	# alignment steps below, which would fail with an empty $REFERENCE
	exit 0
fi

echo Indexing reference sequence for bowtie2...
bowtie2-build $REFERENCE reference_seq

echo Aligning reads to the reference genome...
bowtie2 --no-unal -p $PROCESSORS -x reference_seq \
	-1 $ONE -2 $TWO -S reference_seq_vs_reads_alignment.sam

echo Converting SAM alignment to sorted BAM alignment...
samtools view -@ $PROCESSORS -Sb \
	-o reference_seq_vs_reads_alignment_unsorted.bam \
	reference_seq_vs_reads_alignment.sam

samtools sort -@ $PROCESSORS -O bam -l 9 \
	-o reference_seq_vs_reads_alignment.bam \
	reference_seq_vs_reads_alignment_unsorted.bam

echo Removing redundant alignment files...
rm -v reference_seq_vs_reads_alignment.sam \
	reference_seq_vs_reads_alignment_unsorted.bam

echo Indexing reference sequence for SAMtools...
samtools faidx $REFERENCE

echo Generating genotype variant likelihoods with BCFtools...
bcftools mpileup --max-depth 10000 --threads $PROCESSORS \
	-f $REFERENCE -o reference_seq_vs_reads_full.bcf \
	reference_seq_vs_reads_alignment.bam

echo Variant calling with BCFtools...
bcftools call -O b --threads $PROCESSORS -vc --ploidy 1 -p 0.05 \
	-o reference_seq_vs_reads_var_unfiltered.bcf reference_seq_vs_reads_full.bcf

echo Removing redundant BCF file...
rm -v reference_seq_vs_reads_full.bcf

# "$OUTPUT" is quoted so '[ -z ]' cannot break on spaces/empty expansion
if [ -z "$OUTPUT" ]; then
	echo Variant filtering with BCFtools filter...
	bcftools filter --threads $PROCESSORS -i '%QUAL>=20' -O v \
		-o reference_seq_vs_reads_var.vcf reference_seq_vs_reads_var_unfiltered.bcf

	echo Head of VCF file...
	head reference_seq_vs_reads_var.vcf

	echo Tail of VCF file...
	tail reference_seq_vs_reads_var.vcf
else
	echo Variant filtering with BCFtools filter...
	bcftools filter --threads $PROCESSORS -i '%QUAL>=20' -O v \
		-o $OUTPUT.vcf reference_seq_vs_reads_var_unfiltered.bcf

	echo Head of VCF file...
	head $OUTPUT.vcf

	echo Tail of VCF file...
	tail $OUTPUT.vcf
fi

echo "$(date +%Y/%m/%d\ %H:%M) Script finished."
--------------------------------------------------------------------------------