├── .github
└── ISSUE_TEMPLATE
│ ├── bug_report.md
│ └── feature_request.md
├── .gitignore
├── .travis.yml
├── LICENSE
├── README.md
├── _config.yml
├── assets
└── bioinfo-notebook_logo.svg
├── data
├── design_table.csv
├── example_genome_annotation.gtf
├── example_nucleotide_sequence.fasta
└── featCounts_S_cere_20200331.csv
├── docs
├── DE_analysis_edgeR_script.md
├── DE_analysis_edgeR_script.pdf
├── SPAdes.md
├── UniProt_downloader.md
├── annotated_snps_filter.md
├── annotating_snps.md
├── augustus.md
├── bcftools.md
├── blast.md
├── bowtie.md
├── bowtie2.md
├── cl_intro.md
├── cl_solutions.md
├── combining_featCount_tables.md
├── conda.md
├── fasterq-dump.md
├── fastq-dump.md
├── fastq-dump_to_featureCounts.md
├── featureCounts.md
├── file_formats.md
├── genome_annotation_SwissProt_CDS.md
├── htseq-count.md
├── linux_setup.md
├── orthofinder.md
├── part1.md
├── part2.md
├── part3.md
├── report_an_issue.md
├── samtools.md
├── sgRNAcas9.md
├── snp_calling.md
├── ubuntu_virtualbox.md
└── wsl.md
├── envs
├── augustus.yml
├── bioinfo-notebook.txt
├── bioinfo-notebook.yml
├── orthofinder.yml
└── sgRNAcas9.yml
└── scripts
├── DE_analysis_edgeR_script.R
├── UniProt_downloader.sh
├── annotated_snps_filter.R
├── annotating_snps.R
├── combining_featCount_tables.py
├── fastq-dump_to_featureCounts.sh
├── genome_annotation_SwissProt_CDS.sh
├── linux_setup.sh
└── snp_calling.sh
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 |
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 1. Go to '...'
16 | 2. Click on '....'
17 | 3. Scroll down to '....'
18 | 4. See error
19 |
20 | **Expected behavior**
21 | A clear and concise description of what you expected to happen.
22 |
23 | **Screenshots**
24 | If applicable, add screenshots to help explain your problem.
25 |
26 | **Desktop (please complete the following information):**
27 | - OS: [e.g. iOS]
28 | - Browser [e.g. chrome, safari]
29 | - Version [e.g. 22]
30 |
31 | **Smartphone (please complete the following information):**
32 | - Device: [e.g. iPhone6]
33 | - OS: [e.g. iOS8.1]
34 | - Browser [e.g. stock browser, safari]
35 | - Version [e.g. 22]
36 |
37 | **Additional context**
38 | Add any other context about the problem here.
39 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 |
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 |
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 |
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | data/*.fastq
2 | data/bowtie2_example_index*
3 | data/*.sam
4 | data/*.bam
5 | data/*.bai
6 | data/*.txt
7 | data/*.summary
8 | data/*.gz
9 | data/*.bt2
10 | data/S_cere_GCF_000146045.2_R64_genomic.*
11 | data/*.tsv
12 | data/*.log
13 | temp/
14 | results/
15 | .temp/
16 | results/*
17 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 |
3 | python:
4 | # We don't actually use the Travis Python, but this keeps it organized.
5 | #- "2.7"
6 | #- "3.5"
7 | #- "3.6"
8 | - "3.7"
9 |
10 | install:
11 | - sudo apt-get update
12 | # We do this conditionally because it saves us some downloading if the
13 | # version is the same.
14 | - if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then
15 | wget https://repo.anaconda.com/miniconda/Miniconda2-latest-Linux-x86_64.sh -O miniconda.sh;
16 | else
17 | wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh;
18 | fi
19 | - bash miniconda.sh -b -p $HOME/miniconda
20 | - source "$HOME/miniconda/etc/profile.d/conda.sh"
21 | - hash -r
22 | - conda config --set always_yes yes --set changeps1 no
23 | - conda update -q conda
24 | # Useful for debugging any issues with conda
25 | - conda info -a
26 |
27 | # Creating conda environment using envs/bioinfo-notebook.txt
28 | - conda create --name bioinfo-notebook-explicit --file envs/bioinfo-notebook.txt
29 | - conda activate bioinfo-notebook-explicit
30 | - conda deactivate
31 |
32 | # Creating conda environment using envs/bioinfo-notebook.yml
33 | - conda env create --name bioinfo-notebook-yml --file envs/bioinfo-notebook.yml
34 | - conda activate bioinfo-notebook-yml
35 | - conda deactivate
36 |
37 | # Creating conda environment using envs/augustus.yml
38 | - conda env create --name augustus-yml --file envs/augustus.yml
39 | - conda activate augustus-yml
40 | - conda deactivate
41 |
42 | # Creating conda environment using envs/orthofinder.yml
43 | - conda env create --name orthofinder-yml --file envs/orthofinder.yml
44 | - conda activate orthofinder-yml
45 | - conda deactivate
46 |
47 | # Creating conda environment using envs/sgRNAcas9.yml
48 | - conda env create --name sgRNAcas9-yml --file envs/sgRNAcas9.yml
49 | - conda activate sgRNAcas9-yml
50 | - conda deactivate
51 |
52 | script:
53 | # Confirming that programs work in conda environments
54 | # bioinfo-notebook-explicit
55 | - conda activate bioinfo-notebook-explicit
56 | - bowtie2 --version
57 | - samtools --version
58 | - fastq-dump --version
59 | - conda deactivate
60 |
61 | # bioinfo-notebook-yml
62 | - conda activate bioinfo-notebook-yml
63 | - bowtie2 --version
64 | - samtools --version
65 | - fastq-dump --version
66 | - conda deactivate
67 |
68 | # augustus-yml
69 | - conda activate augustus-yml
70 | - augustus --help
71 | - conda deactivate
72 |
73 | # orthofinder-yml
74 | - conda activate orthofinder-yml
75 | - orthofinder --help
76 | - conda deactivate
77 |
78 | # sgRNAcas9-yml
79 | - conda activate sgRNAcas9-yml
80 | - java --version
81 | - conda deactivate
82 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 Ronan Harrington
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | title: Home
4 | nav_order: 1
5 | description: "Quick start guides for bioinformatics programs, with video demonstrations and scripts."
6 | permalink: /
7 | ---
8 |
9 |
10 | # [Bioinformatics Notebook](https://github.com/rnnh/bioinfo-notebook.git)
11 |
12 | by [Ronan Harrington](https://github.com/rnnh)
13 |
14 | [](https://travis-ci.com/rnnh/bioinfo-notebook)
15 | [](https://opensource.org/licenses/MIT)
16 | 
17 | 
18 | 
19 | [](https://zenodo.org/badge/latestdoi/243280413)
20 |
21 | This project provides introductions to various bioinformatics tools with short guides, video demonstrations, and scripts that tie these tools together.
22 | The documents in this project can be read locally in a plain-text editor, or viewed online at <https://rnnh.github.io/bioinfo-notebook/>.
23 | If you are not familiar with using programs from the command line, begin with the page "[Introduction to the command line](docs/cl_intro.md)".
24 | If you have any questions, or spot any mistakes, [please submit an issue on GitHub](https://github.com/rnnh/bioinfo-notebook/issues).
25 |
26 | - [Pipeline examples](#pipeline-examples)
27 | - [Contents](#contents)
28 | - [Installation instructions](#installation-instructions)
29 | - [Repository structure](#repository-structure)
30 |
31 | ## Pipeline examples
32 |
33 | These bioinformatics pipelines can be carried out using scripts and tools described in this project.
34 | Input files for some of these scripts can be specified in the command line; other scripts will need to be altered to fit the given input data.
35 |
36 | ### SNP analysis
37 |
38 | - [FASTQ](docs/file_formats.md#fastq) reads from whole genome sequencing (WGS) can be assembled using [SPAdes](docs/SPAdes.md).
39 | - Sequencing reads can be aligned to this assembled genome using [bowtie2](docs/bowtie2.md).
40 | - The script [snp_calling.sh](docs/snp_calling.md) aligns sequencing reads to an assembled genome and detects single nucleotide polymorphisms (SNPs). This will produce a [Variant Call Format (VCF) file](docs/file_formats.md#vcf).
41 | - The proteins in the assembled reference genome- the genome to which the reads are aligned- can be annotated using [genome_annotation_SwissProt_CDS.sh](docs/genome_annotation_SwissProt_CDS.md).
42 | - The genome annotation [GFF](docs/file_formats.md#gff) file can be cross-referenced with the VCF file using [annotating_snps.R](docs/annotating_snps.md). This will produce an [annotated SNP format](docs/annotating_snps.md#annotated-snp-format) file.
43 | - Annotated SNP format files can be cross-referenced using [annotated_snps_filter.R](docs/annotated_snps_filter.md). For two annotated SNP files, this script will produce a file with annotated SNPs unique to the first file, and a file with annotated SNPs unique to the second file.
44 |
45 | ### RNA-seq analysis
46 |
47 | - [fastq-dump_to_featureCounts.sh](docs/fastq-dump_to_featureCounts.md) can be used to download RNA-seq reads from NCBI's Sequence Read Archive (SRA) and align them to a reference genome. This script uses [fastq-dump](docs/fastq-dump.md) or [fasterq-dump](docs/fasterq-dump.md) to download the sequencing reads as [FASTQ](docs/file_formats.md#fastq), and [featureCounts](docs/featureCounts.md) to align them to a reference [FASTA nucleotide file.](docs/file_formats.md#fasta)
48 | - Running [fastq-dump_to_featureCounts.sh](docs/fastq-dump_to_featureCounts.md) will produce feature count tables. These feature count tables can be combined using [combining_featCount_tables.py](docs/combining_featCount_tables.md).
49 | - These combined feature count tables can be used for differential expression (DE) analysis. An example DE analysis script is included in this project: [DE_analysis_edgeR_script.R](docs/DE_analysis_edgeR_script.md). This script uses the [R programming language](https://cran.r-project.org/) with the [edgeR](https://bioconductor.org/packages/release/bioc/html/edgeR.html) library.
50 |
51 | ### Detecting orthologs between genomes
52 |
53 | - [Augustus](docs/augustus.md) can be used to predict genes from [FASTA nucleotide files](docs/file_formats.md#fasta).
54 | - Once the FASTA amino acid sequences have been [extracted from the Augustus annotations](docs/augustus.md#extracting-the-fasta-amino-acid-sequences-of-predicted-genes-from-an-augustus-annotation), you can search for orthologs using [OrthoFinder](docs/orthofinder.md).
55 | - To find a specific gene of interest, search the amino acid sequences of the predicted genes using [BLAST](docs/blast.md).
56 |
57 | ## Contents
58 |
59 | ### [1. General guides](docs/part1.md)
60 |
61 | - [Introduction to the command line](docs/cl_intro.md)
62 | - [Windows Subsystem for Linux](docs/wsl.md)
63 | - [Using Ubuntu through a Virtual Machine](docs/ubuntu_virtualbox.md)
64 | - [File formats used in bioinformatics](docs/file_formats.md)
65 |
66 | ### [2. Program guides](docs/part2.md)
67 |
68 | - [Augustus](docs/augustus.md)
69 | - [Bcftools](docs/bcftools.md)
70 | - [BLAST](docs/blast.md)
71 | - [Bowtie](docs/bowtie.md)
72 | - [Bowtie2](docs/bowtie2.md)
73 | - [Conda](docs/conda.md)
74 | - [Fasterq-dump](docs/fasterq-dump.md)
75 | - [Fastq-dump](docs/fastq-dump.md)
76 | - [FeatureCounts](docs/featureCounts.md)
77 | - [Htseq-count](docs/htseq-count.md)
78 | - [OrthoFinder](docs/orthofinder.md)
79 | - [SAMtools](docs/samtools.md)
80 | - [sgRNAcas9](docs/sgRNAcas9.md)
81 | - [SPAdes](docs/SPAdes.md)
82 |
83 | ### [3. Scripts](docs/part3.md)
84 |
85 | - [Annotated SNPs filter](docs/annotated_snps_filter.md)
86 | - [Annotating SNPs](docs/annotating_snps.md)
87 | - [Combining featCount tables.py](docs/combining_featCount_tables.md)
88 | - [DE_analysis_edgeR_script.R](docs/DE_analysis_edgeR_script.md)
89 | - [Fastq-dump to featureCounts](docs/fastq-dump_to_featureCounts.md)
90 | - [Genome annotation script](docs/genome_annotation_SwissProt_CDS.md)
91 | - [Linux setup script](docs/linux_setup.md)
92 | - [SNP calling script](docs/snp_calling.md)
93 | - [UniProt downloader](docs/UniProt_downloader.md)
94 |
95 | ## Installation instructions
96 |
97 | After following these instructions, there will be a copy of the [bioinfo-notebook GitHub repo](https://www.github.com/rnnh/bioinfo-notebook/) on your system in the `~/bioinfo-notebook/` directory.
98 | This means there will be a copy of all the documents and scripts in this project on your computer.
99 | If you are using Linux and run the [Linux setup script](docs/linux_setup.md), the `bioinfo-notebook` virtual environment- which includes the majority of the command line programs covered in this project- will also be installed using [conda](docs/conda.md).
100 |
101 | **1.** This project is written to be used through a UNIX (Linux or Mac with macOS Mojave or later) operating system.
102 | If you are using a Windows operating system, begin with these pages on setting up Ubuntu (a Linux operating system):
103 |
104 | - [Windows Subsystem for Linux](docs/wsl.md)
105 | - [Using Ubuntu through a Virtual Machine](docs/ubuntu_virtualbox.md)
106 |
107 | Once you have an Ubuntu system set up, run the following command to update the lists of available software:
108 |
109 | ```bash
110 | $ sudo apt-get update # Updates lists of software that can be installed
111 | ```
112 |
113 | **2.** Run the following command in your home directory (`~`) to download this project:
114 |
115 | ```bash
116 | $ git clone https://github.com/rnnh/bioinfo-notebook.git
117 | ```
118 |
119 | **3.** If you are using Linux, run the [Linux setup script](docs/linux_setup.md) with this command after downloading the project:
120 |
121 | ```bash
122 | $ bash ~/bioinfo-notebook/scripts/linux_setup.sh
123 | ```
124 |
125 | ### Video demonstration of installation
126 |
127 | [](https://asciinema.org/a/314853?autoplay=1)
128 |
129 | ## Repository structure
130 |
131 | ```
132 | bioinfo-notebook/
133 | ├── assets/
134 | │ └── bioinfo-notebook_logo.svg
135 | ├── data/
136 | │ ├── blastx_SwissProt_example_nucleotide_sequence.fasta.tsv
137 | │ ├── blastx_SwissProt_S_cere.tsv
138 | │ ├── design_table.csv
139 | │ ├── example_genome_annotation.gtf
140 | │ ├── example_nucleotide_sequence.fasta
141 | │ └── featCounts_S_cere_20200331.csv
142 | ├── docs/
143 | │ ├── annotated_snps_filter.md
144 | │ ├── annotating_snps.md
145 | │ ├── augustus.md
146 | │ ├── blast.md
147 | │ ├── bowtie2.md
148 | │ ├── bowtie.md
149 | │ ├── cl_intro.md
150 | │ ├── cl_solutions.md
151 | │ ├── combining_featCount_tables.md
152 | │ ├── conda.md
153 | │ ├── DE_analysis_edgeR_script.md
154 | │ ├── DE_analysis_edgeR_script.pdf
155 | │ ├── fasterq-dump.md
156 | │ ├── fastq-dump.md
157 | │ ├── fastq-dump_to_featureCounts.md
158 | │ ├── featureCounts.md
159 | │ ├── file_formats.md
160 | │ ├── genome_annotation_SwissProt_CDS.md
161 | │ ├── htseq-count.md
162 | │ ├── linux_setup.md
163 | │ ├── orthofinder.md
164 | │ ├── part1.md # Navigation page for website
165 | │ ├── part2.md # Navigation page for website
166 | │ ├── part3.md # Navigation page for website
167 | │ ├── report_an_issue.md
168 | │ ├── samtools.md
169 | │ ├── sgRNAcas9.md
170 | │ ├── snp_calling.md
171 | │ ├── SPAdes.md
172 | │ ├── ubuntu_virtualbox.md
173 | │ ├── UniProt_downloader.md
174 | │ └── wsl.md
175 | ├── envs/ # conda environment files
176 | │ ├── augustus.yml # environment for Augustus
177 | │ ├── bioinfo-notebook.txt
178 | │ ├── bioinfo-notebook.yml
179 | │ ├── orthofinder.yml # environment for OrthoFinder
180 | │ └── sgRNAcas9.yml # environment for sgRNAcas9
181 | ├── scripts/
182 | │ ├── annotated_snps_filter.R
183 | │ ├── annotating_snps.R
184 | │ ├── combining_featCount_tables.py
185 | │ ├── DE_analysis_edgeR_script.R
186 | │ ├── fastq-dump_to_featureCounts.sh
187 | │ ├── genome_annotation_SwissProt_CDS.sh
188 | │ ├── linux_setup.sh
189 | │ ├── snp_calling.sh
190 | │ └── UniProt_downloader.sh
191 | ├── _config.yml # Configures github.io project website
192 | ├── .gitignore
193 | ├── LICENSE
194 | ├── README.md
195 | └── .travis.yml # Configures Travis CI testing for GitHub repo
196 | ```
197 |
--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
1 | remote_theme: pmarsceill/just-the-docs
2 | baseurl: "/bioinfo-notebook" # the subpath of your site, e.g. /blog
3 | url: "https://rnnh.github.io" # the base hostname & protocol for your site, e.g. http://example.com
4 | title: Bioinformatics Notebook
5 | logo: assets/bioinfo-notebook_logo.svg
6 | search_enabled: true
7 | search_tokenizer_separator: /[\s/]+/
8 | aux_links:
9 | "Bioinformatics Notebook on GitHub":
10 | - "//github.com/rnnh/bioinfo-notebook"
11 | heading_anchors: true
12 |
--------------------------------------------------------------------------------
/assets/bioinfo-notebook_logo.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
154 |
--------------------------------------------------------------------------------
/data/design_table.csv:
--------------------------------------------------------------------------------
1 | run,name,condition
2 | SRR8933532,SCEhightemp3,high_temp
3 | SRR8933534,SCEhightemp1,high_temp
4 | SRR8933509,SCEkcl3,osmotic_pressure
5 | SRR8933530,SCElowPH2,low_pH
6 | SRR8933511,SCEanaer2,anaerobic
7 | SRR8933533,SCEhightemp2,high_temp
8 | SRR8933537,SCEstan1,standard
9 | SRR8933506,SCEanaer3,anaerobic
10 | SRR8933531,SCElowPH1,low_pH
11 | SRR8933538,SCEkcl1,osmotic_pressure
12 | SRR8933512,SCEanaer1,anaerobic
13 | SRR8933510,SCEkcl2,osmotic_pressure
14 | SRR8933535,SCEstan3,standard
15 | SRR8933536,SCEstan2,standard
16 | SRR8933539,SCElowPH3,low_pH
17 |
--------------------------------------------------------------------------------
/docs/DE_analysis_edgeR_script.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rnnh/bioinfo-notebook/aa1c8f5318d40c4105a50108ea1a6102433be8a0/docs/DE_analysis_edgeR_script.pdf
--------------------------------------------------------------------------------
/docs/SPAdes.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | title: SPAdes
4 | parent: 2. Program guides
5 | ---
6 |
7 | # SPAdes
8 |
9 | SPAdes [is an assembly toolkit containing various assembly pipelines](https://github.com/ablab/spades/blob/spades_3.14.1/README.md).
10 |
11 | ## Assembling a genome from Illumina paired-end reads using `SPAdes`
12 |
13 | `SPAdes` can be used to assemble paired-end reads as follows:
14 |
15 | ```bash
16 | $ spades -1 reads_1.fq.gz -2 reads_2.fq.gz -t 5 -m 200 -o results/directory/
17 | ```
18 |
19 | In this command...
20 |
21 | 1. **`-1`** is the file with forward reads.
22 | 2. **`-2`** is the file with reverse reads.
23 | 3. **`-t`** or **`--threads`** sets the number of processors/threads to use. The default is 16.
24 | 4. **`-m`** or **`--memory`** is the memory limit in Gb. SPAdes terminates if it reaches this limit. The default value is 250Gb.
25 | 5. **`-o`** or **`--outdir`** is the output directory to use. The default is the current directory.
26 |
27 | SPAdes supports uncompressed (**`.fastq`** or **`.fq`**) or compressed (**`.fastq.gz`** or **`.fq.gz`**) sequencing read inputs.
28 | In the output directory, the assembled genome will be available as contigs (**`contigs.fasta`**) and scaffolds (**`scaffolds.fasta`**), both of which are FASTA nucleotide files.
29 |
30 | ## See also
31 |
32 | - [conda](conda.md): The `bioinfo-notebook` conda environment includes SPAdes
33 | - [File formats used in bioinformatics](file_formats.md)
34 |
35 | ## Further reading
36 |
37 | - [SPAdes on GitHub](https://github.com/ablab/spades/)
38 |
--------------------------------------------------------------------------------
/docs/UniProt_downloader.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | title: UniProt downloader
4 | parent: 3. Scripts
5 | ---
6 |
7 | # UniProt downloader
8 |
9 | [UniProt_downloader.sh](../scripts/UniProt_downloader.sh) is a `bash` shell script for downloading [UniProt](https://www.uniprot.org/) protein sequences to a FASTA amino acid ([.faa](file_formats.md)) file.
10 | It takes a list of UniProt accession numbers as input, and then pipes each one into a `curl` command to download the corresponding protein.
11 | This is essentially a [one-line program](https://en.wikipedia.org/wiki/One-liner_program) wrapped in a shell script to make downloading UniProt sequences easier.
12 |
13 | ## Usage
14 |
15 | ```
16 | UniProt_downloader.sh [-h|--help] [-p|--processors n -o|--output] -i|--input
17 |
18 | This script takes a list of UniProt primary accession numbers (*.list), and
19 | downloads the corresponding protein sequences from UniProt as a FASTA amino
20 | acid (.faa) file.
21 |
22 | This list can be generated by searching UniProtKB for a desired term (e.g.
23 | 'taxonomy:147537' for the Saccharomycotina subphylum), selecting 'Download'
24 | and 'Format: List' to download the accession numbers of the corresponding
25 | results.
26 |
27 | arguments:
28 | -h | --help show this help text and exit
29 | -i | --input the list of UniProt proteins to download
30 | -p | --processors optional: set the number (n) of processors to
31 | use (default: 1)
32 | -o | --output optional: name of the output .faa file
33 | (default: uniprot_{date}.faa)
34 | ```
35 |
36 | ## See also
37 |
38 | - [UniProt_downloader.sh on GitHub](https://github.com/rnnh/bioinfo-notebook/blob/master/scripts/UniProt_downloader.sh)
39 | - [File formats used in bioinformatics](file_formats.md)
40 | - [UniProt](https://www.uniprot.org/)
41 |
--------------------------------------------------------------------------------
/docs/annotated_snps_filter.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | title: Annotated SNPs filter
4 | parent: 3. Scripts
5 | ---
6 |
7 | # Annotated SNPs filter
8 |
9 | [annotated_snps_filter.R](../scripts/annotated_snps_filter.R) is an `R` script that cross-references annotated SNP files created using [annotating_snps.R](annotating_snps.md).
10 | It takes two files created using this script, and returns unique SNPs for each file.
11 | If a SNP in File 1 is not found at the same position on the same sequence as File 2, it is returned as a unique SNP, and vice versa.
12 | These unique SNPs are then written to new `.tsv` files.
13 |
14 | ## Usage
15 |
16 | To use this script, variables need to be defined on lines 21 and 22 of the script:
17 |
18 | - Assign the name of the first annotated SNP file to be filtered to 'annotated_SNP_file_1'.
19 | - Assign the name of the second annotated SNP file to be filtered to 'annotated_SNP_file_2'.
20 | - These files should be in the `~/bioinfo-notebook/data/` directory.
21 | - Optional: the name of the output files can be assigned on lines 109 and 115 respectively.
22 |
23 | ## See also
24 |
25 | - [annotated_snps_filter.R on GitHub](https://github.com/rnnh/bioinfo-notebook/blob/master/scripts/annotated_snps_filter.R)
26 | - [annotating_snps.R](annotating_snps.md)
27 | - [File formats used in bioinformatics](file_formats.md)
28 | - [snp_calling.sh](snp_calling.md), a script for generating VCF files of SNPs.
29 | - [genome_annotation_SwissProt_CDS.sh](genome_annotation_SwissProt_CDS.md), a script for generating genome annotation GFF files.
30 |
--------------------------------------------------------------------------------
/docs/annotating_snps.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | title: Annotating SNPs
4 | parent: 3. Scripts
5 | ---
6 |
7 | # Annotating SNPs
8 |
9 | [annotating_snps.R](../scripts/annotating_snps.R) is an `R` script that cross-references annotations of genome assemblies with VCF files containing SNPs of sequencing reads aligned against
10 | those genome assemblies.
11 | If a SNP falls within- or upstream of- an annotated genome feature (start codon, stop codon, CDS, etc.), the script will return that feature along with the SNP.
12 | For this script to work, these files need to use the same sequence names: e.g. if the first sequence in the VCF is called "chrI", there should be a corresponding sequence called "chrI" in the GFF file.
13 |
14 | ## Usage
15 |
16 | To use this script, variables need to be defined on lines 28 to 32 of the script:
17 |
18 | - The GFF file name should be assigned to the variable `GFF_file`.
19 | - The VCF file name should be assigned to the variable `VCF_file`.
20 | - The VCF and GFF files should be in the directory `~/bioinfo-notebook/data/`.
21 | - The number of lines in the VCF file header should be specified in the `VCF_header.int` variable. This is the number of lines that begin with `#` in the VCF file.
22 | - The variable `upstream.int` is used to determine how far upstream from an annotated feature a SNP can be. This can be set to 0 if you do not want upstream SNPs to be considered. Setting it to 1000 will mean that SNPs up to 1,000 bases/1kb upstream from a feature will be annotated.
23 | - The variable 'output_name' is used to specify the name of the output file, which should end in '.tsv' as it will be a tab-separated values text file.
24 |
25 | ## Annotated SNP format
26 |
27 | The `.tsv` files created by this script have a combination of columns from the [GFF and VCF formats](file_formats.md) as follows...
28 |
29 | 1. `sequence` The name of the sequence where the feature is located.
30 | 2. `source` Keyword identifying the source of the feature, like a program (e.g. Augustus) or an organization (e.g. [SGD](https://www.yeastgenome.org/)).
31 | 3. `feature` The feature type name, like `gene` or `exon`. In a well-structured GFF file, all the children features always follow their parents in a single block (so all exons of a transcript are put after their parent `transcript` feature line and before any other parent transcript line).
32 | 4. `start` Genomic start of the feature, with a 1-base offset.
33 | 5. `end` Genomic end of the feature, with a 1-base offset.
34 | 6. `score` Numeric value that generally indicates the confidence of the source in the annotated feature. A value of `.` (a dot) is used to define a null value.
35 | 7. `strand` Single character that indicates the strand of the feature; it can assume the values of `+` (positive, or `5'->3'`), `-`, (negative, or `3'->5'`), `.` (undetermined).
36 | 8. `phase` Phase of coding sequence (CDS) features, indicating where the feature starts in relation to the reading frame. It can be either one of `0`, `1`, `2` (for CDS features) or `.` (for everything else).
37 | 9. `attributes` All the other information pertaining to this feature. The format, structure and content of this field is the one which varies the most between GFF formats.
38 | 10. `POS` The 1-based position of the variation on the given sequence.
39 | 11. `REF` The reference base (or bases in the case of an indel) at the given position on the given reference sequence.
40 | 12. `ALT` The list of alternative alleles at this position.
41 | 13. `QUAL` A quality score associated with the inference of the given alleles.
42 | 14. `FILTER` A flag indicating which of a given set of filters the variation has passed.
43 | 15. `INFO` An extensible list of key-value pairs (fields) describing the variation. Multiple fields are separated by semicolons with optional values in the format: `key=data[,data]`.
44 | 16. `SAMPLE` For each (optional) sample described in the file, values are given for the fields listed in FORMAT. If multiple samples have been aligned to the reference sequence, each sample will have its own column.
45 |
46 |
47 | ## See also
48 |
49 | - [annotating_snps.R on GitHub](https://github.com/rnnh/bioinfo-notebook/blob/master/scripts/annotating_snps.R)
50 | - [annotated_snps_filter.R](annotated_snps_filter.md)
51 | - [File formats used in bioinformatics](file_formats.md)
52 | - [snp_calling.sh](snp_calling.md), a script for generating VCF files of SNPs.
53 | - [genome_annotation_SwissProt_CDS.sh](genome_annotation_SwissProt_CDS.md), a script for generating genome annotation GFF files.
54 |
--------------------------------------------------------------------------------
/docs/augustus.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | title: Augustus
4 | parent: 2. Program guides
5 | ---
6 |
7 | # Augustus
8 |
9 | Augustus is a program that predicts genes in eukaryotic genomic sequences.
10 | It can be run online, with a server for [smaller files](http://bioinf.uni-greifswald.de/augustus/submission.php) and one for [larger files](http://bioinf.uni-greifswald.de/webaugustus/), or locally.
11 | The local version of Augustus can be installed through [conda](conda.md).
12 | This project includes an example [augustus conda environment](../envs/augustus.yml).
13 |
14 | ## Predicting genes in a eukaryotic FASTA nucleic acid file using `augustus`
15 |
16 | `augustus` can be used to predict genes as follows:
17 |
18 | ```bash
19 | $ augustus --species=species_name input_file.fna > output_file.gff
20 | ```
21 |
22 | In this command...
23 |
24 | 1. `--species` is used to specify the target species for gene predictions (`species_name`).
25 | 2. `input_file.fna` is the input FASTA nucleic acid file ([.fna](file_formats.md#fasta)).
26 | 3. `output_file.gff` is the general feature format ([GFF](file_formats.md#generic-feature-formats)) genome annotation output file.
27 | Lines beginning with `#` are Augustus comments: these lines do not follow the GFF structure.
28 |
29 | The following command gives the list of valid species names for use with Augustus:
30 |
31 | ```bash
32 | $ augustus --species=help
33 | ```
34 |
35 | ## Extracting the FASTA amino acid sequences of predicted genes from an Augustus annotation
36 |
37 | The genome annotation file produced by `augustus` (`output_file.gff`) contains the amino acid sequences of predicted genes in comment lines.
38 | These amino acid sequences can be extracted to a FASTA file with the following command:
39 |
40 | ```bash
41 | $ getAnnoFasta.pl output_file.gff
42 | ```
43 |
44 | The amino acid sequences will be written to `output_file.aa`.
45 | This is a FASTA amino acid ([.faa](file_formats.md#fasta)).
46 | The extension of this file can be changed from ".aa" to ".faa" with the following command:
47 |
48 | ```bash
49 | $ mv output_file.aa output_file.faa
50 | ```
51 |
52 | ## Removing comments from Augustus annotations
53 |
54 | Genome annotations produced by Augustus follow the [Generic Feature Format](file_formats.md#generic-feature-formats), with the addition of comment lines for amino acid sequences.
55 | These are the same FASTA amino acid sequences that are extracted using `getAnnoFasta.pl`.
56 | These lines begin with the character `#`, and removing them results in a standard GFF file.
57 |
58 | Here is one method for removing these amino acid lines, using `grep -v` to select lines which do not contain the `#` character:
59 |
60 | ```bash
61 | $ grep -v "#" augustus_annotation.gff > clean_augustus_annotation.gff
62 | ```
63 |
64 | ## Demonstration
65 |
66 | In this video, `augustus` is used to predict genes in `example_nucleotide_sequence.fasta`.
67 | This results in a genome annotation file: `augustus_example.gff`.
68 | The script `getAnnoFasta.pl` is used to extract the amino acid sequences in this genome annotation file to a new FASTA amino acid file: `augustus_example.aa`.
69 | The `mv` command is used to change the extension of this file from ".aa" to ".faa".
70 |
71 | [](https://asciinema.org/a/346541?autoplay=1)
72 |
73 | ## See also
74 |
75 | - [conda](conda.md)
76 | - [augustus conda environment](../envs/augustus.yml)
77 | - [File formats used in bioinformatics](file_formats.md)
78 |
79 | ## References
80 |
81 | - [The Augustus website](http://bioinf.uni-greifswald.de/augustus/)
82 | - [GNU grep](https://www.gnu.org/software/grep/manual/grep.html)
83 |
--------------------------------------------------------------------------------
/docs/bcftools.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | title: Bcftools
4 | parent: 2. Program guides
5 | ---
6 |
7 | # Bcftools
8 |
9 | Bcftools are a set of [utilities for variant calling and manipulating VCFs and BCFs](https://samtools.github.io/bcftools/bcftools.html).
10 |
11 | ## Generating genotype likelihoods for alignment files using `bcftools mpileup`
12 |
13 | `bcftools mpileup` can be used to generate VCF or BCF files containing genotype likelihoods for one or multiple alignment (BAM or CRAM) files as follows:
14 |
15 | ```bash
16 | $ bcftools mpileup --max-depth 10000 --threads n -f reference.fasta -o genotype_likelihoods.bcf reference_sequence_alignment.bam
17 | ```
18 |
19 | In this command...
20 |
21 | 1. **`--max-depth`** or **`-d`** sets the reads per input file for each position in the alignment. In this case, it is set to 10000
22 | 2. **`--threads`** sets the number (*n*) of processors/threads to use.
23 | 3. **`--fasta-ref`** or **`-f`** is used to select the [faidx-indexed FASTA](samtools.md#indexing-a-fasta-file-using-samtools-faidx) nucleotide reference file (*reference.fasta*) used for the alignment.
24 | 4. **`--output`** or **`-o`** is used to name the output file (*genotype_likelihoods.bcf*).
25 | 5. The final argument given is the input BAM alignment file (*reference_sequence_alignment.bam*). Multiple input files can be given here.
26 |
27 | ## Variant calling using `bcftools call`
28 |
29 | `bcftools call` can be used to call SNP/indel variants from a BCF file as follows:
30 |
31 | ```bash
32 | $ bcftools call -O b --threads n -vc --ploidy 1 -p 0.05 -o variants_unfiltered.bcf genotype_likelihoods.bcf
33 | ```
34 |
35 | In this command...
36 |
37 | 1. **`--output-type`** or **`-O`** is used to select the output format. In this case, *b* for BCF.
38 | 2. **`--threads`** sets the number (*n*) of processors/threads to use.
39 | 3. **`-vc`** specifies that we want the output to contain variants only, using the original [SAMtools](samtools.md) consensus caller.
40 | 4. **`--ploidy`** specifies the ploidy of the assembly.
41 | 5. **`--pval-threshold`** or **`-p`** is used to set the p-value threshold for variant sites (*0.05*).
42 | 6. **`--output`** or **`-o`** is used to name the output file (*variants_unfiltered.bcf*).
43 | 7. The final argument is the input BCF file (*genotype_likelihoods.bcf*).
44 |
45 | ## Filtering variants using `bcftools filter`
46 |
47 | `bcftools filter` can be used to filter variants from a BCF file as follows...
48 |
49 | ```bash
50 | $ bcftools filter --threads n -i '%QUAL>=20' -O v -o variants_filtered.vcf variants_unfiltered.bcf
51 | ```
52 |
53 | In this command...
54 |
55 | 1. **`--threads`** sets the number (*n*) of processors/threads to use.
56 | 2. **`--include`** or **`-i`** is used to define the expression used to filter sites. In this case, *`%QUAL>=20`* results in sites with a quality score greater than or equal to 20.
57 | 3. **`--output-type`** or **`-O`** is used to select the output format. In this case, *v* for VCF.
58 | 4. **`--output`** or **`-o`** is used to name the output file (*variants_filtered.vcf*).
59 | 5. The final argument is the input BCF file (*variants_unfiltered.bcf*).
60 |
61 | ## See also
62 |
63 | - [File formats used in bioinformatics](file_formats.md)
64 | - [SNP calling script](snp_calling.md)
65 |
66 | ## Further reading
67 |
68 | - [bcftools documentation](https://samtools.github.io/bcftools/bcftools.html)
69 |
--------------------------------------------------------------------------------
/docs/blast.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | title: BLAST
4 | parent: 2. Program guides
5 | ---
6 |
7 | # BLAST
8 |
9 | The Basic Local Alignment Search Tool (BLAST) is an algorithm and program for comparing primary biological sequence information, such as the amino-acid sequences of proteins or the nucleotides of DNA and/or RNA sequences.
10 | BLAST is one of the most widely used tools in bioinformatics; it can be applied to different problems or projects in a myriad ways.
11 |
12 | ## Contents
13 |
14 | - [How BLAST works](#how-blast-works)
15 | - [The command line version of BLAST](#the-command-line-version-of-blast)
16 | - [Types of BLAST search](#types-of-blast-search)
17 | - [E-value and Bit-score](#e-value-and-bit-score)
18 | - [Creating a BLAST database using `makeblastdb`](#creating-a-blast-database-using-makeblastdb)
19 | - [Creating local BLAST database from Swiss-Prot](#downloading-swiss-prot-fasta-sequences-and-creating-a-blast-protein-database)
20 | - [Searching against a BLAST nucleotide database using `blastn`](#searching-against-a-blast-nucleotide-database-using-blastn)
21 | - [BLAST `-outfmt 6` results](#blast--outfmt-6-results)
22 | - [Video demonstration](#video-demonstration)
23 | - [See also](#see-also)
24 | - [References](#references)
25 |
26 | ## How BLAST works
27 |
28 | There are two main steps in BLAST:
29 |
30 | 1. A list of "words" (sets of characters/residues) of length *k* is created for the query sequence. By default, *k* = 3 for amino acid sequences, and *k* = 11 for nucleotide sequences.
31 | 2. An alignment is made for database (subject) sequences that share many words with the query sequence. This is a local alignment in which only High-scoring Segment Pairs (HSPs) are reported. In other words, BLAST finds islands of similarity between sequences.
32 |
33 | 
34 |
35 | ## The command line version of BLAST
36 |
37 | BLAST can be used online, or through the command line.
38 | Most biologists are familiar with [NCBI's web application for BLAST](https://blast.ncbi.nlm.nih.gov/Blast.cgi).
39 | If you use this web application regularly, the command line BLAST program is worth your consideration.
40 | The command line version of BLAST has several advantages over the web version:
41 |
42 | 1. BLAST on the command line can be used to run *local searches*, i.e. searches which use files that are on your computer, instead of files that are on an NCBI database.
43 | 2. BLAST searches on the command line can be made more specific by adding additional arguments.
44 | 3. BLAST searches carried out on the command line can be automated, and incorporated into larger scripts.
45 | 4. The command line BLAST program can output search results in various structured text formats.
46 |
47 | The command line version of BLAST can be downloaded via [conda](conda.md) using the following command:
48 |
49 | ```bash
50 | $ conda install -c bioconda blast
51 | ```
52 |
53 | This program is included in the [bioinfo-notebook conda environment](../envs/bioinfo-notebook.txt).
54 |
55 | ## Types of BLAST search
56 |
57 | There are five main types of BLAST search:
58 |
59 | 1. **BLASTp** searches a protein database with a protein query sequence.
60 | 2. **BLASTn** searches a nucleic acid database with nucleic acid query sequence.
61 | 3. **BLASTx** searches a protein database with nucleic acid query sequence, which is translated into an amino acid sequence.
62 | 4. **tBLASTx** searches a nucleic acid database with nucleic acid query sequence. In this case, both the database (subject) sequences and query sequence are translated into amino acid sequences.
63 | 5. **tBLASTn** searches a nucleic acid database with protein query sequence. In this case, the nucleic acid database is translated into a set of amino acid sequences.
64 |
65 | While the type of query and subject sequences required for each of these BLAST searches differs, the command line arguments that can be used for these BLAST searches are interchangeable.
66 |
67 | ## E-value and Bit-score
68 |
69 | Two important variables when interpreting BLAST results are *E-value* and *bit-score*.
70 | These are both derived from the *raw alignment score (S)*, which is based on the number of residues (i.e. individual amino/nucleic acids) that two sequences have in common.
71 | The more identical residues that two sequences have at the same position in an alignment, the higher the alignment score.
72 |
73 | - **Bit-score (S')** is the raw alignment score (S) normalised with respect to the scoring system used for the alignment.
74 | - **E-value** or Expectation value is the number of different alignments with scores equivalent to or better than S that is expected to occur in a database search by chance. The lower the E value, the more significant the score and the alignment. An exact match between query and subject sequences results in an E-value of zero.
75 |
76 | While bit-scores are comparable between searches, as they are normalised, they do not take the size of the database into account.
77 | E-values, however, do account for the size of the database.
78 | The lower the E-value and the higher the bit-score, the better the BLAST result.
79 |
80 | ## Creating a BLAST database using `makeblastdb`
81 |
82 | To search against a set of nucleotide or amino acid sequences using BLAST, a database must be created.
83 | This can be done using the `makeblastdb` command.
84 |
85 | ```bash
86 | $ makeblastdb -dbtype prot/nucl -in input_file -out database_name
87 | ```
88 |
89 | In this command...
90 |
91 | 1. `-dbtype` specifies the type of sequences used to create the database. For amino acid (protein) sequences, `prot` is used ("`-dbtype prot`"). For nucleic acid sequences, `nucl` is used ("`-dbtype nucl`").
92 | 2. `-in` is used to specify the input file. The database created can be used to search against the sequences in this file.
93 | 3. `-out` is used to name the database that will be created from the input file.
94 |
95 | ## Downloading Swiss-Prot FASTA sequences and creating a BLAST protein database
96 |
97 | In this video, the FASTA amino acid sequences of Swiss-Prot are downloaded, and a BLAST protein database is created from these sequences using `makeblastdb`.
98 | [UniProtKB/Swiss-Prot](https://en.wikipedia.org/wiki/UniProt#UniProtKB.2FSwiss-Prot) is a manually annotated, non-redundant protein sequence database.
99 | As it is well-annotated and curated, the Swiss-Prot database gives informative results when searched locally using `blastp` and `blastx`.
100 | The link used in the `wget` command is copied and pasted from the [UniProt downloads page](https://www.uniprot.org/downloads).
101 | The compressed FASTA sequences of the Swiss-Prot database are hosted on `ftp.uniprot.org`.
102 |
103 | These FASTA amino acid sequences are compressed into a `.gz` (gzip) file.
104 | Before using the `makeblastdb` command, this FASTA file is uncompressed using `gunzip`, turning `uniprot_sprot`**`.fasta.gz`** into `uniprot_sprot`**`.fasta`**.
105 | Once the FASTA file is downloaded and uncompressed, `makeblastdb` is used to create a BLAST protein database of the amino acid sequences in this FASTA file.
106 | This BLAST protein database is named `swissprot`, and consists of three binary files.
107 |
108 | Once the BLAST protein database is created, `blastp` and `blastx` can be used to search sequences against it.
109 | This database can be selected using the argument `-db swissprot` with `blastp` or `blastx` (the path to the `swissprot` database will need to be given if the command is run from a different directory).
110 |
111 | [](https://asciinema.org/a/338534?autoplay=1)
112 |
113 | ## Searching against a BLAST nucleotide database using `blastn`
114 |
115 | The program `blastn` is used for searching nucleotide databases with a nucleotide query.
116 |
117 | ```bash
118 | $ blastn -query query_file.fna -db nucl_database_name -out results_file.tsv -outfmt 6 -evalue x -max_hsps y -num_threads n
119 | ```
120 |
121 | In this command...
122 |
123 | 1. `-query` is used to select the FASTA nucleic acids file you want to search against the BLAST database (the `query_file.fna`).
124 | 2. `-db` is used to select the BLAST nucleotide database you want to search against (`nucl_database_name`).
125 | 3. `-out` is used to direct the results to an output file (`results_file.tsv`).
126 | 4. `-outfmt` is used to specify how this results file should be formatted. In this case, as `-outfmt` is `6`, the results will be written to a file as tab-separated values: this is why `results_file.tsv` has a `.tsv` extension.
127 | 5. `-evalue` is used to set an E-value threshold (`x`). Results which have an E-value greater than this threshold will not be written to the results file.
128 | 6. `-max_hsps` is used to set a High-scoring Segment Pairs (HSPs) threshold (`y`). When given, no more than `y` HSPs (alignments) for each query-subject pair will be written to the results file.
129 | 7. `-num_threads` is used to set the number (*`n`*) of threads/processors to use (default 1).
130 |
131 | The last two arguments given in this command- `-evalue` and `-max_hsps`- are optional, but they are useful as they allow the results to be filtered before being written to the file.
132 | Using these arguments will result in more specific results, and will reduce the need to manually filter results later.
133 |
134 | ## BLAST `-outfmt 6` results
135 |
136 | These BLAST results are taken from the [video demonstration](#video_demonstration) and are in BLAST output format 6.
137 |
138 | ```
139 | gi|242120357|gb|FJ461870.1| NC_001144.5 93.252 163 11 0 196 358 454921 454759 7.57e-63 241
140 | gi|242120357|gb|FJ461870.1| NC_001144.5 93.252 163 11 0 196 358 464058 463896 7.57e-63 241
141 | gi|242120357|gb|FJ461870.1| CP036478.1 93.252 163 11 0 196 358 454829 454667 7.57e-63 241
142 | gi|242120357|gb|FJ461870.1| CP036478.1 93.252 163 11 0 196 358 463966 463804 7.57e-63 241
143 | gi|242120357|gb|FJ461870.1| CP024006.1 93.252 163 11 0 196 358 453978 453816 7.57e-63 241
144 | ```
145 |
146 | These results are tab-separated values, meaning each column in the results is separated by a `Tab` character.
147 | These columns always appear in the same order:
148 |
149 | ```
150 | query_id subject_id per_identity aln_length mismatches gap_openings q_start q_end s_start s_end e-value bit_score
151 | ```
152 |
153 | In this format...
154 |
155 | 1. `query_id` is the FASTA header of the sequence being searched against the database (the query sequence).
156 | 2. `subject_id` is the FASTA header of the sequence in the database that the query sequence has been aligned to (the subject sequence).
157 | 3. `per_identity` is the percentage identity- the extent to which the query and subject sequences have the same residues at the same positions.
158 | 4. `aln_length` is the alignment length.
159 | 5. `mismatches` is the number of mismatches.
160 | 6. `gap_openings` is the number of gap openings in the alignment.
161 | 7. `q_start` is the start of the alignment in the query sequence.
162 | 8. `q_end` is the end of the alignment in the query sequence.
163 | 9. `s_start` is the start of the alignment in the subject sequence.
164 | 10. `s_end` is the end of the alignment in the subject sequence.
165 | 11. `e_value` is the expect value (E-value) for the alignment.
166 | 12. `bit_score` is the bit-score of the alignment.
167 |
168 | All BLAST output formats above 4 (i.e. `-outfmt > 4`) use this tabular layout, formatted in different ways.
169 | For example, `-outfmt 10` gives the same information in a comma-separated values (`.csv`) file instead of a tab-separated values (`.tsv`) file.
170 |
171 | ## Video demonstration
172 |
173 | In this demonstration, `makeblastdb` is used to create a BLAST database from the file `S_cere_genomes.fna`.
174 | This FASTA nucleic acids (`.fna`) file was created by concatenating the following *Saccharomyces cerevisiae* genome assemblies, which were downloaded from NCBI: [GCA_003086655.1](https://www.ncbi.nlm.nih.gov/assembly/GCA_003086655.1), [GCA_003709285.1](https://www.ncbi.nlm.nih.gov/assembly/GCA_003709285.1) and [GCA_004328465.1](https://www.ncbi.nlm.nih.gov/assembly/GCA_004328465.1).
175 |
176 | The program `blastn` is then used to query `23S_rRNA_gene.fna` against this database.
177 | This file is a copy of the [*Scutellospora fulgida* isolate NC303A 25S ribosomal RNA gene](https://www.ncbi.nlm.nih.gov/nuccore/FJ461870.1?report=fasta) from NCBI.
178 |
179 | The program `tblastn` is also used to query `YPK1.faa` against this database multiple times.
180 | This FASTA amino acid (`.faa`) file is a copy of the [serine/threonine-protein kinase YPK1](https://www.uniprot.org/uniprot/P12688) from UniProt.
181 | This search is carried out multiple times with additional parameters: the flag `-evalue` is used to set an E-value threshold, and the flag `-max_hsps` is used to set a maximum number of High-scoring Segment Pairs (HSPs).
182 |
183 | The results from these BLAST searches are written to tab-separated values (`.tsv`) files.
184 | This output format is specified with the flag `-outfmt 6`.
185 |
186 | [](https://asciinema.org/a/327279?autoplay=1)
187 |
188 | ## See also
189 |
190 | - [File formats used in bioinformatics](file_formats.md)
191 | - [Introduction to the command line](cl_intro.md)
192 | - [conda](conda.md)
193 | - [NCBI's web application for BLAST](https://blast.ncbi.nlm.nih.gov/Blast.cgi)
194 |
195 | ## References
196 |
197 | - [BLAST® Command Line Applications User Manual](https://www.ncbi.nlm.nih.gov/books/NBK279690/)
198 | - [BLAST Glossary](https://www.ncbi.nlm.nih.gov/books/NBK62051/)
199 |
--------------------------------------------------------------------------------
/docs/bowtie.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | title: Bowtie
4 | parent: 2. Program guides
5 | ---
6 |
7 | # Bowtie
8 |
9 | `bowtie` can be used to:
10 | - index reference FASTA nucleotide genomes/sequences
11 | - align FASTQ sequencing reads to those genomes/sequences
12 |
13 | If you want to align short reads (50bp or less), [bowtie is more suitable than bowtie2](bowtie2.md#differences-between-bowtie-and-bowtie2).
14 |
15 | ## Indexing a reference genome/sequence using `bowtie-build`
16 |
17 | Before aligning reads to a reference genome with `bowtie`, it must be indexed using `bowtie-build`.
19 | This command will create six files with the extensions `.1.ebwt`, `.2.ebwt`, `.3.ebwt`, `.4.ebwt`, `.rev.1.ebwt`, and `.rev.2.ebwt`.
20 | These six files together are the index.
21 | Once an index has been created, the original reference genome/sequence is no longer needed to align reads.
22 | Here's an example `bowtie-build` command:
23 |
24 | ```
25 | $ bowtie-build reference_sequence.fasta index_name
26 | ```
27 |
28 | In this command, the `reference_sequence.fasta` is the nucleotide FASTA sequence we want to index, and `index_name` is the name of the index.
29 | There will be six files beginning with the `index_name` in the output directory: `index_name.1.ebwt`, `index_name.2.ebwt`, `index_name.3.ebwt`, `index_name.4.ebwt`, `index_name.rev.1.ebwt`, and `index_name.rev.2.ebwt`.
30 | There's no need to specify any of these files individually in subsequent `bowtie` commands, the `index_name` alone is enough to refer to the entire index.
31 |
32 | ## Aligning reads to an indexed genome/sequence using `bowtie`
33 |
34 | Now that the genome has been indexed, FASTQ sequencing reads can be aligned to it.
35 | This is done using the `bowtie` command.
36 | Here is an example `bowtie` command:
37 |
38 | ```
39 | $ bowtie --no-unal --threads n --sam index_name -1 reads_1.fastq -2 reads_2.fastq output.sam
40 | ```
41 |
42 | In this command...
43 |
44 | 1. **`--no-unal`** is an optional argument, meaning reads that do not align to the reference genome will not be written to `sam` output
45 | 2. **`--threads`** is the number (*n*) of processors/threads used
46 | 3. **`--sam`** specifies that the output should be written in the [SAM format](file_formats.md#sam)
47 | 4. **`index_name`** is the name of the genome index
48 | 5. **`-1`** is the file(s) containing mate 1 reads ([`reads_1.fastq`](file_formats.md#fastq))
49 | 6. **`-2`** is the file(s) containing mate 2 reads ([`reads_2.fastq`](file_formats.md#fastq))
50 | 7. **`output.sam`** is the output alignment in `sam` format
51 |
52 | ## Demonstration
53 |
54 | In this video, `bowtie-build` is used to index `S_cere_GCF_000146045.2_R64_genomic.fna`, which is a copy of the [*Saccharomyces cerevisiae* S288C genome from RefSeq](https://www.ncbi.nlm.nih.gov/assembly/GCF_000146045.2).
55 | The `bowtie` command is then used to align [*Saccharomyces cerevisiae* RNAseq reads](https://www.ncbi.nlm.nih.gov/sra/SRR11462797) to this bowtie index.
56 |
57 | [](https://asciinema.org/a/316272?autoplay=1)
58 |
59 | ## Further reading
60 |
61 | 1. The `bowtie` manual:
62 |
--------------------------------------------------------------------------------
/docs/bowtie2.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | title: Bowtie2
4 | parent: 2. Program guides
5 | ---
6 |
7 | # Bowtie2
8 |
9 | From the manual: [*"Bowtie 2 is an ultrafast and memory-efficient tool for aligning sequencing reads to long reference sequences"*](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml).
10 |
11 | `bowtie2` can be used to:
12 | - index reference FASTA nucleotide genomes/sequences
13 | - align FASTQ sequencing reads to those genomes/sequences
14 |
15 | ## Differences between `bowtie` and `bowtie2`
16 |
17 | - `bowtie2` has no upper limit on read length
18 | - `bowtie2` can make gapped alignments
19 | - `bowtie2` is more flexible for paired-end alignment
20 | - `bowtie2` is faster and more memory efficient
21 | - `bowtie` is advantageous over `bowtie2` for relatively short sequencing reads (50bp or less)
22 |
23 | ## Indexing a reference genome/sequence using `bowtie2-build`
24 |
25 | Before aligning reads to a reference genome with `bowtie2`, it must be indexed using `bowtie2-build`.
26 | This command will create six files with the extensions `.1.bt2`, `.2.bt2`, `.3.bt2`, `.4.bt2`, `.rev.1.bt2`, and `.rev.2.bt2`.
27 | These six files together are the index.
28 | Once an index has been created, the original reference genome/sequence is no longer needed to align reads.
29 | Here's an example `bowtie2-build` command:
30 |
31 | ```
32 | $ bowtie2-build reference_sequence.fasta index_name
33 | ```
34 |
35 | In this command, the `reference_sequence.fasta` is the nucleotide FASTA sequence we want to index, and `index_name` is the name of the index.
36 | There will be six files beginning with the `index_name` in the output directory: `index_name.1.bt2`, `index_name.2.bt2`, `index_name.3.bt2`, `index_name.4.bt2`, `index_name.rev.1.bt2`, and `index_name.rev.2.bt2`.
37 | There's no need to specify any of these files individually, just the `index_name` alone is enough to refer to the entire index.
38 |
39 | ## Aligning reads to an indexed genome/sequence using `bowtie2`
40 |
41 | Now that the genome has been indexed, FASTQ sequencing reads can be aligned to it.
42 | This is done using the `bowtie2` command.
43 | Here's an example `bowtie2` command:
44 |
45 | ```
46 | $ bowtie2 --no-unal -p n -x index_name -1 reads_1.fastq -2 reads_2.fastq -S output.sam
47 | ```
48 |
49 | In this command...
50 |
51 | 1. **`--no-unal`** is an optional argument, meaning reads that do not align to the reference genome will not be written to `sam` output
52 | 2. **`-p`** is the number (*n*) of processors/threads used
53 | 3. **`-x`** is the genome index
54 | 4. **`-1`** is the file(s) containing mate 1 reads
55 | 5. **`-2`** is the file(s) containing mate 2 reads
56 | 6. **`-S`** is the output alignment in `sam` format
57 |
58 | ## Demonstration
59 |
60 | In this video, `bowtie2-build` is used to index `example_nucleotide_sequence.fasta`, and the command `bowtie2` is used to align reads to this bowtie2 index.
61 |
62 | [](https://asciinema.org/a/306546?autoplay=1)
63 |
64 | ## Further reading
65 |
66 | 1. The `bowtie2` manual:
67 |
--------------------------------------------------------------------------------
/docs/cl_solutions.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | title: Command line exercise solutions
4 | nav_exclude: true
5 | ---
6 |
7 | # Command line exercise solutions
8 |
9 | The `pwd` commands in these solutions are added to clarify the working directories used.
10 |
11 | **1.** Change the working directory from `bioinfo-notebook/` to `bioinfo-notebook/data/`.
12 |
13 | ```bash
14 | ronan@dell:~/bioinfo-notebook$ pwd
15 | /home/ronan/bioinfo-notebook
16 | ronan@dell:~/bioinfo-notebook$ cd data/
17 | ronan@dell:~/bioinfo-notebook/data$ pwd
18 | /home/ronan/bioinfo-notebook/data
19 | ```
20 |
21 | **2.** Change the working directory from `bioinfo-notebook/data` to `bioinfo-notebook/docs`, using `../` in your command.
22 |
23 | ```bash
24 | ronan@dell:~/bioinfo-notebook/data$ pwd
25 | /home/ronan/bioinfo-notebook/data
26 | ronan@dell:~/bioinfo-notebook/data$ cd ../docs/
27 | ronan@dell:~/bioinfo-notebook/docs$ pwd
28 | /home/ronan/bioinfo-notebook/docs
29 | ```
30 |
31 | **3.** List the files in the `bioinfo-notebook/docs/` directory.
32 |
33 | ```bash
34 | ronan@dell:~/bioinfo-notebook/docs$ pwd
35 | /home/ronan/bioinfo-notebook/docs
36 | ronan@dell:~/bioinfo-notebook/docs$ ls
37 | bowtie2.md file_formats.md
38 | bowtie.md htseq-count.md
39 | cl_intro.md linux_setup.md
40 | cl_solutions.md part1.md
41 | combining_featCount_tables.md part2.md
42 | conda.md part3.md
43 | fasterq-dump.md samtools.md
44 | fastq-dump.md to_do.md
45 | fastq-dump_to_featureCounts.md ubuntu_virtualbox.md
46 | featureCounts.md wsl.md
47 | ```
48 |
49 | **4.** Select a file in the `bioinfo-notebook/docs/` directory, and display the first 6 lines of it using the `head` command.
50 |
51 | ```bash
52 | ronan@dell:~/bioinfo-notebook/docs$ pwd
53 | /home/ronan/bioinfo-notebook/docs
54 | ronan@dell:~/bioinfo-notebook/docs$ head cl_solutions.md
55 | ---
56 | layout: default
57 | title: Command line exercise solutions
58 | nav_exclude: true
59 | ---
60 |
61 | # Command line exercise solutions
62 |
63 | 1. Change the working directory from `bioinfo-notebook/` to `bioinfo-notebook/data/`.
64 | ```
65 |
66 | **5.** Display the last 2 lines of all the files in the `bioinfo-notebook/docs/` directory, using the `tail` command.
67 |
68 | ```bash
69 | ronan@dell:~/bioinfo-notebook/docs$ pwd
70 | /home/ronan/bioinfo-notebook/docs
71 | ronan@dell:~/bioinfo-notebook/docs$ tail -n 2 *
72 | ```
73 | ```
74 | ==> bowtie2.md <==
75 |
76 | 1. The `bowtie2` manual:
77 |
78 | ==> bowtie.md <==
79 |
80 | 1. The `bowtie` manual:
81 |
82 | ==> cl_intro.md <==
83 | - [File formats used in bioinformatics](file_formats.md)
84 | - [The DataCamp "Introduction to Shell" interactive course](https://www.datacamp.com/courses/introduction-to-shell-for-data-science)
85 |
86 | ==> cl_solutions.md <==
87 | 5. Display the last 2 lines of all the files in the `bioinfo-notebook/docs/` directory, using the `tail` command.
88 | 6. From the `bioinfo-notebook/docs/` directory, list the files in the `bioinfo-notebook/envs/` directory.
89 |
90 | ==> combining_featCount_tables.md <==
91 |
92 | - [fastq-dump_to_featureCounts.sh](fastq-dump_to_featureCounts.md)
93 |
94 | ==> conda.md <==
95 | 2. Conda packages:
96 | 3. Conda environments:
97 |
98 | ==> fasterq-dump.md <==
99 |
100 | 1. [How to use fasterq-dump from the sra-tools wiki on GitHub](https://github.com/ncbi/sra-tools/wiki/HowTo:-fasterq-dump)
101 |
102 | ==> fastq-dump.md <==
103 |
104 | 1. Rob Edward's notes on `fastq-dump`:
105 |
106 | ==> fastq-dump_to_featureCounts.md <==
107 |
108 | 1. [fastq-dump_to_featureCounts.sh on GitHub](https://github.com/rnnh/bioinfo-notebook/blob/master/scripts/fastq-dump_to_featureCounts.sh)
109 |
110 | ==> featureCounts.md <==
111 | 1. The `subread` user guide:
112 | 2. The `featureCounts` paper:
113 |
114 | ==> file_formats.md <==
115 | - [GTF2.2: A Gene Annotation Format (Revised Ensembl GTF)](http://mblab.wustl.edu/GTF22.html)
116 | - [GFF3 Specification](https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md)
117 |
118 | ==> htseq-count.md <==
119 |
120 | 1. The `htseq-count` manual:
121 |
122 | ==> linux_setup.md <==
123 | - [Using Ubuntu through a Virtual Machine](ubuntu_virtualbox.md)
124 | - [Windows Subsystem for Linux](wsl.md)
125 |
126 | ==> part1.md <==
127 |
128 | These are general guides for installing Ubuntu, using the command line, and the types of files used in bioinformatics.
129 |
130 | ==> part2.md <==
131 |
132 | These are guides to individual programs.
133 |
134 | ==> part3.md <==
135 |
136 | These are scripts that use the programs discussed in this project.
137 |
138 | ==> samtools.md <==
139 | - [Alignment formats](file_formats.md#alignment-formats)
140 | - The `samtools` manual:
141 |
142 | ==> to_do.md <==
143 | - Add page on `trimmomatic`
144 | - Entry on BED/bigWig
145 |
146 | ==> ubuntu_virtualbox.md <==
147 | - [What is a Virtual Machine?](https://azure.microsoft.com/en-us/overview/what-is-a-virtual-machine/)
148 | - [How to Install Ubuntu on VirtualBox](https://www.wikihow.com/Install-Ubuntu-on-VirtualBox)
149 |
150 | ==> wsl.md <==
151 | - [Using Ubuntu through a Virtual Machine](ubuntu_virtualbox.md)
152 | - [conda](conda.md)
153 | ```
154 |
155 | **6.** From the `bioinfo-notebook/docs/` directory, list the files in the `bioinfo-notebook/envs/` directory.
156 |
157 | ```bash
158 | ronan@dell:~/bioinfo-notebook/docs$ pwd
159 | /home/ronan/bioinfo-notebook/docs
160 | ronan@dell:~/bioinfo-notebook/docs$ ls ../envs/
161 | bioinfo-notebook.txt bioinfo-notebook.yml
162 | ```
163 |
--------------------------------------------------------------------------------
/docs/combining_featCount_tables.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | title: Combining featCount tables.py
4 | parent: 3. Scripts
5 | ---
6 |
7 |
8 | # Combining featCount tables.py
9 |
10 | This is a Python script that creates a single CSV feature count table from the featureCounts output tables in the target directory.
11 | This combined feature count table can be used for differential expression analysis (e.g. using DESeq2 or edgeR in R).
12 |
13 | ## Demonstration
14 |
15 | This is a video demonstration of [combining_featCount_tables.py](../scripts/combining_featCount_tables.py).
16 |
17 | [](https://asciinema.org/a/311771?autoplay=1)
18 |
19 | In this video, `combining_featCount_tables.py` is used to combine the following [featureCounts](featureCounts.md) tables:
20 |
21 | ```
22 | feature_counts_SRR8933506_S_cere_GCF_000146045.2_R64_genomic.fna.tsv
23 | feature_counts_SRR8933509_S_cere_GCF_000146045.2_R64_genomic.fna.tsv
24 | feature_counts_SRR8933510_S_cere_GCF_000146045.2_R64_genomic.fna.tsv
25 | feature_counts_SRR8933511_S_cere_GCF_000146045.2_R64_genomic.fna.tsv
26 | feature_counts_SRR8933512_S_cere_GCF_000146045.2_R64_genomic.fna.tsv
27 | feature_counts_SRR8933530_S_cere_GCF_000146045.2_R64_genomic.fna.tsv
28 | feature_counts_SRR8933531_S_cere_GCF_000146045.2_R64_genomic.fna.tsv
29 | feature_counts_SRR8933532_S_cere_GCF_000146045.2_R64_genomic.fna.tsv
30 | feature_counts_SRR8933533_S_cere_GCF_000146045.2_R64_genomic.fna.tsv
31 | feature_counts_SRR8933534_S_cere_GCF_000146045.2_R64_genomic.fna.tsv
32 | feature_counts_SRR8933535_S_cere_GCF_000146045.2_R64_genomic.fna.tsv
33 | feature_counts_SRR8933536_S_cere_GCF_000146045.2_R64_genomic.fna.tsv
34 | feature_counts_SRR8933537_S_cere_GCF_000146045.2_R64_genomic.fna.tsv
35 | feature_counts_SRR8933538_S_cere_GCF_000146045.2_R64_genomic.fna.tsv
36 | feature_counts_SRR8933539_S_cere_GCF_000146045.2_R64_genomic.fna.tsv
37 | ```
38 |
39 | These featureCounts results were generated using the following [fastq-dump_to_featureCounts.sh](fastq-dump_to_featureCounts.md) command:
40 |
41 | ```bash
42 | $ bash ../scripts/fastq-dump_to_featureCounts.sh -a S_cere_GCF_000146045.2_R64_genomic.gtf -f S_cere_GCF_000146045.2_R64_genomic.fna --verbose -p 3 SRR8933506 SRR8933509 SRR8933510 SRR8933511 SRR8933512 SRR8933530 SRR8933531 SRR8933532 SRR8933533 SRR8933534 SRR8933535 SRR8933536 SRR8933537 SRR8933538 SRR8933539
43 | ```
44 |
45 | In this command, the full genome sequence (`S_cere_GCF_000146045.2_R64_genomic.fna`) and genome annotation (`S_cere_GCF_000146045.2_R64_genomic.gtf`) for [*Saccharomyces cerevisiae* S288C](https://www.ncbi.nlm.nih.gov/assembly/GCF_000146045.2) are used.
46 |
47 | These featureCounts results were then combined using the following command:
48 |
49 | ```bash
50 | $ python ../scripts/combining_featCount_tables.py
51 | ```
52 |
53 | Running this script combines all the featureCounts results in a directory into a single CSV file.
54 | If a custom name for this file is not given, it will be given a name using this scheme: `featCounts_{species}_{date}.csv`.
55 |
56 | ## Usage
57 |
58 | ```
59 | usage: combining_featCount_tables.py [-h] [-d PATH] [-o CUSTOM_FILENAME]
60 |
61 | Combines the featureCounts output tables in the target directory.
62 |
63 | optional arguments:
64 | -h, --help show this help message and exit
65 | -d PATH, --directory PATH
66 | path to target directory. Default: current directory
67 | -o CUSTOM_FILENAME, --output CUSTOM_FILENAME
68 | output filename. Default:
69 | featCounts_{species}_{date}.csv
70 | ```
71 |
72 | ## See also
73 | - [combining_featCount_tables.py on GitHub](https://github.com/rnnh/bioinfo-notebook/blob/master/scripts/combining_featCount_tables.py)
74 | - [fastq-dump_to_featureCounts.sh](fastq-dump_to_featureCounts.md)
75 |
--------------------------------------------------------------------------------
/docs/conda.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | title: Conda
4 | parent: 2. Program guides
5 | ---
6 |
7 | # Conda
8 |
9 | From the website, `conda` provides ["Package, dependency and environment management for any language"](https://docs.conda.io/en/latest/).
10 |
11 | Conda is a package manager that allows specific versions of programs to be installed, alongside their dependencies.
12 | Different sets of programs can be installed to different [virtual environments](https://www.anaconda.com/moving-conda-environments/).
13 | A virtual environment is basically a set of programs.
14 |
15 | ## Installing `conda`
16 |
17 | Conda is part of [Anaconda](https://www.anaconda.com/distribution/), which is available for free.
18 | Conda is also available through [Miniconda](https://docs.conda.io/en/latest/miniconda.html), a free minimal installer for conda.
19 |
20 | Conda can be installed on a 64-bit Linux system with the following commands...
21 |
22 | ```bash
23 | # Downloading miniconda
24 | $ wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
25 | # Installing miniconda
26 | $ bash miniconda.sh -b -p $HOME/miniconda
27 | # Updating conda
28 | $ conda update -q conda
29 | ```
30 |
31 | ## Cloning and activating a `conda` environment
32 |
33 | Conda virtual environments can be shared, either as a `.yml` file or a `.txt` file.
34 | A `.yml` copy of a conda environment can be used to recreate that environment on another machine, regardless of the operating system platform used.
35 | A `.txt` copy of a conda environment is more explicit: it can be used to create an identical copy of a conda environment using the same operating system platform as the original machine.
36 | A conda virtual environment is used throughout this project: a [`.yml` copy](../envs/bioinfo-notebook.yml) and an [explicit `.txt` copy](../envs/bioinfo-notebook.txt) of this conda environment are provided.
37 |
38 | A conda environment can be activated using `$ conda activate name_of_environment`.
39 | Once activated, the programs installed in this environment are available.
40 | Conda can be deactivated using `$ conda deactivate`.
41 |
42 | The `conda` environment used throughout this project can be created from [bioinfo-notebook.txt](../envs/bioinfo-notebook.txt) and activated using the following commands...
43 |
44 | ```bash
45 | # Creating the bioinfo-notebook environment
46 | /bioinfo-notebook $ conda create --name bioinfo-notebook --file envs/bioinfo-notebook.txt
47 | # Activating the bioinfo-notebook environment
48 | $ conda activate bioinfo-notebook
49 | # Once activated, the environment name is at the start of the bash prompt
50 | (bioinfo-notebook) $
51 | ```
52 |
53 | ## Demonstration
54 |
55 | In this video demonstration, a conda virtual environment is created using [bioinfo-notebook.txt](../envs/bioinfo-notebook.txt).
56 | This virtual environment is then activated using `conda activate bioinfo-notebook`.
57 | Note that the name of the active conda environment is displayed in brackets at the start of the bash prompt: `(name of active environment) ... $`.
58 |
59 | [](https://asciinema.org/a/305992?autoplay=1)
60 |
61 | ## Further reading
62 | 1. Downloading conda:
63 | 2. Conda packages:
64 | 3. Conda environments:
65 |
--------------------------------------------------------------------------------
/docs/fasterq-dump.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | title: Fasterq-dump
4 | parent: 2. Program guides
5 | ---
6 |
7 | # Fasterq-dump
8 |
9 | `fasterq-dump` is a tool for downloading sequencing reads from [NCBI's Sequence Read Archive (SRA)](https://www.ncbi.nlm.nih.gov/sra).
10 | These sequence reads will be downloaded as [FASTQ files](file_formats.md#fastq).
11 | `fasterq-dump` is a newer, streamlined alternative to [fastq-dump](fastq-dump.md); both of these programs are a part of [sra-tools](https://anaconda.org/bioconda/sra-tools).
12 |
13 | ## `fasterq-dump` vs `fastq-dump`
14 |
15 | Here are a few of the differences between `fastq-dump` and `fasterq-dump`:
16 |
17 | 1. In `fastq-dump`, the flag `--split-3` is required to separate paired reads into left and right ends. This is the default setting in `fasterq-dump`.
18 | 2. The `fastq-dump` flag `--skip-technical` is no longer required to skip technical reads in `fasterq-dump`. Instead, the flag `--include-technical` is required to include technical reads when using `fasterq-dump`.
19 | 3. There is no `--gzip` or `--bzip2` flag in `fasterq-dump` to download compressed reads with `fasterq-dump`. However, FASTQ files downloaded using `fasterq-dump` can still be subsequently compressed.
20 |
21 | The following commands are equivalent, but will be executed faster using `fasterq-dump`:
22 |
23 | ```
24 | $ fastq-dump SRR_ID --split-3 --skip-technical
25 | $ fasterq-dump SRR_ID
26 | ```
27 |
28 | ## Downloading reads from the SRA using `fasterq-dump`
29 |
30 | In this example, we want to download FASTQ reads for a mate-pair library.
31 |
32 | ```
33 | fasterq-dump --threads n --progress SRR_ID
34 | ```
35 |
36 | In this command...
37 |
38 | 1. **`--threads`** specifies the number (*`n`*) of processors/threads to be used.
39 | 2. **`--progress`** is an optional argument that displays a progress bar when the reads are being downloaded.
40 | 3. **`SRR_ID`** is the ID of the run from the SRA to be downloaded. This ID begins with "SRR" and is followed by around seven digits (e.g. `SRR1234567`).
41 |
42 | ## Demonstration
43 |
44 | In this video, `fasterq-dump` is used to download [*Saccharomyces cerevisiae* RNAseq reads](https://www.ncbi.nlm.nih.gov/sra/SRR11462797) from the SRA.
45 |
46 | [](https://asciinema.org/a/316273?autoplay=1)
47 |
48 | ## See also
49 |
50 | - [fastq-dump](fastq-dump.md)
51 |
52 | ## References
53 |
54 | 1. [How to use fasterq-dump from the sra-tools wiki on GitHub](https://github.com/ncbi/sra-tools/wiki/HowTo:-fasterq-dump)
55 |
--------------------------------------------------------------------------------
/docs/fastq-dump.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | title: Fastq-dump
4 | parent: 2. Program guides
5 | ---
6 |
7 | # Fastq-dump
8 |
9 | `fastq-dump` is a tool for downloading sequencing reads from [NCBI's Sequence Read Archive (SRA)](https://www.ncbi.nlm.nih.gov/sra).
10 | These sequence reads will be downloaded as FASTQ files.
11 | How these FASTQ files are formatted depends on the `fastq-dump` options used.
12 |
13 | ## Downloading reads from the SRA using `fastq-dump`
14 |
15 | In this example, we want to download FASTQ reads for a mate-pair library.
16 |
17 | ```
18 | $ fastq-dump --gzip --skip-technical --readids --read-filter pass --dumpbase --split-3 --clip --outdir path/to/reads/ SRR_ID
19 | ```
20 |
21 | In this command...
22 |
23 | 1. **`--gzip`**: Compress output using gzip. Gzip archived reads can be read directly by [bowtie2](bowtie2.md).
24 | 2. **`--skip-technical`**: Dump only biological reads, skip the technical reads.
25 | 3. **`--readids`** or **`-I`**: Append read ID after spot ID as 'accession.spot.readid'. With this flag, one sequence gets appended the ID `.1` and the other `.2`. Without this option, pair-ended reads will have identical IDs.
26 | 4. **`--read-filter pass`**: Only returns reads that pass filtering (without `N`s).
27 | 5. **`--dumpbase`** or **`-B`**: Formats sequence using base space (default for other than SOLiD). Included to avoid colourspace (in which pairs of bases are represented by numbers).
28 | 6. **`--split-3`** separates the reads into left and right ends. If there is a left end without a matching right end, or a right end without a matching left end, they will be put in a single file.
29 | 7. **`--clip`** or **`-W`**: Some of the sequences in the SRA contain tags that need to be removed. This will remove those sequences.
30 | 8. **`--outdir`** or **`-O`**: *(Optional)* Output directory, default is current working directory.
31 | 9. **`SRR_ID`**: This is the ID of the run from SRA to be downloaded. This ID begins with "SRR" and is followed by around seven digits (e.g. `SRR1234567`).
32 |
33 | Other options that can be used instead of `--split-3`:
34 |
35 | 1. **`--split-files`** splits the FASTQ reads into two files: one file for mate 1s (`..._1`), and another for mate 2s (`..._2`). This option will not put mateless pairs into a third file.
36 | 2. **`--split-spot`** splits the FASTQ reads into two (mate 1s and mate 2s) within one file. `--split-spot` gives you an 8-line fastq format where forward precedes reverse (see ).
37 |
38 | ## Demonstration
39 |
40 | In this demo, `fastq-dump` is used to download compressed FASTQ reads.
41 |
42 | [](https://asciinema.org/a/306937?autoplay=1)
43 |
44 | ## Further reading
45 |
46 | 1. Rob Edward's notes on `fastq-dump`:
47 |
--------------------------------------------------------------------------------
/docs/fastq-dump_to_featureCounts.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | title: Fastq-dump to featureCounts
4 | parent: 3. Scripts
5 | ---
6 |
7 | # Fastq-dump to featureCounts.sh
8 |
9 | [fastq-dump_to_featureCounts.sh](../scripts/fastq-dump_to_featureCounts.sh) is a `bash` script that...
10 |
11 | 1. Downloads FASTQ reads from NCBI's SRA using [fastq-dump](fastq-dump.md)
12 | 2. Indexes a reference genome and aligns reads to that index using [bowtie2](bowtie2.md)
13 | 3. Converts the alignment file created by bowtie2 to BAM format and sorts it using [samtools](samtools.md)
14 | 4. Assigns the read alignments to genes in a genome annotation file using [featureCounts](featureCounts.md)
15 |
16 | ## Demonstration
17 |
18 | This is a video demonstration of [fastq-dump_to_featureCounts.sh](../scripts/fastq-dump_to_featureCounts.sh).
19 |
20 | During this demonstration, the full genome sequence and genome annotation for [*Saccharomyces cerevisiae* S288C](https://www.ncbi.nlm.nih.gov/assembly/GCF_000146045.2) are used. The files [example_nucleotide_sequence.fasta](../data/example_nucleotide_sequence.fasta) and [example_genome_annotation.gtf](../data/example_genome_annotation.gtf) are fragments of the nucleotide sequence and annotation for this genome. [RNA-Seq reads for *Saccharomyces cerevisiae* (SRR8933512)](https://www.ncbi.nlm.nih.gov/sra/SRR8933512) are used as the example FASTQ files in this demonstration.
21 |
22 | [](https://asciinema.org/a/308745?autoplay=1)
23 |
24 | ## Usage
25 |
26 | ```
27 | fastq-dump_to_featureCounts.sh [options] -a|--annotation -f|--fasta
28 |
29 | This script downloads FASTQ reads from NCBI's SRA, aligns them to an annotated
30 | genome using bowtie2, and generates gene count table(s) using featureCounts.
31 | It can take a single SRR ID as an input, or multiple SRR IDs separated by
32 | spaces.
33 |
34 | Required arguments:
35 | -a | --annotation input genome annotation file
36 | -f | --fasta input FASTA file for annotated genome
37 | SRR ID(s) Sequence Read Archive Run ID(s) (SRR...)
38 |
39 | Optional arguments:
40 | -h | --help show this help text and exit
41 | -p | --processors number (n) of processors to use (default: 1)
42 | --fastq-dump use 'fastq-dump' instead of the 'fasterq-dump'
43 | --verbose make output of script more verbose
44 | --removetemp remove read and alignment files once they are
45 | no longer needed (minimises disk space needed)
46 | --log redirect terminal output to log file
47 | ```
48 |
49 | ## See also
50 |
51 | 1. [fastq-dump_to_featureCounts.sh on GitHub](https://github.com/rnnh/bioinfo-notebook/blob/master/scripts/fastq-dump_to_featureCounts.sh)
52 |
--------------------------------------------------------------------------------
/docs/featureCounts.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | title: FeatureCounts
4 | parent: 2. Program guides
5 | ---
6 |
7 |
8 | # FeatureCounts
9 |
10 | `featureCounts` is a program that counts how many reads map to genomic features, such as genes, exons, promoters and genomic bins.
11 |
12 | ## Counting how many reads align to each gene in a genome annotation using `featureCounts`
13 |
14 | `featureCounts` can be used to count how many reads align to genes as follows:
15 |
16 | ```
17 | $ featureCounts -p -O -T n -a example_genome_annotation.gtf -o example_featureCounts_output.txt sorted_example_alignment.bam
18 | ```
19 |
20 | In this command...
21 |
22 | 1. **`-p`** specifies that fragments (or templates) will be counted instead of reads. This is only applicable for paired-end reads.
23 | 2. **`-O`** assigns reads to all their overlapping meta-features.
24 | 3. **`-T`** specifies the number (*`n`*) of threads to be used.
25 | 4. **`-a`** is the genome annotation file (`example_genome_annotation.gtf`).
26 | 5. **`-o`** specifies the name of the output file, which includes the read counts (`example_featureCounts_output.txt`).
27 | 6. **`sorted_example_alignment.bam`** is an alignment file: in this file, the reads we want to count are aligned to the same genome as the annotation file.
28 |
29 | ### Demonstration
30 |
31 | In this video, `featureCounts` is used to assign reads in an alignment file (`sorted_example_alignment.bam`) to genes in a genome annotation file (`example_genome_annotation.gtf`).
32 |
33 | [](https://asciinema.org/a/306584?autoplay=1)
34 |
35 | ## More important options for `featureCounts`
36 |
37 | 1. **`-s`** specifies strand-specific read counting. `0` for unstranded reads, `1` for stranded reads and `2` for reversely stranded reads. This depends on the library used in the sequencing protocol.
38 |
39 | ## Further reading
40 |
41 | 1. The `subread` user guide:
42 | 2. The `featureCounts` paper:
43 |
--------------------------------------------------------------------------------
/docs/genome_annotation_SwissProt_CDS.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | title: Genome annotation script
4 | parent: 3. Scripts
5 | ---
6 |
7 | # Genome annotation SwissProt CDS.sh
8 |
9 | [genome annotation SwissProt CDS.sh](../scripts/genome_annotation_SwissProt_CDS.sh) is a bash script that annotates the coding sequences (CDS) in a given genome assembly.
10 | It uses [BLAST](blast.md) and [MGKit](https://github.com/frubino/mgkit), which are included in the `bioinfo-notebook` [conda environment](conda.md).
11 |
12 | ## Usage
13 |
14 | ```
15 | genome_annotation_SwissProt_CDS.sh [-h|--help] [-d|--demo] [-i|--input]
16 | [-l|--log -p|--processors n -e|--email]
17 |
18 | A script to annotate proteins in a genome assembly, using BLASTx with
19 | UniProtKB/Swiss-Prot.
20 |
21 | When run with the argument '-d' or '--demo' this script...
22 |
23 | 1. Downloads a Saccharomyces cerevisiae S288C genome assembly, and
24 | the UniProtKB/Swiss-Prot amino acid sequences.
25 | 2. Creates a BLAST database from the downloaded Swiss-Prot sequences,
26 | and searches the S. cerevisiae genome against it using BLASTx with an
27 | E-value threshold of 1e-100.
28 | 3. Filters the BLASTx results, removing results with less than 90%
29 | identity.
30 | 4. Creates a genome annotation GFF file from these BLASTx results.
31 | 5. Adds information to the genome annotation from UniProt (protein
32 | names, KeGG ortholog information, EC numbers, etc.)
33 |
34 | The end result ('S_cere.gff') is an annotation of the coding sequences (CDS)
35 | in the S. cerevisiae genome that are described in UniProtKB/Swiss-Prot.
36 |
37 | This script can also be run with the argument '-i' or '--input', which is used
38 | to specify a FASTA nucleotide file (.fasta or .fna) to annotate, instead of
39 | the demo sequence. The end result is also an annotation of the CDS in the input
40 | sequence based on UniProtKB/Swiss-Prot, called '.gff'.
41 |
42 | This script should be called from the 'bioinfo-notebook/' directory. The
43 | programs required for this script are in the 'bioinfo-notebook' conda
44 | environment (bioinfo-notebook/envs/bioinfo-notebook.yml or
45 | bioinfo-notebook/envs/bioinfo-notebook.txt).
46 | If the input file is not in the 'bioinfo-notebook/data/' directory, the full
47 | file path should be given.
48 |
49 | arguments:
50 | -h | --help show this help text and exit
51 | -i | --input name of input FASTA nucleotide file to annotate
52 | -d | --demo run the script with demonstration inputs
53 |
54 | optional arguments:
55 | -l | --log redirect terminal output to a log file
56 | -p | --processors set the number (n) of processors to use
57 | (default: 1)
58 | -e | --email contact email for UniProt queries
59 | ```
60 |
61 | ## See also
62 |
63 | - [genome_annotation_SwissProt_CDS.sh on GitHub](https://github.com/rnnh/bioinfo-notebook/blob/master/scripts/genome_annotation_SwissProt_CDS.sh)
64 | - [BLAST](blast.md)
65 | - [MGKit](https://github.com/frubino/mgkit)
66 | - [Conda](conda.md)
67 |
--------------------------------------------------------------------------------
/docs/htseq-count.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | title: Htseq-count
4 | parent: 2. Program guides
5 | ---
6 |
7 |
8 | # Htseq-count
9 |
10 | Given a file with aligned sequencing reads and a list of genomic features, `htseq-count` can be used to count how many reads map to each feature.
11 |
12 | ## Counting reads aligned to a genome annotation using `htseq-count`
13 |
14 | `htseq-count` can be used to count reads aligned to a genome annotation as follows:
15 |
16 | ```
17 | $ htseq-count --format bam sorted_alignment_file.bam genome_annotation > output_file.txt
18 | ```
19 |
20 | In this command...
21 |
22 | 1. **`--format`** or **`-f`** is the format of the input data. Possible values are `sam` (for text SAM files) and `bam` (for binary BAM files). Default is `sam`. A `bam` file is used in this example.
23 | 2. **`--order`** specifies whether the alignments have been sorted by name (`name`) or coordinates/position (`pos`).
24 | 3. **`sorted_alignment_file.bam`** is a `bam` format alignment file, sorted by name.
25 | 4. **`genome_annotation`** is the genome annotation file the reads in the `alignment_file` are aligned to (`.gtf` or `.gff`).
26 | 5. **`> output_file.txt`** redirects the output (`STDOUT`) to `output_file.txt`.
27 |
28 | ### Demonstration
29 |
30 | In this video, `htseq-count` is used to count how many reads in an alignment file (`sorted_example_alignment.bam`) match the genes in a genome annotation (`example_genome_annotation.gtf`).
31 |
32 | [](https://asciinema.org/a/306597?autoplay=1)
33 |
34 | ## The `htseq-count` output file
35 |
36 | The program outputs a table with counts for each feature, followed by the special counters, which count reads that were not counted for any feature for various reasons.
37 | The names of the special counters all start with a double underscore, to facilitate filtering (**Note:** The double underscore was absent up to version 0.5.4).
38 | The special counters are:
39 |
40 | 1. **`__no_feature`**: reads (or read pairs) which could not be assigned to any feature (set S as described above was empty).
41 | 2. **`__ambiguous`**: reads (or read pairs) which could have been assigned to more than one feature and hence were not counted for any of these, unless the --nonunique all option was used (set S had more than one element).
42 | 3. **`__too_low_aQual`**: reads (or read pairs) which were skipped due to the optional minimal alignment quality flag.
43 | 4. **`__not_aligned`**: reads (or read pairs) in the SAM/BAM file without an alignment.
44 | 5. **`__alignment_not_unique`**: reads (or read pairs) with more than one reported alignment.
45 |
46 | ## Further reading
47 |
48 | 1. The `htseq-count` manual:
49 |
--------------------------------------------------------------------------------
/docs/linux_setup.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | title: Linux setup script
4 | parent: 3. Scripts
5 | ---
6 |
7 | # Linux setup script
8 |
9 | [linux_setup.sh](../scripts/linux_setup.sh) is a `bash` shell script that...
10 |
11 | 1. Downloads and installs [Miniconda3](conda.md)
12 | 2. Installs the `bioinfo-notebook` [virtual environment using conda](conda.md#cloning-and-activating-a-conda-environment)
13 |
14 | This will use around 2.3 GB of hard disk space in total.
15 |
16 | If you are using a Linux system that does not have Anaconda/Miniconda installed, this script will set up everything you need to follow the guides on this website.
17 | If you are using a freshly installed [Ubuntu virtual machine](ubuntu_virtualbox.md) or [Ubuntu through Windows Subsystem for Linux](wsl.md), this script is the ideal way to set up your new system.
18 |
19 | ## Demonstration
20 |
21 | This is a video demonstration of [linux_setup.sh](../scripts/linux_setup.sh).
22 |
23 | In this demonstration, the [bioinfo-notebook GitHub repository](https://github.com/rnnh/bioinfo-notebook) (or "repo") is cloned into the home directory of the Linux system (Ubuntu).
24 | This means that all the files for this project will be downloaded from GitHub into the `~/bioinfo-notebook/` directory.
25 | A GitHub repo can be cloned using the command `$ git clone` followed by the URL of the target repo (which can be found on GitHub using the "Clone or download" button).
26 | The Linux setup script is then run from this cloned GitHub repo.
27 |
28 | [](https://asciinema.org/a/314853?autoplay=1)
29 |
30 | ## Usage
31 |
32 | ```
33 | This script downloads and installs Miniconda3, and uses conda to install
34 | the 'bioinfo-notebook' virtual environment.
35 |
36 | Before running this script...
37 |
38 | 1. Please run the following command:
39 | $ sudo apt-get update
40 | This will ensure that the software installed will be up-to-date.
41 |
42 | 2. Please ensure that the 'bioinfo-notebook/' directory is in your
43 | home directory (~). The path to this directory should look like this:
44 | $HOME/bioinfo-notebook
45 |
46 | The 'bash' command is used to run this script:
47 | $ bash $0
48 |
49 | Optional arguments:
50 | -h | --help show this help text and exit
51 | ```
52 |
53 | ## See also
54 |
55 | - [linux_setup.sh on GitHub](https://github.com/rnnh/bioinfo-notebook/blob/master/scripts/linux_setup.sh)
56 | - [Conda](conda.md)
57 | - [Cloning and activating a conda environment](conda.md#cloning-and-activating-a-conda-environment)
58 | - [Using Ubuntu through a Virtual Machine](ubuntu_virtualbox.md)
59 | - [Windows Subsystem for Linux](wsl.md)
60 |
--------------------------------------------------------------------------------
/docs/orthofinder.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | title: OrthoFinder
4 | parent: 2. Program guides
5 | ---
6 |
7 | # OrthoFinder
8 |
9 | OrthoFinder is [a program for phylogenetic orthology inference](https://davidemms.github.io/).
10 | It can be installed using the [orthofinder.yml](../envs/orthofinder.yml) virtual environment using [conda](conda.md).
11 |
12 | ## Running `OrthoFinder` to find orthologs between sets of FASTA amino acid sequences
13 |
14 | `OrthoFinder` can be used to find orthologs between sets of FASTA amino acid files as follows:
15 |
16 | ```bash
17 | $ orthofinder -t n -S diamond -f path/to/fasta/files/
18 | ```
19 |
20 | In this command...
21 |
22 | 1. **`-t`** sets the number of threads/processors to use (*n*).
23 | 2. **`-S`** is used to select the search tool OrthoFinder uses. Setting it to [`diamond` is far faster than the default BLAST method](https://github.com/davidemms/OrthoFinder/releases/tag/v2.2.7).
24 | 3. **`-f`** is used to select the directory of [FASTA amino acid sequences](file_formats.md#fasta) files you want to compare.
25 |
26 | OrthoFinder will create a `Results` directory (ending with the current month and day, e.g. `Results_Sep16/`) in the target directory specified with **`-f`**.
27 | This directory will contain summary statistics of orthologs found between the FASTA files, as well as putative gene duplication events, and phylogenetic trees of the detected orthogroups.
28 |
29 | ## See also
30 |
31 | - [conda](conda.md)
32 | - [File formats used in bioinformatics](file_formats.md)
33 |
34 | ## Further reading
35 |
36 | - [OrthoFinder tutorials](https://davidemms.github.io/menu/tutorials.html)
37 |
--------------------------------------------------------------------------------
/docs/part1.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | title: 1. General guides
4 | nav_order: 2
5 | description: "These are general guides for installing Ubuntu, using the command line, and the types of files used in bioinformatics."
6 | has_children: true
7 | has_toc: True
8 | ---
9 |
10 | # 1. General guides
11 |
12 | These are general guides for installing Ubuntu, using the command line, and the types of files used in bioinformatics.
13 |
--------------------------------------------------------------------------------
/docs/part2.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | title: 2. Program guides
4 | nav_order: 3
5 | description: "These are guides to individual programs."
6 | has_children: true
7 | has_toc: True
8 | ---
9 |
10 | # 2. Program guides
11 |
12 | These are brief guides to individual programs.
13 | They are not comprehensive, but instead aim to introduce the essential features of each program.
14 |
--------------------------------------------------------------------------------
/docs/part3.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | title: 3. Scripts
4 | nav_order: 4
5 | description: "These are scripts that use the programs and file formats discussed in this project."
6 | has_children: true
7 | has_toc: True
8 | ---
9 |
10 | # 3. Scripts
11 |
12 | These are scripts that use the programs and file formats discussed in this project.
13 |
--------------------------------------------------------------------------------
/docs/report_an_issue.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | title: Report an Issue
4 | nav_order: 4
5 | description: "Report an Issue"
6 | ---
7 |
8 | # Report an Issue
9 |
10 | [If there are any errors or mistakes, please let me know.](https://github.com/rnnh/bioinfo-notebook/issues)
11 |
--------------------------------------------------------------------------------
/docs/samtools.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | title: SAMtools
4 | parent: 2. Program guides
5 | ---
6 |
7 | # SAMtools
8 |
9 | SAMtools is a set of utilities that can manipulate alignment formats.
10 | It imports from and exports to the [SAM](file_formats.md#sam), [BAM](file_formats.md#bam) & [CRAM](file_formats.md#cram) formats; does sorting, merging & indexing; and allows reads in any region to be retrieved swiftly.
11 |
12 | ## Converting a `sam` alignment file to a sorted, indexed `bam` file using `samtools`
13 |
14 | Sequence Alignment Map (SAM/`.sam`) is a text-based file format for sequence alignments.
15 | Its binary equivalent is Binary Alignment Map (BAM/`.bam`), which stores the same data as a compressed binary file.
16 | A binary file for a sequence alignment is preferable over a text file, as binary files are faster to work with.
17 | A SAM alignment file (`example_alignment.sam`) can be converted to a BAM alignment using `samtools view`.
18 |
19 | ```
20 | $ samtools view -@ n -Sb -o example_alignment.bam example_alignment.sam
21 | ```
22 |
23 | In this command...
24 |
25 | 1. **`-@`** sets the number (*`n`*) of threads/CPUs to be used. This flag is optional and can be used with other `samtools` commands.
26 | 2. **`-Sb`** specifies that the input is in SAM format (`S`) and the output will be in BAM format (`b`).
27 | 3. **`-o`** sets the name of the output file (`example_alignment.bam`).
28 | 4. **`example_alignment.sam`** is the name of the input file.
29 |
30 | Now that the example alignment is in BAM format, we can sort it using `samtools sort`.
31 | Sorting this alignment will allow us to create an index.
32 |
33 | ```
34 | $ samtools sort -O bam -o sorted_example_alignment.bam example_alignment.bam
35 | ```
36 |
37 | In this command...
38 |
39 | 1. **`-O`** specifies the output format (`bam`, `sam`, or `cram`).
40 | 2. **`-o`** sets the name of the output file (`sorted_example_alignment.bam`).
41 | 3. **`example_alignment.bam`** is the name of the input file.
42 |
43 | This sorted BAM alignment file can now be indexed using `samtools index`.
44 | Indexing allows fast random access to this alignment, allowing the information in the alignment file to be processed faster.
45 |
46 | ```
47 | $ samtools index sorted_example_alignment.bam
48 | ```
49 |
50 | In this command...
51 |
52 | 1. **`sorted_example_alignment.bam`** is the name of the input file.
53 |
54 | ### Demonstration 1
55 |
56 | In this video, `samtools` is used to convert `example_alignment.sam` into a BAM file, sort that BAM file, and index it.
57 |
58 | [](https://asciinema.org/a/U1Flwg3EljOfI1Sx77h8PvuNf?autoplay=1)
59 |
60 | ## Simulating short reads using `wgsim`
61 |
62 | `wgsim` is a SAMtools program that can simulate short sequencing reads from a reference genome.
63 | This is useful for creating FASTQ files to practice with.
64 |
65 | ```
66 | $ wgsim example_nucleotide_sequence.fasta example_reads_1.fastq example_reads_2.fastq
67 | ```
68 |
69 | In this command...
70 |
71 | 1. **`example_nucleotide_sequence.fasta`** is the reference genome input.
72 | 2. **`example_reads_1.fastq`** and **`example_reads_2.fastq`** are the names of the simulated read output files.
73 |
74 | ### Demonstration 2
75 |
76 | In this video, `wgsim` is used to simulate reads from `example_nucleotide_sequence.fasta`.
77 |
78 | [](https://asciinema.org/a/m89gXtx4cKRnKpI6amWj3BEAH?autoplay=1)
79 |
80 | ## Indexing a FASTA file using `samtools faidx`
81 |
82 | SAMtools can be used to index a FASTA file as follows...
83 |
84 | ```bash
85 | $ samtools faidx file.fasta
86 | ```
87 |
88 | After running this command, `file.fasta` can now be used by [bcftools](bcftools.md).
89 |
90 | ## See also
91 |
92 | - [Alignment formats](file_formats.md#alignment-formats)
93 | - The `samtools` manual:
94 |
--------------------------------------------------------------------------------
/docs/sgRNAcas9.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | title: sgRNAcas9
4 | parent: 2. Program guides
5 | ---
6 |
7 | # sgRNAcas9
8 |
9 | sgRNAcas9 is [a package for designing CRISPR sgRNA and evaluating potential off-target cleavage sites](https://doi.org/10.1371/journal.pone.0100448).
10 |
11 | ## Running sgRNAcas9
12 |
13 | 1. Install the [conda](conda.md) virtual environment for [sgRNAcas9](../envs/sgRNAcas9.yml).
14 | 2. Download [the GUI version of sgRNAcas9 from SourceForge](https://sourceforge.net/projects/sgrnacas9/).
15 | 3. Activate the sgRNAcas9 virtual environment.
16 | 4. In the directory for sgRNAcas9, run the following command to launch the sgRNAcas9 graphical user interface (GUI):
17 |
18 | ```bash
19 | (sgRNAcas9) ~/sgRNAcas9_V3.0_GUI$ java -jar sgRNAcas9.jar
20 | ```
21 |
22 | ## Using sgRNAcas9
23 |
24 | In the sgRNAcas9 GUI...
25 |
26 | - Select the [FASTA nucleic acid](file_formats.md#fasta) file of the target sequences in the "Target sequences(FASTA):" dialog box.
27 | - Select the [FASTA nucleic acid](file_formats.md#fasta) file of the genome you want to design the guide RNAs for in the "Genome sequence(FASTA):" dialog box.
28 | - Click "RUN" to run the program
29 |
30 | sgRNAcas9 will create a `report` directory in the current working directory.
31 | This directory contains its results.
32 | The most important file in this directory is `sgRNAcas9_report.xls`.
33 | This Excel file contains reported guide RNA sequences for CRISPR with quality scores, and counts of potential off-target sites.
34 |
35 | ## References
36 |
37 | - [sgRNAcas9 paper](https://doi.org/10.1371/journal.pone.0100448)
38 | - [sgRNAcas9 website](http://biootools.com/software.html)
39 | - [sgRNAcas9 on SourceForge](https://sourceforge.net/projects/sgrnacas9/)
40 |
--------------------------------------------------------------------------------
/docs/snp_calling.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | title: SNP calling script
4 | parent: 3. Scripts
5 | ---
6 |
7 | # SNP calling script
8 |
9 | [snp_calling.sh](../scripts/snp_calling.sh) is a `bash` shell script that downloads [FASTQ](file_formats.md) sequencing reads using [fastq-dump](fastq-dump.md), aligns them to a genome using [bowtie2](bowtie2.md), and writes variants (SNPs and indels) to a variant call format (VCF) file.
10 |
11 | ## Usage
12 |
13 | ```
14 | snp_calling.sh [-h|--help] [-1|--one -2|--two -r|--reference]
15 | [-d|--demo] [-o|--output -l|--log -p|--processors n]
16 |
17 | This script aligns sequencing reads to a reference genome, and finds genetic
18 | variants (SNPs/indels) based on this alignment, which are written to a variant
19 | call format (VCF) file.
20 |
21 | Calling this script with the argument '-d' or '--demo' will run this script
22 | using Saccharomyces cerevisiae FASTQ sequencing reads and a Saccharomyces
23 | cerevisiae reference genome, which will be downloaded from NCBI.
24 |
25 | This script should be called from the 'bioinfo-notebook/' directory. The
26 | programs required for this script are in the 'bioinfo-notebook' conda
27 | environment (bioinfo-notebook/envs/bioinfo-notebook.yml or
28 | bioinfo-notebook/envs/bioinfo-notebook.txt).
29 | If the input files are not in the 'bioinfo-notebook/data/' directory, the full
30 | file paths should be given.
31 |
32 |
33 | arguments:
34 | -h | --help show this help text and exit
35 | -1 | --one forward reads to align with reference sequence
36 | (FASTQ: .fastq or .fastq.gz)
37 | -2 | --two reverse reads to align with reference sequence
38 | (FASTQ: .fastq or .fastq.gz)
39 | -r | --reference reference sequence to align reads against
40 | (FASTA nucleotide file: .fna)
41 | -d | --demo run the script with demonstration inputs
42 |
43 | optional arguments:
44 | -o | --output optional: name of final output file
45 | (default: 'reference_seq_vs_reads_var.vcf', or
46 | 'S_cere_DRR237290_var.vcf' if demo is used).
47 | -l | --log redirect terminal output to a log file in the
48 | directory bioinfo-notebook/results/
49 | -p | --processors optional: set the number (n) of processors to
50 | use (default: 1)
51 | ```
52 |
53 | ## See also
54 |
55 | - [snp_calling.sh on GitHub](https://github.com/rnnh/bioinfo-notebook/blob/master/scripts/snp_calling.sh)
56 | - [File formats used in bioinformatics](file_formats.md)
57 | - [samtools](samtools.md)
58 | - [fastq-dump](fastq-dump.md)
59 | - [bowtie2](bowtie2.md)
60 |
--------------------------------------------------------------------------------
/docs/ubuntu_virtualbox.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | title: Using Ubuntu through a Virtual Machine
4 | parent: 1. General guides
5 | nav_order: 3
6 | ---
7 |
8 |
9 | # Using Ubuntu through a Virtual Machine
10 |
11 | *Ubuntu* is a Linux operating system that is widely used for bioinformatics.
12 | If you have not used a Linux system before, an Ubuntu virtual machine is an ideal way to try the programs documented on this website.
13 |
14 | A *virtual machine* is a computer file, typically called an image, that behaves like an actual computer.
15 | It acts as a computer within a computer.
16 | Virtual machines run in a window, much like any other program running in a window on your computer.
17 | The virtual machine is sequestered from the rest of the system, meaning that the software inside a virtual machine cannot tamper with the computer itself.
18 | This produces an ideal environment for testing other operating systems, and running software or applications on operating systems they were not originally intended for.
19 |
20 | An Ubuntu virtual machine can be created using *VirtualBox*, and an Ubuntu *disk image*.
21 | VirtualBox is a program that can be used to create, manage, and access virtual machines.
22 | A disk image is a file that acts like a compact disc, or another storage device.
23 | VirtualBox and the Ubuntu disk image are freely available online.
24 |
25 | ## Contents
26 |
27 | - [Files required to set up an Ubuntu virtual machine](#files-required-to-set-up-an-ubuntu-virtual-machine)
28 | - [Direct links to download required files](#direct-links-to-download-required-files)
29 | - [How to create an Ubuntu virtual machine using VirtualBox](#how-to-create-an-ubuntu-virtual-machine-using-virtualbox)
30 | - [Increasing the screen resolution of the Ubuntu virtual machine](#increasing-the-screen-resolution-of-the-ubuntu-virtual-machine)
31 | - [See also](#see-also)
32 | - [References](#references)
33 |
34 | ## Files required to set up an Ubuntu virtual machine
35 |
36 | To set up an Ubuntu virtual machine, you will need an Ubuntu disk image, a file to install VirtualBox, and the VirtualBox Extension Package.
37 | This requires around 13 GB of free hard drive space on your computer in total.
38 | The Ubuntu disk image is around 2 GB in size, and may take a while to download depending on your internet connection.
39 | The file required to install VirtualBox is around 108 or 123 MB in size, depending on the platform of your computer (i.e. Windows or Mac).
40 |
41 | ### Direct links to download required files
42 |
43 | 1. [The Ubuntu disk image (filename: `ubuntu-18.04.4-desktop-amd64.iso`)](http://releases.ubuntu.com/18.04.4/ubuntu-18.04.4-desktop-amd64.iso)
44 | 2. [VirtualBox installer for Windows](https://download.virtualbox.org/virtualbox/6.1.4/VirtualBox-6.1.4-136177-Win.exe)
45 | 3. [VirtualBox installer for Mac](https://download.virtualbox.org/virtualbox/6.1.4/VirtualBox-6.1.4-136177-OSX.dmg)
46 | 4. [VirtualBox Extension Pack (all platforms)](https://download.virtualbox.org/virtualbox/6.1.4/Oracle_VM_VirtualBox_Extension_Pack-6.1.4.vbox-extpack)
47 |
48 | If the above links do not work, they may have expired.
49 | In this case, the above files can be found on the [VirtualBox website](https://www.virtualbox.org/wiki/Downloads) and the [Ubuntu website](https://ubuntu.com/download/desktop).
50 |
51 | ## How to create an Ubuntu virtual machine using VirtualBox
52 |
53 | 1. Download the [VirtualBox installer](#direct-links-to-download-required-files) for your computer (either Windows or Mac).
54 | 2. Once the VirtualBox installer is downloaded, open it and follow the on-screen instructions to install the VirtualBox program.
55 | 3. **Windows only:** If you get a "Windows Security" prompt asking *"Would you like to install this device software?"* for driver software from *"Publisher: Oracle Corporation"*, select "Install".
56 | 4. **Mac only:** If you get a *"This package will run a program to determine if the software can be installed"* prompt while installing VirtualBox, select "Continue". You may also be asked to enter your user password while installing VirtualBox on a Mac.
57 | 5. Once installed, open the VirtualBox program.
58 | 6. In VirtualBox, click on "New" (the blue badge). This will open a menu to create a new virtual machine.
59 | 7. In the "Name" field of the "Name and operating system" window, type "ubuntu". VirtualBox will automatically set the type and version for this virtual machine as "Linux" and "Ubuntu".
60 | 8. Select "Next" to proceed to the "Memory size" section.
61 | 9. In this section, you can set the amount of Random Access Memory (RAM) that the virtual machine can use. A suggested amount of RAM will automatically be selected when you get to this page, but you can increase the amount of RAM allocated using the slider on this page.
62 | 10. **Note:** If you use the slider to increase the amount of RAM allocated on the "Memory Size" page, keep the slider in the green zone. Setting the slider in the orange or red zone (>50% of your computer's available RAM) will negatively affect the performance of the virtual machine.
63 | 11. Select "Next" to proceed to the "Hard disk" page.
64 | 12. Select "Create a virtual hard disk now", and then select "Create".
65 | 13. On the "Hard disk file type" page, select "VDI (VirtualBox Disk Image)", and then select "Next".
66 | 14. On the "Store on physical hard disk", select "Dynamically allocated", and then select "Next" to proceed to the "File location and size" page.
67 | 15. On this page, you can change the location and size of the virtual hard disk. There is no need to adjust the size of the virtual hard disk, but take note of its location (the folder/directory it will be created in). Select "Create".
68 | 16. In the left side of the VirtualBox main menu, double-click the name of the virtual machine you just created ("ubuntu").
69 | 17. This will bring up the "Select start-up disk" window. In this window, select the folder icon to open the "Optical Disk Selector" menu.
70 | 18. In this menu, select "Add", which will open a window titled "Please choose a virtual optical disk file".
71 | 19. In this window, go to the folder into which the Ubuntu disk image downloaded (e.g. "Downloads"), and click the [Ubuntu disk image (filename: `ubuntu-18.04.4-desktop-amd64.iso`)](#direct-links-to-download-required-files) to select it, and then select "Open".
72 | 20. This will bring you back to the "Optical Disk Selector" window. Select the Ubuntu disk image you selected in the previous window, and click on "Choose".
73 | 21. This will bring you back to the "Select start-up disk" window. The Ubuntu disk image should be selected in the drop down menu (this read "Empty" before the Ubuntu disk image was added). Select "Start" to start the virtual machine.
74 | 22. The Ubuntu virtual machine is now running in its own window. It may take a few minutes to start up the first time.
75 | 23. On the "Welcome" screen in Ubuntu, select "Install Ubuntu".
76 | 24. In the "Keyboard layout" section, select your keyboard layout, and then select "Continue". This will bring you to the "Updates and other software" window.
77 | 25. In this window, in the section "What apps would you like to install to start with?", select "Minimal installation".
78 | 26. In the "Other options" section, select "Download updates while installing Ubuntu", and leave "Install third-party software..." unselected. Select "Continue" to proceed to the "Installation type" window.
79 | 27. In this window, select "Erase disk and install Ubuntu". As this is a virtual machine, in this instance "disk" refers to the virtual disk image (`.vdi`) file created earlier (see steps 12 to 15). Select "Install now".
80 | 28. A window titled "Write the changes to disks?" will appear. In this window, select "Continue".
81 | 29. This will bring you to the "Where are you?" window. In this window, enter your location (which is needed to set the system clock) and select "Continue".
82 | 30. Fill in the requested details in the "Who are you?" window: your name, your computer's name, your username (both of which will be filled in automatically when you enter your name), and your password. Make sure you remember your password, you will need it to install programs in your Ubuntu virtual machine. Select "Continue" to proceed.
83 | 31. At this point, Ubuntu will begin installing on the virtual disk image created earlier (the `.vdi` file). This will take a few minutes.
84 | 32. Once the installation is complete, select "Restart Now" from the "Installation complete" dialog window.
85 | 33. When asked "Please remove the installation media and press ENTER", press Enter (a.k.a. Return).
86 | 34. The virtual machine will then restart, and the Ubuntu login page will load. On this page, select the user you created during the installation, and enter your password to log in.
87 | 35. Once you have logged in, you have finished setting up your Ubuntu virtual machine. Click through the "What's new in Ubuntu" window for a brief introduction to Ubuntu.
88 | 36. When you want to close your Ubuntu virtual machine, close the window it is running in to bring up the "Close Virtual Machine" window, select "Power off the machine" and click "OK". This is the equivalent of shutting down the machine. Alternatively, you can select "Power off" within the Ubuntu virtual machine.
89 |
90 | Once you have finished installing the Ubuntu virtual machine, you can delete the Ubuntu disk image (filename: `ubuntu-18.04.4-desktop-amd64.iso`), and the VirtualBox installer.
91 |
92 | ## Increasing the screen resolution of the Ubuntu virtual machine
93 |
94 | At this point, the Ubuntu virtual machine takes up only a small portion of the VirtualBox window it runs in.
95 | To increase the screen resolution of the Ubuntu virtual machine, you will need to download the [VirtualBox Extension Package](#direct-links-to-download-required-files) and follow the steps below.
96 |
97 | 1. Once downloaded, double click the VirtualBox Extension Pack (file extension `.vbox-extpack`). If you have installed the VirtualBox program, it will open this file.
98 | 2. VirtualBox will open with a window notifying that an extension pack is about to be installed. In this window, select "Install" to proceed with the extension pack installation.
99 | 3. Scroll to the bottom of the Terms and Conditions window that opens, and select "I Agree" to install the extension pack.
100 | 4. Open the Ubuntu virtual machine in VirtualBox.
101 | 5. In the menu bar of the VirtualBox window in which Ubuntu is running, select the "Devices" menu, and select "Insert Guest Additions CD image...".
102 | 6. A notification will appear in the Ubuntu virtual machine: '"VBox_GAs_6.1.4" contains software intended to be automatically started. Would you like to run it?'. In this window, select "Run", and enter your Ubuntu password to install the VirtualBox Guest Additions on the Ubuntu virtual machine.
103 | 7. A terminal window will open showing the VirtualBox Guest Additions installation progress. Once the installation has finished, press Return (Enter) to close this window.
104 | 8. Close the Ubuntu virtual machine by closing the window it is running in, and selecting "Power off the machine" from the "Close Virtual Machine" window.
105 | 9. Open the Ubuntu virtual machine in VirtualBox.
106 | 10. In the menu bar of the window in which Ubuntu is running, select the "View" menu, and confirm that "Auto-resize Guest Display" is enabled.
107 |
108 | ## See also
109 |
110 | - [Introduction to the command line](cl_intro.md)
111 | - [Windows Subsystem for Linux](wsl.md)
112 | - [The Ubuntu Website](https://ubuntu.com/)
113 | - [The VirtualBox Website](https://www.virtualbox.org/)
114 |
115 | ## References
116 |
117 | - [What is a Virtual Machine?](https://azure.microsoft.com/en-us/overview/what-is-a-virtual-machine/)
118 | - [How to Install Ubuntu on VirtualBox](https://www.wikihow.com/Install-Ubuntu-on-VirtualBox)
119 |
--------------------------------------------------------------------------------
/docs/wsl.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | title: Windows Subsystem for Linux
4 | parent: 1. General guides
5 | nav_order: 2
6 | ---
7 |
8 | # Windows Subsystem for Linux
9 |
10 | Windows Subsystem for Linux (WSL) is a feature of Windows 10.
11 | When enabled, WSL allows Linux systems (e.g. Ubuntu) to be used as Windows applications.
12 | These Linux systems can be downloaded directly from the Microsoft Store.
13 | The bioinfo-notebook [conda](conda.md) environment can be installed in an Ubuntu system running using WSL.
14 |
15 | ## Installing Ubuntu on Windows 10 using WSL
16 |
17 | Before you begin, make sure you have around 1.20 GB of free disk space.
18 |
19 | ### Enable WSL
20 |
21 | *Note:* Enabling the WSL feature will take a few minutes, and you will need to restart your computer for it to take effect.
22 |
23 | 1. In the search box on the taskbar, type "control panel", and then select Control Panel.
24 | 2. In the Control Panel, select "Programs".
25 | 3. Under Programs and Features, select "Turn Windows features on or off".
26 | 4. If asked "Do you want this app to make changes to your device?", select "Yes".
27 | 5. From the list of Windows features, tick the box next to "Windows Subsystem for Linux" to enable WSL, and click OK.
28 |
29 | ### Download Ubuntu from the Microsoft Store
30 |
31 | 1. In the search box on the taskbar, type "microsoft store", and select Microsoft Store.
32 | 2. In the Microsoft Store, search for "Ubuntu".
33 | 3. Select the Ubuntu app.
34 | 4. On the app page, select "Get" to download Ubuntu.
35 | 5. If asked to sign in with a Microsoft account, select "No, thanks".
36 |
37 | After enabling WSL and downloading Ubuntu from the Microsoft Store, Ubuntu can be used like a regular Windows application.
38 |
39 | ### Running Ubuntu for the first time
40 |
41 | 1. In the search box on the taskbar, type "Ubuntu", and select the Ubuntu app to launch it. It will take a few minutes to install the first time it runs.
42 | 2. When prompted, enter a UNIX username - this does not need to be the same as your Windows account name.
43 | 3. You will need to set a UNIX password. This is only used for the Ubuntu app, it does not need to be the same as your Windows password. Make sure you remember your UNIX password, as you will need it for installing new programs in Ubuntu.
44 |
45 | Once your UNIX password has been updated successfully, you will see the `bash` command prompt in the Ubuntu window:
46 |
47 | ```
48 | (Your UNIX username)@(Your computer's alias):~$ _
49 | ```
50 |
51 | In this command prompt, the tilde character (`~`) indicates that you are currently in your home directory.
52 | The dollar sign (`$`) indicates that this command line uses the `bash` shell language.
53 |
54 | ## See also
55 |
56 | - [Introduction to the command line](cl_intro.md)
57 | - [Using Ubuntu through a Virtual Machine](ubuntu_virtualbox.md)
58 | - [conda](conda.md)
59 |
--------------------------------------------------------------------------------
/envs/augustus.yml:
--------------------------------------------------------------------------------
1 | name: augustus
2 | channels:
3 | - bioconda
4 | - conda-forge
5 | - cf-staging
6 | - defaults
7 | dependencies:
8 | - _libgcc_mutex=0.1=conda_forge
9 | - _openmp_mutex=4.5=0_gnu
10 | - augustus=3.3.3=pl526hce533f5_0
11 | - biopython=1.77=py38h1e0a361_0
12 | - boost=1.70.0=py38h9de70de_1
13 | - boost-cpp=1.70.0=h7b93d67_3
14 | - bzip2=1.0.8=h516909a_2
15 | - ca-certificates=2020.6.20=hecda079_0
16 | - certifi=2020.6.20=py38h32f6830_0
17 | - curl=7.71.1=he644dc0_0
18 | - gsl=2.5=h294904e_1
19 | - htslib=1.9=h4da6232_3
20 | - icu=67.1=he1b5a44_0
21 | - krb5=1.17.1=hfafb76e_1
22 | - ld_impl_linux-64=2.34=h53a641e_5
23 | - libblas=3.8.0=11_openblas
24 | - libcblas=3.8.0=11_openblas
25 | - libcurl=7.71.1=hcdd3856_0
26 | - libdeflate=1.2=h516909a_1
27 | - libedit=3.1.20191231=h46ee950_1
28 | - libffi=3.2.1=he1b5a44_1007
29 | - libgcc-ng=9.2.0=h24d8f2e_2
30 | - libgfortran-ng=7.5.0=hdf63c60_9
31 | - libgomp=9.2.0=h24d8f2e_2
32 | - liblapack=3.8.0=11_openblas
33 | - libopenblas=0.3.6=h6e990d7_6
34 | - libssh2=1.9.0=hab1572f_3
35 | - libstdcxx-ng=9.2.0=hdf63c60_2
36 | - lp_solve=5.5.2.5=h14c3975_1001
37 | - lz4-c=1.9.2=he1b5a44_1
38 | - metis=5.1.0=he1b5a44_1005
39 | - ncurses=6.1=hf484d3e_1002
40 | - numpy=1.18.5=py38h8854b6b_0
41 | - openblas=0.3.6=h6e990d7_6
42 | - openssl=1.1.1g=h516909a_0
43 | - perl=5.26.2=h516909a_1006
44 | - perl-apache-test=1.40=pl526_1
45 | - perl-app-cpanminus=1.7044=pl526_1
46 | - perl-base=2.23=pl526_1
47 | - perl-carp=1.38=pl526_3
48 | - perl-class-load=0.25=pl526_0
49 | - perl-class-load-xs=0.10=pl526h6bb024c_2
50 | - perl-class-method-modifiers=2.12=pl526_0
51 | - perl-constant=1.33=pl526_1
52 | - perl-cpan-meta=2.150010=pl526_0
53 | - perl-cpan-meta-requirements=2.140=pl526_0
54 | - perl-cpan-meta-yaml=0.018=pl526_0
55 | - perl-data-dumper=2.173=pl526_0
56 | - perl-data-optlist=0.110=pl526_2
57 | - perl-dbi=1.642=pl526_0
58 | - perl-devel-globaldestruction=0.14=pl526_0
59 | - perl-devel-overloadinfo=0.005=pl526_0
60 | - perl-devel-stacktrace=2.04=pl526_0
61 | - perl-dist-checkconflicts=0.11=pl526_2
62 | - perl-encode=2.88=pl526_1
63 | - perl-eval-closure=0.14=pl526h6bb024c_4
64 | - perl-exporter=5.72=pl526_1
65 | - perl-extutils-cbuilder=0.280230=pl526_1
66 | - perl-extutils-makemaker=7.36=pl526_1
67 | - perl-extutils-manifest=1.72=pl526_0
68 | - perl-extutils-parsexs=3.35=pl526_0
69 | - perl-file-path=2.16=pl526_0
70 | - perl-file-temp=0.2304=pl526_2
71 | - perl-file-which=1.23=pl526_0
72 | - perl-getopt-long=2.50=pl526_1
73 | - perl-ipc-cmd=1.02=pl526_0
74 | - perl-json-pp=4.04=pl526_0
75 | - perl-locale-maketext-simple=0.21=pl526_2
76 | - perl-module-build=0.4224=pl526_3
77 | - perl-module-corelist=5.20190524=pl526_0
78 | - perl-module-implementation=0.09=pl526_2
79 | - perl-module-load=0.32=pl526_1
80 | - perl-module-load-conditional=0.68=pl526_2
81 | - perl-module-metadata=1.000036=pl526_0
82 | - perl-module-runtime=0.016=pl526_1
83 | - perl-module-runtime-conflicts=0.003=pl526_0
84 | - perl-moo=2.003004=pl526_0
85 | - perl-moose=2.2011=pl526hf484d3e_1
86 | - perl-mro-compat=0.13=pl526_0
87 | - perl-package-deprecationmanager=0.17=pl526_0
88 | - perl-package-stash=0.38=pl526hf484d3e_1
89 | - perl-package-stash-xs=0.28=pl526hf484d3e_1
90 | - perl-parallel-forkmanager=2.02=pl526_0
91 | - perl-params-check=0.38=pl526_1
92 | - perl-params-util=1.07=pl526h6bb024c_4
93 | - perl-parent=0.236=pl526_1
94 | - perl-pathtools=3.75=pl526h14c3975_1
95 | - perl-perl-ostype=1.010=pl526_1
96 | - perl-role-tiny=2.000008=pl526_0
97 | - perl-scalar-list-utils=1.52=pl526h516909a_0
98 | - perl-storable=3.15=pl526h14c3975_0
99 | - perl-sub-exporter=0.987=pl526_2
100 | - perl-sub-exporter-progressive=0.001013=pl526_0
101 | - perl-sub-identify=0.14=pl526h14c3975_0
102 | - perl-sub-install=0.928=pl526_2
103 | - perl-sub-name=0.21=pl526_1
104 | - perl-sub-quote=2.006003=pl526_1
105 | - perl-text-abbrev=1.02=pl526_0
106 | - perl-text-parsewords=3.30=pl526_0
107 | - perl-try-tiny=0.30=pl526_1
108 | - perl-version=0.9924=pl526_0
109 | - perl-xsloader=0.24=pl526_0
110 | - perl-yaml=1.29=pl526_0
111 | - pip=20.1.1=py_1
112 | - python=3.8.3=cpython_he5300dc_0
113 | - python_abi=3.8=1_cp38
114 | - readline=8.0=h46ee950_1
115 | - setuptools=49.1.0=py38h32f6830_0
116 | - sqlite=3.32.3=hcee41ef_1
117 | - suitesparse=4.5.6=h717dc36_1204
118 | - tbb=2020.1=hc9558a2_0
119 | - tk=8.6.10=hed695b0_0
120 | - wheel=0.34.2=py_1
121 | - xz=5.2.5=h516909a_1
122 | - zlib=1.2.11=h516909a_1006
123 | - zstd=1.4.4=h6597ccf_3
124 |
--------------------------------------------------------------------------------
/envs/bioinfo-notebook.txt:
--------------------------------------------------------------------------------
1 | # This file may be used to create an environment using:
2 | # $ conda create --name --file
3 | # platform: linux-64
4 | @EXPLICIT
5 | https://repo.anaconda.com/pkgs/main/linux-64/_libgcc_mutex-0.1-main.conda
6 | https://repo.anaconda.com/pkgs/main/linux-64/blas-1.0-mkl.conda
7 | https://repo.anaconda.com/pkgs/main/linux-64/ca-certificates-2020.10.14-0.conda
8 | https://repo.anaconda.com/pkgs/main/linux-64/intel-openmp-2020.1-217.conda
9 | https://repo.anaconda.com/pkgs/main/linux-64/jpeg-9b-h024ee3a_2.conda
10 | https://repo.anaconda.com/pkgs/main/linux-64/ld_impl_linux-64-2.33.1-h53a641e_7.conda
11 | https://repo.anaconda.com/pkgs/main/linux-64/libgcc-ng-9.1.0-hdf63c60_0.conda
12 | https://repo.anaconda.com/pkgs/main/linux-64/libgfortran-ng-7.3.0-hdf63c60_0.conda
13 | https://repo.anaconda.com/pkgs/main/linux-64/libstdcxx-ng-9.1.0-hdf63c60_0.conda
14 | https://repo.anaconda.com/pkgs/main/linux-64/bzip2-1.0.8-h7b6447c_0.conda
15 | https://repo.anaconda.com/pkgs/main/linux-64/expat-2.2.9-he6710b0_2.conda
16 | https://repo.anaconda.com/pkgs/main/linux-64/icu-58.2-he6710b0_3.conda
17 | https://conda.anaconda.org/bioconda/linux-64/libdeflate-1.0-h14c3975_1.tar.bz2
18 | https://repo.anaconda.com/pkgs/main/linux-64/libffi-3.3-he6710b0_2.conda
19 | https://repo.anaconda.com/pkgs/main/linux-64/libiconv-1.15-h63c8f33_5.conda
20 | https://repo.anaconda.com/pkgs/main/linux-64/libuuid-1.0.3-h1bed415_2.conda
21 | https://repo.anaconda.com/pkgs/main/linux-64/libxcb-1.14-h7b6447c_0.conda
22 | https://repo.anaconda.com/pkgs/main/linux-64/lz4-c-1.9.2-he6710b0_1.conda
23 | https://repo.anaconda.com/pkgs/main/linux-64/lzo-2.10-h7b6447c_2.conda
24 | https://repo.anaconda.com/pkgs/main/linux-64/mkl-2020.1-217.conda
25 | https://repo.anaconda.com/pkgs/main/linux-64/ncurses-6.2-he6710b0_1.conda
26 | https://repo.anaconda.com/pkgs/main/linux-64/openssl-1.1.1h-h7b6447c_0.conda
27 | https://repo.anaconda.com/pkgs/main/linux-64/pcre-8.44-he6710b0_0.conda
28 | https://repo.anaconda.com/pkgs/main/linux-64/perl-5.26.2-h14c3975_0.conda
29 | https://repo.anaconda.com/pkgs/main/linux-64/snappy-1.1.8-he6710b0_0.conda
30 | https://repo.anaconda.com/pkgs/main/linux-64/tbb-2020.0-hfd86e86_0.conda
31 | https://repo.anaconda.com/pkgs/main/linux-64/xz-5.2.5-h7b6447c_0.conda
32 | https://repo.anaconda.com/pkgs/main/linux-64/zlib-1.2.11-h7b6447c_3.conda
33 | https://repo.anaconda.com/pkgs/main/linux-64/blosc-1.19.0-hd408876_0.conda
34 | https://repo.anaconda.com/pkgs/main/linux-64/glib-2.65.0-h3eb4bd4_0.conda
35 | https://repo.anaconda.com/pkgs/main/linux-64/gstreamer-1.14.0-hb31296c_0.conda
36 | https://repo.anaconda.com/pkgs/main/linux-64/hdf5-1.10.4-hb1b8bf9_0.conda
37 | https://repo.anaconda.com/pkgs/main/linux-64/libedit-3.1.20191231-h14c3975_1.conda
38 | https://repo.anaconda.com/pkgs/main/linux-64/libpng-1.6.37-hbc83047_0.conda
39 | https://repo.anaconda.com/pkgs/main/linux-64/libssh2-1.9.0-h1ba5d50_1.conda
40 | https://repo.anaconda.com/pkgs/main/linux-64/libxml2-2.9.10-he19cac6_1.conda
41 | https://conda.anaconda.org/bioconda/linux-64/perl-app-cpanminus-1.7044-pl526_1.tar.bz2
42 | https://conda.anaconda.org/bioconda/linux-64/perl-base-2.23-pl526_1.tar.bz2
43 | https://conda.anaconda.org/bioconda/linux-64/perl-common-sense-3.74-pl526_2.tar.bz2
44 | https://conda.anaconda.org/bioconda/linux-64/perl-compress-raw-bzip2-2.087-pl526he1b5a44_0.tar.bz2
45 | https://conda.anaconda.org/bioconda/linux-64/perl-compress-raw-zlib-2.087-pl526hc9558a2_0.tar.bz2
46 | https://conda.anaconda.org/bioconda/linux-64/perl-constant-1.33-pl526_1.tar.bz2
47 | https://conda.anaconda.org/bioconda/linux-64/perl-data-dumper-2.173-pl526_0.tar.bz2
48 | https://conda.anaconda.org/bioconda/linux-64/perl-digest-hmac-1.03-pl526_3.tar.bz2
49 | https://conda.anaconda.org/bioconda/linux-64/perl-digest-md5-2.55-pl526_0.tar.bz2
50 | https://conda.anaconda.org/bioconda/linux-64/perl-exporter-5.72-pl526_1.tar.bz2
51 | https://conda.anaconda.org/bioconda/linux-64/perl-exporter-tiny-1.002001-pl526_0.tar.bz2
52 | https://conda.anaconda.org/bioconda/linux-64/perl-extutils-makemaker-7.36-pl526_1.tar.bz2
53 | https://conda.anaconda.org/bioconda/linux-64/perl-html-tagset-3.20-pl526_3.tar.bz2
54 | https://conda.anaconda.org/bioconda/linux-64/perl-io-html-1.001-pl526_2.tar.bz2
55 | https://conda.anaconda.org/bioconda/linux-64/perl-io-zlib-1.10-pl526_2.tar.bz2
56 | https://conda.anaconda.org/bioconda/linux-64/perl-mozilla-ca-20180117-pl526_1.tar.bz2
57 | https://conda.anaconda.org/bioconda/linux-64/perl-parent-0.236-pl526_1.tar.bz2
58 | https://conda.anaconda.org/bioconda/linux-64/perl-scalar-list-utils-1.52-pl526h516909a_0.tar.bz2
59 | https://conda.anaconda.org/bioconda/linux-64/perl-socket-2.027-pl526_1.tar.bz2
60 | https://conda.anaconda.org/bioconda/linux-64/perl-try-tiny-0.30-pl526_1.tar.bz2
61 | https://conda.anaconda.org/bioconda/linux-64/perl-xml-parser-2.44-pl526h4e0c4b3_7.tar.bz2
62 | https://conda.anaconda.org/bioconda/linux-64/perl-xml-sax-base-1.09-pl526_0.tar.bz2
63 | https://conda.anaconda.org/bioconda/linux-64/perl-xsloader-0.24-pl526_0.tar.bz2
64 | https://repo.anaconda.com/pkgs/main/linux-64/readline-8.0-h7b6447c_0.conda
65 | https://conda.anaconda.org/bioconda/linux-64/subread-2.0.0-hed695b0_0.tar.bz2
66 | https://repo.anaconda.com/pkgs/main/linux-64/tk-8.6.10-hbc83047_0.conda
67 | https://repo.anaconda.com/pkgs/main/linux-64/zstd-1.4.5-h9ceee32_0.conda
68 | https://repo.anaconda.com/pkgs/main/linux-64/dbus-1.13.16-hb2f20db_0.conda
69 | https://repo.anaconda.com/pkgs/main/linux-64/freetype-2.10.2-h5ab3b9f_0.conda
70 | https://repo.anaconda.com/pkgs/main/linux-64/gst-plugins-base-1.14.0-hbbd80ab_1.conda
71 | https://repo.anaconda.com/pkgs/main/linux-64/krb5-1.18.2-h173b8e3_0.conda
72 | https://conda.anaconda.org/bioconda/linux-64/ncbi-ngs-sdk-2.10.0-hdf6179e_0.tar.bz2
73 | https://conda.anaconda.org/bioconda/linux-64/perl-carp-1.38-pl526_3.tar.bz2
74 | https://conda.anaconda.org/bioconda/linux-64/perl-encode-2.88-pl526_1.tar.bz2
75 | https://conda.anaconda.org/bioconda/linux-64/perl-file-path-2.16-pl526_0.tar.bz2
76 | https://conda.anaconda.org/bioconda/linux-64/perl-html-parser-3.72-pl526h6bb024c_5.tar.bz2
77 | https://conda.anaconda.org/bioconda/linux-64/perl-io-compress-2.087-pl526he1b5a44_0.tar.bz2
78 | https://conda.anaconda.org/bioconda/linux-64/perl-list-moreutils-xs-0.428-pl526_0.tar.bz2
79 | https://conda.anaconda.org/bioconda/linux-64/perl-mime-base64-3.15-pl526_1.tar.bz2
80 | https://conda.anaconda.org/bioconda/linux-64/perl-ntlm-1.09-pl526_4.tar.bz2
81 | https://conda.anaconda.org/bioconda/linux-64/perl-storable-3.15-pl526h14c3975_0.tar.bz2
82 | https://conda.anaconda.org/bioconda/linux-64/perl-test-requiresinternet-0.05-pl526_0.tar.bz2
83 | https://conda.anaconda.org/bioconda/linux-64/perl-types-serialiser-1.0-pl526_2.tar.bz2
84 | https://conda.anaconda.org/bioconda/linux-64/perl-xml-namespacesupport-1.12-pl526_0.tar.bz2
85 | https://repo.anaconda.com/pkgs/main/linux-64/sqlite-3.32.3-h62c20be_0.conda
86 | https://repo.anaconda.com/pkgs/main/linux-64/fontconfig-2.13.0-h9420a91_0.conda
87 | https://repo.anaconda.com/pkgs/main/linux-64/libcurl-7.71.1-h20c2e04_1.conda
88 | https://conda.anaconda.org/bioconda/linux-64/perl-business-isbn-data-20140910.003-pl526_0.tar.bz2
89 | https://conda.anaconda.org/bioconda/linux-64/perl-encode-locale-1.05-pl526_6.tar.bz2
90 | https://conda.anaconda.org/bioconda/linux-64/perl-file-temp-0.2304-pl526_2.tar.bz2
91 | https://conda.anaconda.org/bioconda/linux-64/perl-html-tree-5.07-pl526_1.tar.bz2
92 | https://conda.anaconda.org/bioconda/linux-64/perl-json-xs-2.34-pl526h6bb024c_3.tar.bz2
93 | https://conda.anaconda.org/bioconda/linux-64/perl-list-moreutils-0.428-pl526_1.tar.bz2
94 | https://conda.anaconda.org/bioconda/linux-64/perl-lwp-mediatypes-6.04-pl526_0.tar.bz2
95 | https://conda.anaconda.org/bioconda/linux-64/perl-net-ssleay-1.88-pl526h90d6eec_0.tar.bz2
96 | https://conda.anaconda.org/bioconda/linux-64/perl-pathtools-3.75-pl526h14c3975_1.tar.bz2
97 | https://conda.anaconda.org/bioconda/linux-64/perl-time-local-1.28-pl526_1.tar.bz2
98 | https://repo.anaconda.com/pkgs/main/linux-64/python-3.7.7-hcff3b4d_5.conda
99 | https://conda.anaconda.org/conda-forge/linux-64/asciinema-2.0.2-py37_1000.tar.bz2
100 | https://conda.anaconda.org/bioconda/linux-64/bowtie-1.2.3-py37hc9558a2_0.tar.bz2
101 | https://conda.anaconda.org/bioconda/linux-64/bowtie2-2.3.5.1-py37he513fc3_0.tar.bz2
102 | https://repo.anaconda.com/pkgs/main/noarch/certifi-2020.6.20-pyhd3eb1b0_3.conda
103 | https://repo.anaconda.com/pkgs/main/linux-64/chardet-3.0.4-py37_1003.conda
104 | https://repo.anaconda.com/pkgs/main/noarch/click-7.1.2-py_0.conda
105 | https://repo.anaconda.com/pkgs/main/linux-64/curl-7.71.1-hbc83047_1.conda
106 | https://repo.anaconda.com/pkgs/main/noarch/decorator-4.4.2-py_0.conda
107 | https://repo.anaconda.com/pkgs/main/linux-64/future-0.18.2-py37_1.conda
108 | https://repo.anaconda.com/pkgs/main/noarch/idna-2.10-py_0.conda
109 | https://repo.anaconda.com/pkgs/main/linux-64/kiwisolver-1.2.0-py37hfd86e86_0.conda
110 | https://repo.anaconda.com/pkgs/main/noarch/mock-4.0.2-py_0.conda
111 | https://repo.anaconda.com/pkgs/main/linux-64/msgpack-python-1.0.0-py37hfd86e86_1.conda
112 | https://conda.anaconda.org/bioconda/linux-64/perl-archive-tar-2.32-pl526_0.tar.bz2
113 | https://conda.anaconda.org/bioconda/linux-64/perl-business-isbn-3.004-pl526_0.tar.bz2
114 | https://conda.anaconda.org/bioconda/linux-64/perl-http-date-6.02-pl526_3.tar.bz2
115 | https://conda.anaconda.org/bioconda/linux-64/perl-io-socket-ssl-2.066-pl526_0.tar.bz2
116 | https://conda.anaconda.org/bioconda/linux-64/perl-json-4.02-pl526_0.tar.bz2
117 | https://conda.anaconda.org/bioconda/noarch/perl-xml-sax-1.02-pl526_0.tar.bz2
118 | https://repo.anaconda.com/pkgs/main/noarch/pycparser-2.20-py_2.conda
119 | https://repo.anaconda.com/pkgs/main/linux-64/pymongo-3.11.0-py37he6710b0_0.conda
120 | https://repo.anaconda.com/pkgs/main/noarch/pyparsing-2.4.7-py_0.conda
121 | https://repo.anaconda.com/pkgs/main/linux-64/pysocks-1.7.1-py37_1.conda
122 | https://repo.anaconda.com/pkgs/main/noarch/pytz-2020.1-py_0.conda
123 | https://repo.anaconda.com/pkgs/main/linux-64/qt-5.9.7-h5867ecd_1.conda
124 | https://conda.anaconda.org/bioconda/noarch/semidbm-0.5.1-pyh864c0ab_3.tar.bz2
125 | https://repo.anaconda.com/pkgs/main/linux-64/sip-4.19.8-py37hf484d3e_0.conda
126 | https://repo.anaconda.com/pkgs/main/noarch/six-1.15.0-py_0.conda
127 | https://conda.anaconda.org/bioconda/linux-64/spades-3.13.0-0.tar.bz2
128 | https://repo.anaconda.com/pkgs/main/linux-64/tornado-6.0.4-py37h7b6447c_1.conda
129 | https://repo.anaconda.com/pkgs/main/noarch/tqdm-4.48.2-py_0.conda
130 | https://conda.anaconda.org/bioconda/linux-64/bcftools-1.9-ha228f0b_4.tar.bz2
131 | https://repo.anaconda.com/pkgs/main/linux-64/cffi-1.14.1-py37he30daa8_0.conda
132 | https://repo.anaconda.com/pkgs/main/linux-64/cycler-0.10.0-py37_0.conda
133 | https://repo.anaconda.com/pkgs/main/linux-64/mkl-service-2.3.0-py37he904b0f_0.conda
134 | https://conda.anaconda.org/bioconda/linux-64/perl-file-listing-6.04-pl526_1.tar.bz2
135 | https://conda.anaconda.org/bioconda/linux-64/perl-uri-1.76-pl526_0.tar.bz2
136 | https://conda.anaconda.org/bioconda/linux-64/perl-xml-libxml-2.0132-pl526h7ec2d77_1.tar.bz2
137 | https://conda.anaconda.org/bioconda/linux-64/perl-xml-sax-expat-0.51-pl526_3.tar.bz2
138 | https://repo.anaconda.com/pkgs/main/linux-64/pyqt-5.9.2-py37h05f1152_2.conda
139 | https://conda.anaconda.org/bioconda/linux-64/pysam-0.15.3-py37hda2845c_1.tar.bz2
140 | https://repo.anaconda.com/pkgs/main/noarch/python-dateutil-2.8.1-py_0.tar.bz2
141 | https://conda.anaconda.org/bioconda/linux-64/samtools-1.6-h244ad75_5.tar.bz2
142 | https://repo.anaconda.com/pkgs/main/linux-64/setuptools-49.4.0-py37_0.conda
143 | https://repo.anaconda.com/pkgs/main/linux-64/brotlipy-0.7.0-py37h7b6447c_1000.conda
144 | https://repo.anaconda.com/pkgs/main/linux-64/cryptography-2.9.2-py37h1ba5d50_0.conda
145 | https://repo.anaconda.com/pkgs/main/noarch/networkx-2.4-py_1.conda
146 | https://repo.anaconda.com/pkgs/main/linux-64/numpy-base-1.19.2-py37hfa32c7d_0.conda
147 | https://conda.anaconda.org/bioconda/linux-64/perl-http-message-6.18-pl526_0.tar.bz2
148 | https://conda.anaconda.org/bioconda/noarch/perl-net-http-6.19-pl526_0.tar.bz2
149 | https://conda.anaconda.org/bioconda/linux-64/perl-www-robotrules-6.02-pl526_3.tar.bz2
150 | https://conda.anaconda.org/bioconda/linux-64/perl-xml-simple-2.25-pl526_1.tar.bz2
151 | https://conda.anaconda.org/bioconda/linux-64/sra-tools-2.10.0-pl526he1b5a44_0.tar.bz2
152 | https://repo.anaconda.com/pkgs/main/linux-64/wheel-0.34.2-py37_0.conda
153 | https://conda.anaconda.org/bioconda/linux-64/perl-http-cookies-6.04-pl526_0.tar.bz2
154 | https://conda.anaconda.org/bioconda/linux-64/perl-http-daemon-6.01-pl526_1.tar.bz2
155 | https://conda.anaconda.org/bioconda/linux-64/perl-http-negotiate-6.01-pl526_3.tar.bz2
156 | https://repo.anaconda.com/pkgs/main/linux-64/pip-20.2.2-py37_0.conda
157 | https://repo.anaconda.com/pkgs/main/noarch/pyopenssl-19.1.0-py_1.conda
158 | https://conda.anaconda.org/bioconda/noarch/perl-libwww-perl-6.39-pl526_0.tar.bz2
159 | https://repo.anaconda.com/pkgs/main/noarch/urllib3-1.25.10-py_0.conda
160 | https://conda.anaconda.org/bioconda/linux-64/perl-lwp-protocol-https-6.07-pl526_4.tar.bz2
161 | https://repo.anaconda.com/pkgs/main/noarch/requests-2.24.0-py_0.conda
162 | https://conda.anaconda.org/bioconda/linux-64/entrez-direct-13.3-pl526h375a9b1_0.tar.bz2
163 | https://conda.anaconda.org/bioconda/linux-64/blast-2.9.0-pl526h3066fca_4.tar.bz2
164 | https://repo.anaconda.com/pkgs/main/linux-64/matplotlib-3.2.2-0.conda
165 | https://conda.anaconda.org/bioconda/linux-64/htseq-0.11.2-py37h637b7d7_1.tar.bz2
166 | https://repo.anaconda.com/pkgs/main/linux-64/matplotlib-base-3.2.2-py37hef1b27d_0.conda
167 | https://repo.anaconda.com/pkgs/main/linux-64/mkl_fft-1.1.0-py37h23d657b_0.conda
168 | https://repo.anaconda.com/pkgs/main/linux-64/mkl_random-1.1.1-py37h0573a6f_0.conda
169 | https://repo.anaconda.com/pkgs/main/linux-64/numpy-1.19.2-py37h54aff64_0.conda
170 | https://repo.anaconda.com/pkgs/main/linux-64/numexpr-2.7.1-py37h423224d_0.conda
171 | https://repo.anaconda.com/pkgs/main/linux-64/pandas-1.1.0-py37he6710b0_0.conda
172 | https://repo.anaconda.com/pkgs/main/linux-64/scipy-1.5.0-py37h0b6359f_0.conda
173 | https://repo.anaconda.com/pkgs/main/linux-64/patsy-0.5.1-py37_0.conda
174 | https://repo.anaconda.com/pkgs/main/linux-64/pytables-3.6.1-py37h71ec239_0.conda
175 | https://repo.anaconda.com/pkgs/main/linux-64/statsmodels-0.11.1-py37h7b6447c_0.conda
176 | https://conda.anaconda.org/bioconda/linux-64/mgkit-0.4.2-py37h516909a_0.tar.bz2
177 |
--------------------------------------------------------------------------------
/envs/bioinfo-notebook.yml:
--------------------------------------------------------------------------------
1 | name: bioinfo-notebook
2 | channels:
3 | - bioconda
4 | - conda-forge
5 | - defaults
6 | dependencies:
7 | - _libgcc_mutex=0.1=main
8 | - asciinema=2.0.2=py37_1000
9 | - bcftools=1.9=ha228f0b_4
10 | - blas=1.0=mkl
11 | - blast=2.9.0=pl526h3066fca_4
12 | - blosc=1.19.0=hd408876_0
13 | - bowtie=1.2.3=py37hc9558a2_0
14 | - bowtie2=2.3.5.1=py37he513fc3_0
15 | - brotlipy=0.7.0=py37h7b6447c_1000
16 | - bzip2=1.0.8=h7b6447c_0
17 | - ca-certificates=2020.10.14=0
18 | - certifi=2020.6.20=pyhd3eb1b0_3
19 | - cffi=1.14.1=py37he30daa8_0
20 | - chardet=3.0.4=py37_1003
21 | - click=7.1.2=py_0
22 | - cryptography=2.9.2=py37h1ba5d50_0
23 | - curl=7.71.1=hbc83047_1
24 | - cycler=0.10.0=py37_0
25 | - dbus=1.13.16=hb2f20db_0
26 | - decorator=4.4.2=py_0
27 | - entrez-direct=13.3=pl526h375a9b1_0
28 | - expat=2.2.9=he6710b0_2
29 | - fontconfig=2.13.0=h9420a91_0
30 | - freetype=2.10.2=h5ab3b9f_0
31 | - future=0.18.2=py37_1
32 | - glib=2.65.0=h3eb4bd4_0
33 | - gst-plugins-base=1.14.0=hbbd80ab_1
34 | - gstreamer=1.14.0=hb31296c_0
35 | - hdf5=1.10.4=hb1b8bf9_0
36 | - htseq=0.11.2=py37h637b7d7_1
37 | - icu=58.2=he6710b0_3
38 | - idna=2.10=py_0
39 | - intel-openmp=2020.1=217
40 | - jpeg=9b=h024ee3a_2
41 | - kiwisolver=1.2.0=py37hfd86e86_0
42 | - krb5=1.18.2=h173b8e3_0
43 | - ld_impl_linux-64=2.33.1=h53a641e_7
44 | - libcurl=7.71.1=h20c2e04_1
45 | - libdeflate=1.0=h14c3975_1
46 | - libedit=3.1.20191231=h14c3975_1
47 | - libffi=3.3=he6710b0_2
48 | - libgcc-ng=9.1.0=hdf63c60_0
49 | - libgfortran-ng=7.3.0=hdf63c60_0
50 | - libiconv=1.15=h63c8f33_5
51 | - libpng=1.6.37=hbc83047_0
52 | - libssh2=1.9.0=h1ba5d50_1
53 | - libstdcxx-ng=9.1.0=hdf63c60_0
54 | - libuuid=1.0.3=h1bed415_2
55 | - libxcb=1.14=h7b6447c_0
56 | - libxml2=2.9.10=he19cac6_1
57 | - lz4-c=1.9.2=he6710b0_1
58 | - lzo=2.10=h7b6447c_2
59 | - matplotlib=3.2.2=0
60 | - matplotlib-base=3.2.2=py37hef1b27d_0
61 | - mgkit=0.4.2=py37h516909a_0
62 | - mkl=2020.1=217
63 | - mkl-service=2.3.0=py37he904b0f_0
64 | - mkl_fft=1.1.0=py37h23d657b_0
65 | - mkl_random=1.1.1=py37h0573a6f_0
66 | - mock=4.0.2=py_0
67 | - msgpack-python=1.0.0=py37hfd86e86_1
68 | - ncbi-ngs-sdk=2.10.0=hdf6179e_0
69 | - ncurses=6.2=he6710b0_1
70 | - networkx=2.4=py_1
71 | - numexpr=2.7.1=py37h423224d_0
72 | - numpy=1.19.2=py37h54aff64_0
73 | - numpy-base=1.19.2=py37hfa32c7d_0
74 | - openssl=1.1.1h=h7b6447c_0
75 | - pandas=1.1.0=py37he6710b0_0
76 | - patsy=0.5.1=py37_0
77 | - pcre=8.44=he6710b0_0
78 | - perl=5.26.2=h14c3975_0
79 | - perl-app-cpanminus=1.7044=pl526_1
80 | - perl-archive-tar=2.32=pl526_0
81 | - perl-base=2.23=pl526_1
82 | - perl-business-isbn=3.004=pl526_0
83 | - perl-business-isbn-data=20140910.003=pl526_0
84 | - perl-carp=1.38=pl526_3
85 | - perl-common-sense=3.74=pl526_2
86 | - perl-compress-raw-bzip2=2.087=pl526he1b5a44_0
87 | - perl-compress-raw-zlib=2.087=pl526hc9558a2_0
88 | - perl-constant=1.33=pl526_1
89 | - perl-data-dumper=2.173=pl526_0
90 | - perl-digest-hmac=1.03=pl526_3
91 | - perl-digest-md5=2.55=pl526_0
92 | - perl-encode=2.88=pl526_1
93 | - perl-encode-locale=1.05=pl526_6
94 | - perl-exporter=5.72=pl526_1
95 | - perl-exporter-tiny=1.002001=pl526_0
96 | - perl-extutils-makemaker=7.36=pl526_1
97 | - perl-file-listing=6.04=pl526_1
98 | - perl-file-path=2.16=pl526_0
99 | - perl-file-temp=0.2304=pl526_2
100 | - perl-html-parser=3.72=pl526h6bb024c_5
101 | - perl-html-tagset=3.20=pl526_3
102 | - perl-html-tree=5.07=pl526_1
103 | - perl-http-cookies=6.04=pl526_0
104 | - perl-http-daemon=6.01=pl526_1
105 | - perl-http-date=6.02=pl526_3
106 | - perl-http-message=6.18=pl526_0
107 | - perl-http-negotiate=6.01=pl526_3
108 | - perl-io-compress=2.087=pl526he1b5a44_0
109 | - perl-io-html=1.001=pl526_2
110 | - perl-io-socket-ssl=2.066=pl526_0
111 | - perl-io-zlib=1.10=pl526_2
112 | - perl-json=4.02=pl526_0
113 | - perl-json-xs=2.34=pl526h6bb024c_3
114 | - perl-libwww-perl=6.39=pl526_0
115 | - perl-list-moreutils=0.428=pl526_1
116 | - perl-list-moreutils-xs=0.428=pl526_0
117 | - perl-lwp-mediatypes=6.04=pl526_0
118 | - perl-lwp-protocol-https=6.07=pl526_4
119 | - perl-mime-base64=3.15=pl526_1
120 | - perl-mozilla-ca=20180117=pl526_1
121 | - perl-net-http=6.19=pl526_0
122 | - perl-net-ssleay=1.88=pl526h90d6eec_0
123 | - perl-ntlm=1.09=pl526_4
124 | - perl-parent=0.236=pl526_1
125 | - perl-pathtools=3.75=pl526h14c3975_1
126 | - perl-scalar-list-utils=1.52=pl526h516909a_0
127 | - perl-socket=2.027=pl526_1
128 | - perl-storable=3.15=pl526h14c3975_0
129 | - perl-test-requiresinternet=0.05=pl526_0
130 | - perl-time-local=1.28=pl526_1
131 | - perl-try-tiny=0.30=pl526_1
132 | - perl-types-serialiser=1.0=pl526_2
133 | - perl-uri=1.76=pl526_0
134 | - perl-www-robotrules=6.02=pl526_3
135 | - perl-xml-libxml=2.0132=pl526h7ec2d77_1
136 | - perl-xml-namespacesupport=1.12=pl526_0
137 | - perl-xml-parser=2.44=pl526h4e0c4b3_7
138 | - perl-xml-sax=1.02=pl526_0
139 | - perl-xml-sax-base=1.09=pl526_0
140 | - perl-xml-sax-expat=0.51=pl526_3
141 | - perl-xml-simple=2.25=pl526_1
142 | - perl-xsloader=0.24=pl526_0
143 | - pip=20.2.2=py37_0
144 | - pycparser=2.20=py_2
145 | - pymongo=3.11.0=py37he6710b0_0
146 | - pyopenssl=19.1.0=py_1
147 | - pyparsing=2.4.7=py_0
148 | - pyqt=5.9.2=py37h05f1152_2
149 | - pysam=0.15.3=py37hda2845c_1
150 | - pysocks=1.7.1=py37_1
151 | - pytables=3.6.1=py37h71ec239_0
152 | - python=3.7.7=hcff3b4d_5
153 | - python-dateutil=2.8.1=py_0
154 | - pytz=2020.1=py_0
155 | - qt=5.9.7=h5867ecd_1
156 | - readline=8.0=h7b6447c_0
157 | - requests=2.24.0=py_0
158 | - samtools=1.6=h244ad75_5
159 | - scipy=1.5.0=py37h0b6359f_0
160 | - semidbm=0.5.1=pyh864c0ab_3
161 | - setuptools=49.4.0=py37_0
162 | - sip=4.19.8=py37hf484d3e_0
163 | - six=1.15.0=py_0
164 | - snappy=1.1.8=he6710b0_0
165 | - spades=3.13.0=0
166 | - sqlite=3.32.3=h62c20be_0
167 | - sra-tools=2.10.0=pl526he1b5a44_0
168 | - statsmodels=0.11.1=py37h7b6447c_0
169 | - subread=2.0.0=hed695b0_0
170 | - tbb=2020.0=hfd86e86_0
171 | - tk=8.6.10=hbc83047_0
172 | - tornado=6.0.4=py37h7b6447c_1
173 | - tqdm=4.48.2=py_0
174 | - urllib3=1.25.10=py_0
175 | - wheel=0.34.2=py37_0
176 | - xz=5.2.5=h7b6447c_0
177 | - zlib=1.2.11=h7b6447c_3
178 | - zstd=1.4.5=h9ceee32_0
179 |
--------------------------------------------------------------------------------
/envs/orthofinder.yml:
--------------------------------------------------------------------------------
1 | name: orthofinder
2 | channels:
3 | - bioconda
4 | - conda-forge
5 | - defaults
6 | dependencies:
7 | - _libgcc_mutex=0.1=conda_forge
8 | - _openmp_mutex=4.5=1_gnu
9 | - blast=2.10.1=pl526he19e7b1_1
10 | - boost-cpp=1.70.0=h7b93d67_3
11 | - bzip2=1.0.8=h516909a_3
12 | - c-ares=1.16.1=h516909a_3
13 | - ca-certificates=2020.6.20=hecda079_0
14 | - certifi=2019.11.28=py27h8c360ce_1
15 | - curl=7.71.1=he644dc0_5
16 | - diamond=2.0.4=h56fc30b_0
17 | - dlcpar=1.0=py_2
18 | - entrez-direct=13.8=pl526h375a9b1_0
19 | - expat=2.2.9=he1b5a44_2
20 | - fastme=2.1.5=0
21 | - fasttree=2.1.10=h516909a_4
22 | - gawk=5.1.0=h516909a_0
23 | - gettext=0.19.8.1=hc5be6a0_1002
24 | - icu=67.1=he1b5a44_0
25 | - iqtree=2.0.3=h176a8bc_0
26 | - krb5=1.17.1=hfafb76e_3
27 | - ld_impl_linux-64=2.35=h769bd43_9
28 | - libblas=3.8.0=17_openblas
29 | - libcblas=3.8.0=17_openblas
30 | - libcurl=7.71.1=hcdd3856_5
31 | - libedit=3.1.20191231=he28a2e2_2
32 | - libev=4.33=h516909a_1
33 | - libffi=3.2.1=he1b5a44_1007
34 | - libgcc=7.2.0=h69d50b8_2
35 | - libgcc-ng=9.3.0=h24d8f2e_16
36 | - libgfortran-ng=7.5.0=hdf63c60_16
37 | - libgomp=9.3.0=h24d8f2e_16
38 | - libidn2=2.3.0=h516909a_0
39 | - liblapack=3.8.0=17_openblas
40 | - libnghttp2=1.41.0=h8cfc5f6_2
41 | - libopenblas=0.3.10=pthreads_hb3c22a3_4
42 | - libssh2=1.9.0=hab1572f_5
43 | - libstdcxx-ng=9.3.0=hdf63c60_16
44 | - libunistring=0.9.10=h14c3975_0
45 | - llvm-meta=7.0.0=0
46 | - lz4-c=1.9.2=he1b5a44_3
47 | - mafft=7.471=h516909a_0
48 | - mcl=14.137=pl526h516909a_5
49 | - mmseqs2=12.113e3=h2d02072_0
50 | - muscle=3.8.1551=hc9558a2_5
51 | - ncurses=6.2=he1b5a44_1
52 | - numpy=1.16.5=py27h95a1406_0
53 | - openmp=7.0.0=h2d50403_0
54 | - openssl=1.1.1g=h516909a_1
55 | - orthofinder=2.2.7=0
56 | - pcre=8.44=he1b5a44_0
57 | - perl=5.26.2=h516909a_1006
58 | - perl-app-cpanminus=1.7044=pl526_1
59 | - perl-archive-tar=2.32=pl526_0
60 | - perl-base=2.23=pl526_1
61 | - perl-business-isbn=3.004=pl526_0
62 | - perl-business-isbn-data=20140910.003=pl526_0
63 | - perl-carp=1.38=pl526_3
64 | - perl-common-sense=3.74=pl526_2
65 | - perl-compress-raw-bzip2=2.087=pl526he1b5a44_0
66 | - perl-compress-raw-zlib=2.087=pl526hc9558a2_0
67 | - perl-constant=1.33=pl526_1
68 | - perl-data-dumper=2.173=pl526_0
69 | - perl-digest-hmac=1.03=pl526_3
70 | - perl-digest-md5=2.55=pl526_0
71 | - perl-encode=2.88=pl526_1
72 | - perl-encode-locale=1.05=pl526_6
73 | - perl-exporter=5.72=pl526_1
74 | - perl-exporter-tiny=1.002001=pl526_0
75 | - perl-extutils-makemaker=7.36=pl526_1
76 | - perl-file-listing=6.04=pl526_1
77 | - perl-file-path=2.16=pl526_0
78 | - perl-file-temp=0.2304=pl526_2
79 | - perl-html-parser=3.72=pl526h6bb024c_5
80 | - perl-html-tagset=3.20=pl526_3
81 | - perl-html-tree=5.07=pl526_1
82 | - perl-http-cookies=6.04=pl526_0
83 | - perl-http-daemon=6.01=pl526_1
84 | - perl-http-date=6.02=pl526_3
85 | - perl-http-message=6.18=pl526_0
86 | - perl-http-negotiate=6.01=pl526_3
87 | - perl-io-compress=2.087=pl526he1b5a44_0
88 | - perl-io-html=1.001=pl526_2
89 | - perl-io-socket-ssl=2.066=pl526_0
90 | - perl-io-zlib=1.10=pl526_2
91 | - perl-json=4.02=pl526_0
92 | - perl-json-xs=2.34=pl526h6bb024c_3
93 | - perl-libwww-perl=6.39=pl526_0
94 | - perl-list-moreutils=0.428=pl526_1
95 | - perl-list-moreutils-xs=0.428=pl526_0
96 | - perl-lwp-mediatypes=6.04=pl526_0
97 | - perl-lwp-protocol-https=6.07=pl526_4
98 | - perl-mime-base64=3.15=pl526_1
99 | - perl-mozilla-ca=20180117=pl526_1
100 | - perl-net-http=6.19=pl526_0
101 | - perl-net-ssleay=1.88=pl526h90d6eec_0
102 | - perl-ntlm=1.09=pl526_4
103 | - perl-parent=0.236=pl526_1
104 | - perl-pathtools=3.75=pl526h14c3975_1
105 | - perl-scalar-list-utils=1.52=pl526h516909a_0
106 | - perl-socket=2.027=pl526_1
107 | - perl-storable=3.15=pl526h14c3975_0
108 | - perl-test-requiresinternet=0.05=pl526_0
109 | - perl-time-local=1.28=pl526_1
110 | - perl-try-tiny=0.30=pl526_1
111 | - perl-types-serialiser=1.0=pl526_2
112 | - perl-uri=1.76=pl526_0
113 | - perl-www-robotrules=6.02=pl526_3
114 | - perl-xml-namespacesupport=1.12=pl526_0
115 | - perl-xml-parser=2.44_01=pl526ha1d75be_1002
116 | - perl-xml-sax=1.02=pl526_0
117 | - perl-xml-sax-base=1.09=pl526_0
118 | - perl-xml-sax-expat=0.51=pl526_3
119 | - perl-xml-simple=2.25=pl526_1
120 | - perl-xsloader=0.24=pl526_0
121 | - pip=20.1.1=pyh9f0ad1d_0
122 | - python=2.7.15=h5a48372_1011_cpython
123 | - python_abi=2.7=1_cp27mu
124 | - raxml=8.2.12=h516909a_2
125 | - readline=8.0=he28a2e2_2
126 | - scipy=1.2.1=py27h921218d_2
127 | - setuptools=44.0.0=py27_0
128 | - sqlite=3.33.0=h4cf870e_0
129 | - tk=8.6.10=hed695b0_0
130 | - wget=1.20.1=h22169c7_0
131 | - wheel=0.35.1=pyh9f0ad1d_0
132 | - xz=5.2.5=h516909a_1
133 | - zlib=1.2.11=h516909a_1009
134 | - zstd=1.4.5=h6597ccf_2
135 |
--------------------------------------------------------------------------------
/envs/sgRNAcas9.yml:
--------------------------------------------------------------------------------
1 | name: sgRNAcas9
2 | channels:
3 | - bioconda
4 | - conda-forge
5 | - defaults
6 | dependencies:
7 | - _libgcc_mutex=0.1=conda_forge
8 | - _openmp_mutex=4.5=1_gnu
9 | - alsa-lib=1.2.3=h516909a_0
10 | - ca-certificates=2020.6.20=hecda079_0
11 | - cairo=1.16.0=h3fc0475_1005
12 | - certifi=2020.6.20=py38h32f6830_0
13 | - fontconfig=2.13.1=h1056068_1002
14 | - freetype=2.10.2=he06d7ca_0
15 | - gettext=0.19.8.1=hc5be6a0_1002
16 | - giflib=5.2.1=h516909a_2
17 | - glib=2.66.1=h680cd38_0
18 | - graphite2=1.3.13=he1b5a44_1001
19 | - harfbuzz=2.7.2=hee91db6_0
20 | - icu=67.1=he1b5a44_0
21 | - jpeg=9d=h516909a_0
22 | - lcms2=2.11=hbd6801e_0
23 | - ld_impl_linux-64=2.35=h769bd43_9
24 | - libffi=3.2.1=he1b5a44_1007
25 | - libgcc-ng=9.3.0=h5dbcf3e_17
26 | - libgomp=9.3.0=h5dbcf3e_17
27 | - libiconv=1.16=h516909a_0
28 | - libpng=1.6.37=hed695b0_2
29 | - libstdcxx-ng=9.3.0=h2ae2ef3_17
30 | - libtiff=4.1.0=hc7e4089_6
31 | - libuuid=2.32.1=h14c3975_1000
32 | - libwebp-base=1.1.0=h516909a_3
33 | - libxcb=1.13=h14c3975_1002
34 | - libxml2=2.9.10=h68273f3_2
35 | - lz4-c=1.9.2=he1b5a44_3
36 | - ncurses=6.2=he1b5a44_1
37 | - openjdk=11.0.8=hacce0ff_0
38 | - openssl=1.1.1h=h516909a_0
39 | - pcre=8.44=he1b5a44_0
40 | - perl=5.30.3=h516909a_1
41 | - pip=20.2.3=py_0
42 | - pixman=0.38.0=h516909a_1003
43 | - pthread-stubs=0.4=h14c3975_1001
44 | - python=3.8.5=h1103e12_9_cpython
45 | - python_abi=3.8=1_cp38
46 | - readline=8.0=he28a2e2_2
47 | - seqmap=1.0.13=hc9558a2_1
48 | - setuptools=49.6.0=py38h32f6830_1
49 | - sqlite=3.33.0=h4cf870e_0
50 | - tk=8.6.10=hed695b0_0
51 | - wheel=0.35.1=pyh9f0ad1d_0
52 | - xorg-fixesproto=5.0=h14c3975_1002
53 | - xorg-inputproto=2.3.2=h14c3975_1002
54 | - xorg-kbproto=1.0.7=h14c3975_1002
55 | - xorg-libice=1.0.10=h516909a_0
56 | - xorg-libsm=1.2.3=h84519dc_1000
57 | - xorg-libx11=1.6.12=h516909a_0
58 | - xorg-libxau=1.0.9=h14c3975_0
59 | - xorg-libxdmcp=1.1.3=h516909a_0
60 | - xorg-libxext=1.3.4=h516909a_0
61 | - xorg-libxfixes=5.0.3=h516909a_1004
62 | - xorg-libxi=1.7.10=h516909a_0
63 | - xorg-libxrender=0.9.10=h516909a_1002
64 | - xorg-libxtst=1.2.3=h516909a_1002
65 | - xorg-recordproto=1.14.2=h516909a_1002
66 | - xorg-renderproto=0.11.1=h14c3975_1002
67 | - xorg-xextproto=7.3.0=h14c3975_1002
68 | - xorg-xproto=7.0.31=h14c3975_1007
69 | - xz=5.2.5=h516909a_1
70 | - zlib=1.2.11=h516909a_1009
71 | - zstd=1.4.5=h6597ccf_2
72 |
--------------------------------------------------------------------------------
/scripts/DE_analysis_edgeR_script.R:
--------------------------------------------------------------------------------
1 | # https://github.com/rnnh/bioinfo-notebook.git
2 |
3 | # Loading required libraries
4 | library(limma)
5 | library(edgeR)
6 |
7 | # Changing working directory
8 | # Setting the working directory to the directory which contains this script
9 | if (exists("RStudio.Version")){
10 |   setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
11 | } else {
12 |   setwd(getSrcDirectory(function() {})[1]) # getSrcDirectory() has no default for its argument; a dummy closure supplies the source reference
13 | }
14 |
15 | # Reading in the feature count file as "counts.df"
16 | counts.df <- read.csv("../data/featCounts_S_cere_20200331.csv")
17 |
18 | # Printing the start of the counts.df object in R...
19 | head(counts.df)
20 |
21 | # Using the "Geneid" column to set the rownames
22 | rownames(counts.df) <- counts.df$Geneid
23 |
24 | # Removing the "Geneid" column
25 | counts.df$Geneid <- NULL
26 |
27 | # Printing the start of the counts.df object in R...
28 | head(counts.df)
29 |
30 | # Reading in the design table as "design.df"
31 | design.df <- read.csv("../data/design_table.csv", fileEncoding="UTF-8-BOM") # fileEncoding strips a Windows byte-order mark, if present
32 |
33 | # Printing the design.df object in R (print() shows the full table)...
34 | print(design.df)
35 |
36 | # Subsetting gene counts according to experimental condition (columns are SRA run accessions; presumably replicates per condition — confirm against design.df)
37 | counts_standard.df <- counts.df[,c("SRR8933535", "SRR8933536", "SRR8933537")]
38 | counts_anaerobic.df <- counts.df[,c("SRR8933506", "SRR8933511", "SRR8933512")]
39 | counts_high_temp.df <- counts.df[,c("SRR8933532", "SRR8933533", "SRR8933534")]
40 | counts_low_pH.df <- counts.df[,c("SRR8933530", "SRR8933531", "SRR8933539")]
41 | counts_pressure.df <- counts.df[,c("SRR8933509", "SRR8933510", "SRR8933538")]
42 |
43 | # Printing the structure of the gene counts set and subsets
44 | str(counts.df)
45 | str(counts_standard.df)
46 | str(counts_anaerobic.df)
47 | str(counts_high_temp.df)
48 | str(counts_low_pH.df)
49 | str(counts_pressure.df)
50 |
51 | # Defining function "RSD.test()"
52 | RSD.test <- function(dataframe){
53 |   # Tests whether the relative standard deviation (RSD = sd / mean) is less
54 |   # than or equal to one for each row in a data frame of numeric counts.
55 |   # Adds the result as a new factor variable called "RSD.test": TRUE if the
56 |   # row's RSD <= 1 or the RSD is undefined (e.g. an all-zero row), FALSE
57 |   # otherwise (RSD > 1).
58 |   RSD_tests = logical(nrow(dataframe))
59 |   for (row_index in seq_len(nrow(dataframe))){
60 |     row = as.numeric(dataframe[row_index,])
61 |     RSD = sd(row) / mean(row)
62 |     RSD_tests[row_index] = RSD <= 1 || is.na(RSD)
63 |   }
64 |   # Levels fixed explicitly: relabelling inferred levels via levels() <-
65 |   # c(FALSE, TRUE) mislabelled every row as FALSE when only one level existed
66 |   dataframe$RSD.test <- factor(RSD_tests, levels = c(FALSE, TRUE))
67 |   return(dataframe)
68 | }
69 |
70 | # Applying RSD.test() to gene count subsets
71 | counts_standard.df <- RSD.test(counts_standard.df)
72 | counts_anaerobic.df <- RSD.test(counts_anaerobic.df)
73 | counts_high_temp.df <- RSD.test(counts_high_temp.df)
74 | counts_low_pH.df <- RSD.test(counts_low_pH.df)
75 | counts_pressure.df <- RSD.test(counts_pressure.df)
76 |
77 | # Printing the structure of the gene counts subsets
78 | str(counts_standard.df)
79 | str(counts_anaerobic.df)
80 | str(counts_high_temp.df)
81 | str(counts_low_pH.df)
82 | str(counts_pressure.df)
83 |
84 | # Creating list of genes which failed RSD test in any condition subset
85 | RSD_failed_genes <- rownames(counts_standard.df[
86 |   which(counts_standard.df$RSD.test == FALSE),])
87 | RSD_failed_genes <- append(RSD_failed_genes, rownames(counts_anaerobic.df[
88 |   which(counts_anaerobic.df$RSD.test == FALSE),]))
89 | RSD_failed_genes <- append(RSD_failed_genes, rownames(counts_high_temp.df[
90 |   which(counts_high_temp.df$RSD.test == FALSE),]))
91 | RSD_failed_genes <- append(RSD_failed_genes, rownames(counts_low_pH.df[
92 |   which(counts_low_pH.df$RSD.test == FALSE),]))
93 | RSD_failed_genes <- append(RSD_failed_genes, rownames(counts_pressure.df[
94 |   which(counts_pressure.df$RSD.test == FALSE),]))
95 | RSD_failed_genes <- unique(RSD_failed_genes) # removing duplicate gene names
96 | length(RSD_failed_genes)
97 |
98 | # Filtering gene counts: keeping only genes that passed in every subset
99 | filtered_counts.df <- counts.df[
100 |   which(!rownames(counts.df) %in% RSD_failed_genes),]
101 |
102 | # Printing the structure of the filtered gene counts
103 | str(filtered_counts.df)
104 |
105 | # Checking that gene counts were correctly filtered (should print TRUE)
106 | nrow(counts.df) - length(RSD_failed_genes) == nrow(filtered_counts.df)
107 |
108 | # Removing redundant objects from R environment
109 | rm(counts_anaerobic.df, counts_high_temp.df, counts_low_pH.df,
110 |    counts_pressure.df, counts_standard.df, counts.df, RSD_failed_genes)
111 |
112 | # Creating a DGEList object using the filtered gene counts
113 | counts.DGEList <- DGEList(counts = filtered_counts.df,
114 |                           genes = rownames(filtered_counts.df))
115 |
116 | # Printing the design table
117 | print(design.df)
118 |
119 | # Confirming samples are in the same order in the gene counts and design table
120 | summary(colnames(filtered_counts.df) == design.df$run)
121 |
122 | # Adding grouping information (experimental condition) to the DGEList object
123 | counts.DGEList$samples$group <- as.factor(design.df$condition)
124 |
125 | # Printing counts.DGEList
126 | counts.DGEList
127 |
128 | # Summary of the counts.DGEList object: number of genes, number of samples
129 | dim(counts.DGEList)
130 |
131 | # Creating an object to filter genes with low expression
132 | counts.keep <- filterByExpr(counts.DGEList)
133 | summary(counts.keep)
134 |
135 | # Filtering lowly expressed genes
136 | counts.DGEList <- counts.DGEList[counts.keep, , keep.lib.sizes = FALSE]
137 | dim(counts.DGEList)
138 |
139 | # Confirming that the number of genes in counts.DGEList is the same as the
140 | # number of TRUE values in counts.keep (should print TRUE)
141 | length(counts.keep[counts.keep == TRUE]) == dim(counts.DGEList)[1]
142 |
143 | # Removing counts.keep
144 | rm(counts.keep)
145 |
146 | # Printing the normalisation factors for the libraries
147 | counts.DGEList$samples$norm.factors
148 |
149 | # Calculating normalisation factors and applying them to counts.DGEList
150 | counts.DGEList <- calcNormFactors(counts.DGEList)
151 | counts.DGEList$samples$norm.factors
152 |
153 | # Estimating common dispersion and tagwise dispersion
154 | condition_ <- design.df$condition # condition factor for the design matrix
155 | counts.DGEList <- estimateDisp(counts.DGEList,
156 |                                design = model.matrix(~condition_))
157 |
158 | counts.DGEList # printing the DGEList with dispersion estimates added
159 |
160 | condition_ # printing the experimental condition factor
161 |
162 | # Exact tests for pairwise differences: each condition versus "standard"
163 | std_anaerobic.DGEExact <- exactTest(counts.DGEList, pair = c("standard",
164 |                                                              "anaerobic"))
165 | std_salt.DGEExact <- exactTest(counts.DGEList, pair = c("standard",
166 |                                                         "osmotic_pressure"))
167 | std_temp.DGEExact <- exactTest(counts.DGEList, pair = c("standard",
168 |                                                         "high_temp"))
169 | std_pH.DGEExact <- exactTest(counts.DGEList, pair = c("standard",
170 |                                                       "low_pH"))
171 |
172 | # Extracting most differentially expressed genes from exact tests
173 | std_anaerobic.topTags <- topTags(std_anaerobic.DGEExact)
174 | std_salt.topTags <- topTags(std_salt.DGEExact)
175 | std_temp.topTags <- topTags(std_temp.DGEExact)
176 | std_pH.topTags <- topTags(std_pH.DGEExact)
177 |
178 | # Printing the most differentially expressed genes
179 | std_anaerobic.topTags
180 | std_salt.topTags
181 | std_temp.topTags
182 | std_pH.topTags
183 |
184 | # Printing session information
185 | sessionInfo()
186 |
--------------------------------------------------------------------------------
/scripts/UniProt_downloader.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 | # https://github.com/rnnh/bioinfo-notebook.git
3 |
4 | # Help/usage text
5 | usage="$(basename "$0") [-h|--help] [-p|--processors n -o|--output] -i|--input \n
6 | \n
7 | This script takes a list of UniProt primary accession numbers (*.list), and \n
8 | downloads the corresponding protein sequences from UniProt as a FASTA amino \n
9 | acid (.faa) file.\n
10 | \n
11 | This list can be generated by searching UniProtKB for a desired term (e.g. \n
12 | 'taxonomy:147537' for the Saccharomycotina subphylum), selecting 'Download' \n
13 | and 'Format: List' to download the accession numbers of the corresponding \n
14 | results.\n
15 | \n
16 | arguments: \n
17 | \t -h | --help\t\t show this help text and exit \n
18 | \t -i | --input\t\t the list of UniProt proteins to download \n
19 | \t -p | --processors\t optional: set the number (n) of processors to \n
20 | \t\t\t\t use (default: 1) \n
21 | \t -o | --output\t\t optional: name of the output .faa file \n
22 | \t\t\t\t (default: uniprot_{date}.faa) \n
23 | "
24 |
25 | PROCESSORS=1
26 | OUTPUT=uniprot_$(date +%Y%m%d).faa
27 |
28 | # Iterating through the input arguments with a while loop
29 | while (( "$#" )); do
30 | case "$1" in
31 |     -h|--help)
32 |       echo -e "$usage" # quoted: unquoted $usage is word-split and globs '*.list'
33 |       exit
34 |       ;;
35 | -i|--input)
36 | INPUT=$2
37 | shift 2
38 | ;;
39 | -p|--processors)
40 | PROCESSORS=$2
41 | shift 2
42 | ;;
43 | -o|--output)
44 | OUTPUT=$2
45 | shift 2
46 | ;;
47 | --) # end argument parsing
48 | shift
49 | break
50 | ;;
51 |     -*|--*) # unsupported flags
52 |       echo -e "ERROR: $1 is an invalid option. \n" >&2
53 |       echo -e "$usage" # quoted to prevent word-splitting and glob expansion
54 |       exit 1
55 |       ;;
56 | esac
57 | done
58 |
59 | if test -z "$INPUT";
60 | then
61 |     echo -e "ERROR: No input file given. \n" >&2
62 |     echo -e "$usage" # quoted to prevent word-splitting and glob expansion
63 |     exit 1
64 | fi
65 |
66 | echo "$(date +%Y/%m/%d\ %H:%M) Downloading UniProt sequences..."
67 |
68 | cat $INPUT | \
69 | xargs -n 1 -P $PROCESSORS -I % curl -s https://www.uniprot.org/uniprot/%.fasta \
70 | >> $OUTPUT
71 |
72 | echo "$(date +%Y/%m/%d\ %H:%M) Script finished."
73 |
--------------------------------------------------------------------------------
/scripts/annotated_snps_filter.R:
--------------------------------------------------------------------------------
1 | # https://github.com/rnnh/bioinfo-notebook.git
2 |
3 | # Aim ==========================================================================
4 |
5 | # This script cross-references annotated SNP files created using
6 | # annotating_snps.R. It takes two files created using this script, and returns
7 | # unique SNPs for each file. If a SNP in File 1 is not found at the same
8 | # position on the same sequence as File 2, it is returned as a unique SNP, and
9 | # vice versa. These unique SNPs are then written to new .tsv files.
10 |
11 | # Selecting files ==============================================================
12 |
13 | # - Assign the name of the first annotated SNP file to be filtered to
14 | # 'annotated_SNP_file_1'
15 | # - Assign the name of the second annotated SNP file to be filtered to
16 | # 'annotated_SNP_file_2'
17 | # - These files should be in the `~/bioinfo-notebook/data/` directory.
18 | # - Optional: the name of the output files can be assigned on lines 109 and
19 | # 115 respectively.
20 |
21 | annotated_SNP_file_1 <- "<.tsv File name here>"
22 | annotated_SNP_file_2 <- "<.tsv File name here>"
23 |
24 | # Setup ========================================================================
25 |
26 | # Setting the working directory
27 | setwd("~/bioinfo-notebook/data")
28 |
29 | annotated_SNP_file_1 <- read.table(
30 | annotated_SNP_file_1,
31 | stringsAsFactors = FALSE, header = TRUE)
32 |
33 | annotated_SNP_file_2 <- read.table(
34 | annotated_SNP_file_2,
35 | stringsAsFactors = FALSE, header = TRUE)
36 |
37 | # Finding rows in common between annotated SNP data frames =====================
38 |
39 | # This needs to be carried out multiple times because the number of rows in
40 | # each annotate SNP file differ. Two files may have a SNP in common, but it may
41 | # not occur at the same row number
42 |
43 | # Loops in this section are structured as follows:
44 | # For every row index in a given data frame...
45 | # Get the row using the row index
46 | # If the SNP position for a given row index is in the other data frame...
47 | # Get the indices of the matching rows
48 | # For each index in the indices of matching row...
49 | # If the sequence names are the same for the matching rows...
50 | # Add that index to the matching row values
51 | # Keep only the unique indices
52 |
53 | # Creating empty integer values for the matching SNPs
54 | file_1_SNPs_common_with_file_2 <- integer()
55 | file_2_SNPs_common_with_file_1 <- integer()
56 |
57 | # Rows in common between file 1 and file 2
58 | for (index in 1:nrow(annotated_SNP_file_2)){
59 | row = annotated_SNP_file_2[index, ]
60 | if (row$POS %in% annotated_SNP_file_1$POS){
61 | matching_row_indices = which(annotated_SNP_file_1$POS == row$POS)
62 | for (mr_index in matching_row_indices){
63 | if (annotated_SNP_file_1$sequence[mr_index] == row$sequence){
64 | file_1_SNPs_common_with_file_2 <- c(file_1_SNPs_common_with_file_2,
65 | mr_index)
66 | file_1_SNPs_common_with_file_2 <- unique(file_1_SNPs_common_with_file_2)
67 | }
68 | }
69 | }
70 | }
71 |
72 | # Rows in common between file 2 and file 1
73 | for (index in 1:nrow(annotated_SNP_file_1)){
74 | row = annotated_SNP_file_1[index, ]
75 | if (row$POS %in% annotated_SNP_file_2$POS){
76 | matching_row_indices = which(annotated_SNP_file_2$POS == row$POS)
77 | for (mr_index in matching_row_indices){
78 | if (annotated_SNP_file_2$sequence[mr_index] == row$sequence){
79 | file_2_SNPs_common_with_file_1 <- c(file_2_SNPs_common_with_file_1,
80 | mr_index)
81 | file_2_SNPs_common_with_file_1 <- unique(file_2_SNPs_common_with_file_1)
82 | }
83 | }
84 | }
85 | }
86 |
87 | # Filtering SNPs in common between annotated SNP data frames ===================
88 |
89 | # The matching row values produced by the loops in the previous section are
90 | # used to subset each data frame: this is done by selecting non-matching rows
91 |
92 | annotated_SNP_file_1_unique.df <- annotated_SNP_file_1[-file_1_SNPs_common_with_file_2, ]
93 | annotated_SNP_file_2_unique.df <- annotated_SNP_file_2[-file_2_SNPs_common_with_file_1, ]
94 |
95 | # Checking that the correct number of rows were filtered =======================
96 |
97 | # If the correct number of rows were filtered, the following statements should
98 | # all return TRUE
99 |
100 | nrow(annotated_SNP_file_2) == nrow(annotated_SNP_file_2_unique.df) +
101 | length(file_2_SNPs_common_with_file_1)
102 |
103 | nrow(annotated_SNP_file_1) == nrow(annotated_SNP_file_1_unique.df) +
104 | length(file_1_SNPs_common_with_file_2)
105 |
106 | # Writing data frames to tab-separated values (.tsv) files =====================
107 |
108 | write.table(annotated_SNP_file_1_unique.df,
109 |             file = "annotated_SNP_file_1_filtered.tsv", # fixed: was c(...) with unbalanced parens; the original variable now holds a data frame, not a file name
110 |             fileEncoding = "UTF-8",
111 |             sep = "\t",
112 |             row.names = FALSE)
113 | 
114 | write.table(annotated_SNP_file_2_unique.df,
115 |             file = "annotated_SNP_file_2_filtered.tsv", # fixed: was c(...) with unbalanced parens
116 |             fileEncoding = "UTF-8",
117 |             sep = "\t",
118 |             row.names = FALSE)
119 |
120 | # Exiting ======================================================================
121 | quit(save = "no")
122 |
--------------------------------------------------------------------------------
/scripts/annotating_snps.R:
--------------------------------------------------------------------------------
1 | # https://github.com/rnnh/bioinfo-notebook.git
2 |
3 | # Aim ==========================================================================
4 |
5 | # The aim of this script is to cross-reference annotations of genome assemblies
6 | # with VCF files containing SNPs of sequencing reads aligned against those
7 | # genome assemblies. If a SNP falls within- or upstream of- an annotated
8 | # genome feature (start codon, stop codon, CDS, etc.), the script will return
9 | # that feature along with the SNP.
10 |
11 | # Selecting files and parameters ===============================================
12 |
13 | # - The VCF and GFF files to be cross-referenced are specified in this
14 | # section. For this script to work, these files need to use the same
15 | # sequence names: e.g. if the first sequence in the VCF is called "chrI",
16 | # there should be a corresponding sequence called "chrI" in the GFF file.
17 | # - The VCF and GFF files should be in the directory
18 | # '~/bioinfo-notebook/data/'.
19 | # - The number of lines in the VCF file header should be specified in the
20 | # 'VCF_header.int' variable. This is the number of lines that begin with '#'
21 | # in the VCF file.
22 | # - The variable 'upstream.int' is used to determine how far upstream from an
23 | # annotated feature a SNP can be. This can be set to 0 if you do not want
24 | # upstream SNPs to be considered. Setting it to 1000 will mean that SNPs
25 | # up to 1,000 bases/1kb upstream from a feature will be annotated.
26 | # - The variable 'output_name' is used to specify the name of the output file,
27 | # which should end in '.tsv' as it will be a tab-separated values text file.
28 |
29 | GFF_file <- "<.gff File name here>"
30 | VCF_file <- "<.vcf File name here>"
31 | VCF_header.int <- as.integer("")
32 | upstream.int <- as.integer("%
80 | select(-ID, -FORMAT, -FILTER) %>%
81 | filter(POS >= (start - upstream) &
82 | POS <= end)
83 |
84 | # Removing redundant data frames
85 | rm(genome_annotation.df, SNPs.df)
86 |
87 | # Ordering filtered data frame of SNPs with annotations ========================
88 | attach(SNPs_with_annotations.df)
89 | SNPs_with_annotations.df <- SNPs_with_annotations.df[order(sequence, start, end), ]
90 | detach(SNPs_with_annotations.df)
91 |
92 | # Exporting SNPs with annotations to tab-separated value (.tsv) file ===========
93 | write.table(SNPs_with_annotations.df,
94 | file = output_name,
95 | fileEncoding = "UTF-8",
96 | sep = "\t",
97 | row.names = FALSE)
98 |
99 | # Exiting ======================================================================
100 | quit(save = "no")
101 |
--------------------------------------------------------------------------------
/scripts/combining_featCount_tables.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # https://github.com/rnnh/bioinfo-notebook.git
3 | # -*- coding: utf-8 -*-
4 | """
5 | Created on Wed Mar 18 12:08:41 2020
6 |
7 | @author: ronan
8 |
9 | This script creates a single CSV feature count table from the featureCounts
10 | output tables in the target directory.
11 |
12 | This combined feature count table can be used for differential expression
13 | analysis (e.g. using DESeq2 or edgeR in R).
14 | """
15 |
16 | # Loading required libraries
17 | from time import gmtime, strftime
18 | import pandas as pd
19 | import argparse
20 | import sys
21 | import os
22 |
23 | # Parsing command line arguments
24 | parser = argparse.ArgumentParser(
25 | description = "Combines the featureCounts output tables in the target \
26 | directory.")
27 |
28 | # -d PATH -o CUSTOM_FILENAME
29 | parser.add_argument("-d", "--directory", dest = "path",
30 | help = "path to target directory. \
31 | Default: current directory")
32 | parser.add_argument("-o", "--output", dest ="custom_filename",
33 | help = "output filename.\
34 | Default: featCounts_{species}_{date}.csv")
35 |
36 | args = parser.parse_args()
37 |
38 | # Changing to the target directory
39 | if args.path is not None:
40 | path = args.path
41 | else:
42 | path = os.getcwd()
43 | os.chdir(path)
44 |
45 | # Creating variables
46 | fixed_headers = ["Geneid", "Chromosome", "Start", "End", "Strand", "Length"]
47 | target_file_prefix = "feature_counts_"
48 | date = strftime("%Y%m%d", gmtime())
49 | counts_table = pd.DataFrame()
50 | output_filename = str()
51 | target_file_count = 0
52 | species_name = str()
53 | srr = str()
54 |
55 | # Iterating through files in target directory, combining feature counts
56 | # into one DataFrame object ("counts_table")
57 | for filename in os.listdir():
58 | if filename.startswith(target_file_prefix):
59 | target_file_count = target_file_count + 1
60 | filename_list = filename.split("_")
61 | srr = filename_list[2]
62 | species_name = filename_list[3] + "_" + filename_list[4]
63 | featCounts_df = pd.read_csv(filename, sep = "\t",
64 | lineterminator = '\n', skiprows = 1,
65 | header = 0)
66 | featCounts_headers = fixed_headers.copy()
67 | featCounts_headers += [srr]
68 | featCounts_df.columns = featCounts_headers
69 | gene_ids = featCounts_df["Geneid"]
70 | counts = featCounts_df[srr]
71 | # Add the gene IDs and counts to the counts_table DataFrame as columns
72 | # if it's empty; otherwise add the counts only
73 | if counts_table.empty:
74 | counts_table = pd.concat([gene_ids, counts], axis = 1,
75 | sort = False)
76 | else:
77 | counts_table = pd.concat([counts_table, counts], axis = 1,
78 | sort = False)
79 | del featCounts_headers
80 |
81 | if target_file_count == 0:
82 | # Exiting script if there are no target files in the target directory
83 | print("ERROR: There are no featureCount files in the target directory. \n")
84 | parser.print_help(sys.stderr)
85 | exit
86 | else:
87 | # Exporting counts_table DataFrame as a CSV file
88 | if args.custom_filename is not None:
89 | output_filename = args.custom_filename
90 | else:
91 | output_filename = "featCounts_" + species_name + "_" + date + ".csv"
92 | counts_table.to_csv(output_filename, index = False)
93 |
--------------------------------------------------------------------------------
/scripts/fastq-dump_to_featureCounts.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 | # https://github.com/rnnh/bioinfo-notebook.git
3 |
4 | # Help/usage text
5 | usage="$(basename "$0") [options] -a|--annotation \
6 | -f|--fasta \n
7 | \n
8 | This script downloads FASTQ reads from NCBI's SRA, aligns them to an annotated \n
9 | genome using bowtie2, and generates gene count table(s) using featureCounts.\n
10 | It can take a single SRR ID as an input, or multiple SRR IDs separated by\n
11 | spaces.\n
12 | \n
13 | Required arguments: \n
14 | \t -a | --annotation\t input genome annotation file \n
15 | \t -f | --fasta\t\t input FASTA file for annotated genome \n
16 | \t SRR ID(s)\t\t Sequence Read Archive Run ID(s) (SRR...) \n
17 | \n
18 | Optional arguments: \n
19 | \t -h | --help\t\t show this help text and exit \n
20 | \t -p | --processors\t number (n) of processors to use (default: 1) \n
21 | \t --fastq-dump\t\t use 'fastq-dump' instead of the 'fasterq-dump'\n
22 | \t --verbose\t\t make output of script more verbose\n
23 | \t --removetemp\t\t remove read and alignment files once they are\n
24 | \t \t\t\t no longer needed (minimises disk space needed) \n
25 | \t --log\t\t\t redirect terminal output to log file
26 | "
27 |
28 | # Setting FASTQDUMP to 0
29 | # This will be changed to "1" if --fastq-dump is given as an argument,
30 | # resulting in fastq-dump being used instead of the default fasterq-dump
31 | FASTQDUMP=0
32 |
33 | # Setting VERBOSE to 0
34 | # This will be changed to "1" if --verbose is given as an argument,
35 | # resulting in more verbose script output
36 | VERBOSE=0
37 |
38 | # Setting REMOVETEMP to 0
39 | # This will be changed to "1" if --removetemp is given as an argument,
40 | # resulting in *.fastq, *.fastq.gz, *.sam, *.bam and *.tsv.summary, being
41 | # removed once they are no longer needed to create a featureCounts table
42 | REMOVETEMP=0
43 |
44 | # Setting LOG to 0
45 | # This will be changed to "1" if --log is given as an argument,
46 | # resulting in the terminal output from this script being redirected to a log
47 | # file
48 | LOG=0
49 |
50 | # Setting default number of PROCESSORS to use
51 | PROCESSORS=1
52 |
53 | # Creating an empty variable for SRRs to be downloaded and aligned to genome
54 | SRRs=""
55 |
56 | # Print usage instructions if script is called without any arguments
57 | if [ "$1" = "" ] ; then
58 | echo -e "ERROR: please provide input files. \n"
59 | echo -e $usage
60 | exit 1
61 | fi
62 |
63 | # Iterating through the input arguments with a while loop
64 | while (( "$#" )); do
65 | case "$1" in
66 | -h|--help)
67 | echo -e $usage
68 | exit
69 | ;;
70 | -a|--annotation)
71 | ANNOTATION=$2
72 | shift 2
73 | ;;
74 | -f|--fasta)
75 | FASTA=$2
76 | shift 2
77 | ;;
78 | -p|--processors)
79 | PROCESSORS=$2
80 | shift 2
81 | ;;
82 | --fastq-dump)
83 | FASTQDUMP=1
84 | shift
85 | ;;
86 | --verbose)
87 | VERBOSE=1
88 | shift
89 | ;;
90 | --removetemp)
91 | REMOVETEMP=1
92 | shift
93 | ;;
94 | --log)
95 | LOG=1
96 | shift
97 | ;;
98 | --) # end argument parsing
99 | shift
100 | break
101 | ;;
102 | -*|--*) # unsupported flags
103 | echo -e "ERROR: $1 is an invalid option. \n" >&2
104 | echo -e $usage
105 | exit 1
106 | ;;
107 | *) # preserve SRR ID(s) as positional arguments
108 | SRRs="$SRRs $1"
109 | shift
110 | ;;
111 | esac
112 | done
113 |
114 | if [ $LOG -eq "1" ]
115 | then
116 | # Redirecting terminal output to log file
117 | exec 3>&1 4>&2
118 | trap 'exec 2>&4 1>&3' 0 1 2 3
119 | exec 1>fd_to_fC_$(date +%Y%m%d_%H%M%S).log 2>&1
120 | fi
121 |
122 | # Beginning the main body of the script
123 | # The sleep commands ("sleep 1s", "sleep 2s") slow down the script to make
124 | # the output more readable in real-time
125 |
126 | echo -e ~~~~~~~~~~~~~ F A S T Q - D U M P t o F E A T U R E C O U N T S ~~~~~~~~~~~~~
127 | echo Script started: $(date)
128 |
129 | # Loop through the input SRR IDs
130 | for SRR in $SRRs
131 | do
132 | printf "\n"
133 | echo ================================================================================
134 | echo SRR ID: $SRR
135 | sleep 1s
136 | echo Genome annotation: $ANNOTATION
137 | sleep 1s
138 | echo Genome multi-FASTA file: $FASTA
139 | echo ================================================================================
140 | sleep 1s
141 |
142 | if [ $VERBOSE -eq "1" ]
143 | then
144 | printf "\n"
145 | echo Listing files in directory ...
146 | sleep 1s
147 | ls
148 | sleep 2s
149 | fi
150 |
151 |
152 | if [ $FASTQDUMP -eq "1" ]
153 | then
154 | if [ $VERBOSE -eq "1" ]
155 | then
156 | echo Downloading compressed FASTQ reads using fastq-dump...
157 | fi
158 | until fastq-dump --gzip --skip-technical --readids --read-filter pass \
159 | --dumpbase --split-3 --clip $SRR; do
160 | echo fastq-dump failed, retrying in 10 seconds...
161 | sleep 10s
162 | done
163 | else
164 | if [ $VERBOSE -eq "1" ]
165 | then
166 | echo Downloading FASTQ reads using fasterq-dump...
167 | fi
168 | if [ $LOG -eq "0" ]
169 | then
170 | until fasterq-dump --progress --threads $PROCESSORS $SRR; do
171 | echo fasterq-dump failed, retrying in 10 seconds...
172 | rm -r fasterq.tmp.*
173 | sleep 10s
174 | done
175 | else
176 | until fasterq-dump --threads $PROCESSORS $SRR; do
177 | echo fasterq-dump failed, retrying in 10 seconds...
178 | rm -r fasterq.tmp.*
179 | sleep 10s
180 | done
181 | fi
182 | fi
183 |
184 | if [ $VERBOSE -eq "1" ]
185 | then
186 | sleep 1s
187 | echo Listing files in directory after downloading reads...
188 | sleep 1s
189 | ls
190 | sleep 2s
191 | fi
192 |
193 | # Checking if bowtie2 index of FASTA file exists before creating bowtie2 index
194 | # If bowtie2_$FASTA.1.bt2 (one of the bowtie2 index files) does not exist...
195 | if [ ! -f bowtie2_$FASTA.1.bt2 ]
196 | # ...then create the bowtie2_$FASTA index
197 | then
198 | if [ $VERBOSE -eq "1" ]
199 | then
200 | echo Indexing genome FASTA file using bowtie2-build...
201 | sleep 2s
202 | fi
203 | bowtie2-build $FASTA bowtie2_$FASTA
204 | if [ $VERBOSE -eq "1" ]
205 | then
206 | sleep 1s
207 | echo Listing files in directory after running bowtie2-build...
208 | sleep 1s
209 | ls
210 | sleep 2s
211 | fi
212 | # Otherwise, print a message confirming that it exists
213 | else
214 | if [ $VERBOSE -eq "1" ]
215 | then
216 | echo The bowtie2 index bowtie2_$FASTA exists
217 | sleep 1s
218 | fi
219 | fi
220 |
221 | if [ $VERBOSE -eq "1" ]
222 | then
223 | echo Aligning reads to reference genome using bowtie2...
224 | sleep 2s
225 | fi
226 |
227 | # Checking if fastq-dump or fasterq-dump was used, as this will result
228 | # in different filenames
229 | if [ $FASTQDUMP -eq "1" ]
230 | then
231 | bowtie2 -p $PROCESSORS --no-unal -x bowtie2_$FASTA \
232 | -1 $SRR\_pass_1.fastq.gz -2 $SRR\_pass_2.fastq.gz \
233 | -S $SRR\_$FASTA.sam
234 | else
235 | bowtie2 -p $PROCESSORS --no-unal -x bowtie2_$FASTA \
236 | -1 $SRR\_1.fastq -2 $SRR\_2.fastq \
237 | -S $SRR\_$FASTA.sam
238 | fi
239 |
240 |   if [ $REMOVETEMP -eq "1" ] # fixed: undefined $REMOVEREADS and missing space before ]
241 |   then
242 |   	echo Removing .fastq reads...
243 |   	rm -f *.fastq *.fastq.gz
244 |   fi
245 |
246 | if [ $VERBOSE -eq "1" ]
247 | then
248 | sleep 1s
249 | echo Listing files in directory after running bowtie2...
250 | sleep 1s
251 | ls
252 | sleep 2s
253 |
254 | echo Converting alignment from SAM to BAM format using samtools view...
255 | sleep 2s
256 | fi
257 | samtools view -@ $PROCESSORS -Sb $SRR\_$FASTA.sam \
258 | > $SRR\_$FASTA.bam
259 |
260 | if [ $VERBOSE -eq "1" ]
261 | then
262 | sleep 1s
263 | echo Listing files in directory after running samtools view...
264 | sleep 1s
265 | ls
266 | sleep 2s
267 |
268 | echo Sorting the BAM file using samtools sort...
269 | sleep 2s
270 | fi
271 | samtools sort -@ $PROCESSORS $SRR\_$FASTA.bam \
272 | -o sorted_$SRR\_$FASTA.bam
273 |
274 | if [ $VERBOSE -eq "1" ]
275 | then
276 | sleep 1s
277 | echo Listing files in directory after running samtools sort...
278 | sleep 1s
279 | ls
280 | sleep 2s
281 |
282 | echo Generating count table using featureCounts...
283 | sleep 2s
284 | fi
285 | featureCounts -p -s 2 -T $PROCESSORS -a $ANNOTATION \
286 | -o feature_counts_$SRR\_$FASTA.tsv \
287 | sorted_$SRR\_$FASTA.bam
288 |
289 | if [ $VERBOSE -eq "1" ]
290 | then
291 | sleep 1s
292 | echo Listing files in directory after running featureCounts...
293 | sleep 1s
294 | ls
295 | sleep 2s
296 |
297 | echo Results written to feature_counts_$SRR\_$FASTA.tsv
298 | sleep 2s
299 |
300 | echo Head of feature_counts_$SRR\_$FASTA.tsv
301 | sleep 2s
302 | head feature_counts_$SRR\_$FASTA.tsv
303 | sleep 2s
304 |
305 | echo Tail of feature_counts_$SRR\_$FASTA.tsv
306 | sleep 2s
307 | tail feature_counts_$SRR\_$FASTA.tsv
308 | sleep 2s
309 | fi
310 |
311 |
312 | if [ $REMOVETEMP -eq "1" ]
313 | then
314 | echo Removing temporary files...
315 | if [ $FASTQDUMP -eq "1" ]
316 | then
317 | rm *.fastq.gz *.sam *.bam *.tsv.summary
318 | else
319 | rm *.fastq *.sam *.bam *.tsv.summary
320 | fi
321 | fi
322 |
323 | done
324 |
325 | echo Script finished: $(date)
326 |
--------------------------------------------------------------------------------
/scripts/genome_annotation_SwissProt_CDS.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 | # https://github.com/rnnh/bioinfo-notebook.git
3 |
4 | # Help/usage text
5 | usage="$(basename "$0") [-h|--help] [-d|--demo] [-i|--input] \n
6 | [-l|--log -p|--processors n -e|--email] \n
7 | \n
8 | A script to annotate proteins in a genome assembly, using BLASTx with\n
9 | UniProtKB/Swiss-Prot.\n
10 | \n
11 | When run with the arugment '-d' or '--demo' this script...\n
12 | \n
13 | \t 1. Downloads a Saccharomyces cerevisiae S288C genome assembly, and \n
14 | \t the UniProtKB/Swiss-Prot amino acid sequences. \n
15 | \t 2. Creates a BLAST database from the downloaded Swiss-Prot sequences,\n
16 | \t and searches the S. cerevisiae genome against it using BLASTx with an\n
17 | \t E-value threshold of 1e-100. \n
18 | \t 3. Filters the BLASTx results, removing results with less than 90%\n
19 | \t identity.\n
20 | \t 4. Creates a genome annotation GFF file from these BLASTx results.\n
21 | \t 5. Adds information to the genome annotation from UniProt (protein\n
22 | \t names, KeGG ortholog information, EC numbers, etc.) \n
23 | \n
24 | The end result ('S_cere.gff') is an annotation of the coding sequences (CDS) \n
25 | in the S. cerevisiae genome that are described in UniProtKB/Swiss-Prot. \n
26 | \n
27 | This script can also be run with the argument '-i' or '--input', which is used\n
28 | to specify a FASTA nucleotide file (.fasta or .fna) to annotate, instead of\n
29 | the demo sequence. The end result is also an annotation of the CDS in the input\n
30 | sequence based on UniProtKB/Swiss-Prot, called '.gff'.\n
31 | \n
32 | This script should be called from the 'bioinfo-notebook/' directory.The \n
33 | programs required for this script are in the 'bioinfo-notebook' conda \n
34 | environment (bioinfo-notebook/envs/bioinfo-notebook.yml or \n
35 | bioinfo-notebook/envs/bioinfo-notebook.txt). \n
36 | If the input file is not in the 'bioinfo-notebook/data/' directory, the full \n
37 | file path should be given.\n
38 | \n
39 | arguments: \n
40 | \t -h | --help\t\t show this help text and exit \n
41 | \t -i | --input\t\t name of input FASTA nucleotide file to annotate \n
42 | \t -d | --demo\t\t run the script with demonstration inputs\n
43 | \n
44 | optional arguments:\n
45 | \t -l | --log\t\t redirect terminal output to a log file \n
46 | \t -p | --processors\t set the number (n) of processors to use\n
47 | \t\t\t\t (default: 1) \n
48 | \t -e | --email\t\t contact email for UniProt queries
49 | "
50 |
51 | MAKELOG=false
52 | PROCESSORS=1
53 | EMAIL="none"
54 | DEMO=false
55 | INPUT=""
56 |
57 | # Iterating through the input arguments with a while loop
58 | while (( "$#" )); do
59 | case "$1" in
60 | -h|--help)
61 | echo -e $usage
62 | exit
63 | ;;
64 | -i|--input)
65 | INPUT=$2
66 | shift 2
67 | ;;
68 | -d|--demo)
69 | DEMO=true
70 | shift 1
71 | ;;
72 | -l|--log)
73 | MAKELOG=true
74 | shift 1
75 | ;;
76 | -p|--processors)
77 | PROCESSORS=$2
78 | shift 2
79 | ;;
80 | -e|--email)
81 | EMAIL=$2
82 | shift 2
83 | ;;
84 | --) # end argument parsing
85 | shift
86 | break
87 | ;;
88 | -*|--*) # unsupported flags
89 | echo -e "ERROR: $1 is an invalid option. \n" >&2
90 | echo -e $usage
91 | exit 1
92 | ;;
93 | esac
94 | done
95 |
96 | cd data
97 |
98 | if $MAKELOG ; then
99 | # Creating results directory, if it does not already exist
100 | if [ ! -d ../results ]; then
101 | mkdir ../results
102 | fi
103 | # CREATING LOG FILE
104 | # Terminal output directed to the file 'genome_annotation_[date]_[time].log'
105 | exec 3>&1 4>&2
106 | trap 'exec 2>&4 1>&3' 0 1 2 3
107 | exec 1>../results/genome_annotation_$(date +%Y%m%d_%H%M).log 2>&1
108 | fi
109 |
110 | echo "$(date +%Y/%m/%d\ %H:%M) Beginning genome annotation script."
111 |
112 | if $DEMO ; then
113 | echo Downloading genome FASTA file...
114 | curl -s -o S_cere.fna.gz ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/146\
115 | /045/GCF_000146045.2_R64/GCF_000146045.2_R64_genomic.fna.gz
116 |
117 | echo Decompressing genome FASTA file...
118 | gunzip S_cere.fna.gz
119 |
120 | fi
121 |
122 | echo Downloading Swiss-Prot sequences...
123 | # fixed: dropped stray '| xargs -n 1 -P $PROCESSORS' — curl -o writes to a file, so stdout is empty
124 | curl -s -o uniprot_sprot.fasta.gz ftp://ftp.uniprot.org/pub/databases/uniprot/\
125 | current_release/knowledgebase/complete/uniprot_sprot.fasta.gz
126 |
127 | echo Decompressing Swiss-Prot sequences...
128 | gunzip uniprot_sprot.fasta.gz
129 |
130 | echo Creating BLAST database...
131 | makeblastdb -dbtype prot -in uniprot_sprot.fasta -out SwissProt
132 |
133 | echo Removing Swiss-Prot sequences...
134 | rm -v uniprot_sprot.fasta
135 |
136 | if $DEMO ; then
137 | echo Searching genome FASTA file against Swiss-Prot with BLASTx...
138 | blastx -num_threads $PROCESSORS -evalue 1e-100 -query S_cere.fna \
139 | -db SwissProt -outfmt 6 -out blastx_SwissProt_S_cere_unfiltered.tsv
140 |
141 | echo Removing Swiss-Prot database...
142 | rm -v SwissProt*
143 |
144 | echo Filtering BLASTx results with percentage identity less than 90% with awk...
145 | awk '{ if ($3 >= 90) { print } }' blastx_SwissProt_S_cere_unfiltered.tsv \
146 | > blastx_SwissProt_S_cere.tsv
147 |
148 | echo Removing unfiltered BLASTx results...
149 | rm -v blastx_SwissProt_S_cere_unfiltered.tsv
150 |
151 | echo Creating genome annotation GFF file from BLASTx results...
152 | blast2gff uniprot --fasta-file S_cere.fna blastx_SwissProt_S_cere.tsv \
153 | S_cere_without_UniProt_info.gff
154 |
155 | echo Adding information to genome annotation from UniProt...
156 | until add-gff-info uniprot --email $EMAIL --protein-names --enzymes \
157 | --kegg_orthologs --eggnog --taxon-id S_cere_without_UniProt_info.gff \
158 | S_cere.gff; do
159 | echo add-gff-info failed, retrying in 10 seconds...
160 | rm -v S_cere.gff
161 | sleep 10s
162 | done
163 |
164 | echo Removing copy of genome annotation without added UniProt info...
165 | rm -v S_cere_without_UniProt_info.gff
166 |
167 | echo First line of finished genome annotation...
168 | head -n 1 S_cere.gff
169 | fi
170 |
171 | if [ ! -z $INPUT ]; then
172 | echo Searching genome FASTA file against Swiss-Prot with BLASTx...
173 | blastx -num_threads $PROCESSORS -evalue 1e-100 -query $INPUT \
174 | -db SwissProt -outfmt 6 -out blastx_SwissProt_$INPUT\_unfiltered.tsv
175 |
176 | echo Removing Swiss-Prot database...
177 | rm -v SwissProt*
178 |
179 | echo Filtering BLASTx results with percentage identity less than 90% with awk...
180 | awk '{ if ($3 >= 90) { print } }' blastx_SwissProt_$INPUT\_unfiltered.tsv \
181 | > blastx_SwissProt_$INPUT\.tsv
182 |
183 | echo Removing unfiltered BLASTx results...
184 | rm -v blastx_SwissProt_$INPUT\_unfiltered.tsv
185 |
186 | echo Creating genome annotation GFF file from BLASTx results...
187 | blast2gff uniprot --fasta-file $INPUT blastx_SwissProt_$INPUT\.tsv \
188 | $INPUT\_without_UniProt_info.gff
189 |
190 | echo Adding information to genome annotation from UniProt...
191 | until add-gff-info uniprot --email $EMAIL --protein-names --enzymes \
192 | --kegg_orthologs --eggnog --taxon-id $INPUT\_without_UniProt_info.gff \
193 | $INPUT.gff; do
194 | echo add-gff-info failed, retrying in 10 seconds...
195 | rm -v $INPUT.gff
196 | sleep 10s
197 | done
198 |
199 | echo Removing copy of genome annotation without added UniProt info...
200 | rm -v $INPUT\_without_UniProt_info.gff
201 |
202 | echo First line of finished genome annotation...
203 | head -n 1 $INPUT.gff
204 | fi
205 |
206 | echo "$(date +%Y/%m/%d\ %H:%M) Script finished."
207 |
--------------------------------------------------------------------------------
/scripts/linux_setup.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # https://github.com/rnnh/bioinfo-notebook.git
3 |
4 | # Help/usage text
5 | usage="$(basename "$0") \n
6 | \n
7 | This script downloads and installs Miniconda3, and uses conda to install \n
8 | the 'bioinfo-notebook' virtual environment. \n
9 | \n
10 | Before running this script... \n
11 | \n
12 | \t 1. Please run the following command: \n
13 | \t \t \$ sudo apt-get update \n
14 | \t This will ensure that the software installed will be up-to-date. \n
15 | \n
16 | \t 2. Please ensure that the 'bioinfo-notebook/' directory is in your \n
17 | \t home directory (~). The path to this directory should look like this: \n
18 | \t \t $HOME/bioinfo-notebook \n
19 | \n
20 | The 'bash' command is used to run this script: \n
21 | \t \$ bash $0 \n
22 | \n
23 | Optional arguments: \n
24 | \t -h | --help\t show this help text and exit \n
25 | "
26 |
27 | # Iterating through the input arguments with a while loop
28 | while (( "$#" )); do
29 | case "$1" in
30 | -h|--help)
31 | echo -e $usage
32 | exit 0
33 | ;;
34 | esac
35 | done
36 |
37 | # Changing directory to the home directory ("~" or "$HOME")
38 | cd ~
39 |
40 | echo Checking if the bioinfo-notebook environment is already installed...
41 | sleep 2s # Slows down script to make terminal output more readable
42 | if [ -d ~/miniconda/envs/bioinfo-notebook ]; then
43 | echo The bioinfo-notebook environment already exists, exiting script.
44 | exit 0
45 | fi
46 |
47 | echo Checking if bioinfo-notebook/ is in the home directory...
48 | sleep 2s # Slows down script to make terminal output more readable
49 | # If bioinfo-notebook/ is not in the home directory...
50 | if [ ! -d ~/bioinfo-notebook/ ];
51 | then
52 | echo ERROR: bioinfo-notebook/ is not in the home directory
53 | echo The home directory is $HOME
54 | echo Please move the bioinfo-notebook/ directory to the home directory,
55 | echo or create a copy of bioinfo-notebook/ in $HOME
56 | exit 1
57 | fi
58 |
59 | echo Downloading Miniconda3 installation script...
60 | sleep 2s # Slows down script to make terminal output more readable
61 | # If the Linux system is 64-bit...
62 | if [ "$(uname -m)" == "x86_64" ];
63 | then
64 | # Download the script to install the 64-bit version of miniconda
65 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh \
66 | -O miniconda.sh
67 | # If the Linux system is not 64-bit...
68 | else
69 | # Download the script to install the 32-bit version of miniconda
70 | wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86.sh \
71 | -O miniconda.sh
72 | fi
73 |
74 | echo Installing Miniconda3...
75 | sleep 2s # Slows down script to make terminal output more readable
76 | bash miniconda.sh -b -p $HOME/miniconda
77 |
78 | echo Miniconda3 installed, removing installation script...
79 | rm -f miniconda.sh
80 |
81 | echo Setting up Miniconda3...
82 | sleep 2s # Slows down script to make terminal output more readable
83 | source "$HOME/miniconda/etc/profile.d/conda.sh"
84 | hash -r
85 | conda config --set always_yes yes --set changeps1 yes \
86 | --set auto_activate_base false
87 | conda update -q conda
88 | conda init
89 |
90 | echo Displaying information about current conda installation...
91 | sleep 2s # Slows down script to make terminal output more readable
92 | conda info -a
93 |
94 | echo Creating the bioinfo-notebook virtual environment using conda...
95 | sleep 2s # Slows down script to make terminal output more readable
96 | # If the Linux system is 64-bit...
97 | if [ "$(uname -m)" == "x86_64" ];
98 | then
99 | # Create the virtual environment using the explicit spec list
100 | conda create --name bioinfo-notebook \
101 | --file ~/bioinfo-notebook/envs/bioinfo-notebook.txt
102 | # If the Linux system is not 64-bit...
103 | else
104 | # Create the virtual environment using an "environment".yml file
105 | conda env create -f ~/bioinfo-notebook/envs/bioinfo-notebook.yml
106 | fi
107 |
108 | echo Removing unused packages and caches using conda...
109 | sleep 2s # Slows down script to make terminal output more readable
110 | conda clean --all --yes
111 |
112 | echo -e Script finished! \n
113 |
114 | echo -e Please restart your Linux system for these changes to take effect. \n
115 |
116 | echo The bioinfo-notebook environment can be activated using the command...
117 | echo -e "\t \$ conda activate bioinfo-notebook"
118 | echo A conda virtual environment can be deactivated using the command...
119 | echo -e "\t \$ conda deactivate"
120 |
--------------------------------------------------------------------------------
/scripts/snp_calling.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# https://github.com/rnnh/bioinfo-notebook.git
#
# Aligns paired-end sequencing reads to a reference genome with bowtie2,
# then calls genetic variants (SNPs/indels) from that alignment with
# SAMtools/BCFtools, writing the filtered result to a VCF file.

# Help/usage text
usage="$(basename "$0") [-h|--help] [-1|--one -2|--two -r|--reference] \n
[-d|--demo] [-o|--output -l|--log -p|--processors n] \n
\n
This script aligns sequencing reads to a reference genome, and finds genetic \n
variants (SNPs/indels) based on this alignment, which are written to a variant\n
call format (VCF) file.\n
\n
Calling this script with the argument '-d' or '--demo' will run this script \n
using Saccharomyces cerevisiae FASTQ sequencing reads and a Saccharomyces \n
cerevisiae reference genome, which will be downloaded from NCBI. \n
\n
This script should be called from the 'bioinfo-notebook/' directory.The \n
programs required for this script are in the 'bioinfo-notebook' conda \n
environment (bioinfo-notebook/envs/bioinfo-notebook.yml or \n
bioinfo-notebook/envs/bioinfo-notebook.txt). \n
If the input files are not in the 'bioinfo-notebook/data/' directory, the full \n
file paths should be given.\n\n
\n
arguments: \n
\t -h | --help\t\t show this help text and exit \n
\t -1 | --one\t\t forward reads to align with reference sequence \n
\t\t\t\t (FASTQ: .fastq or .fastq.gz) \n
\t -2 | --two\t\t reverse reads to align with reference sequence \n
\t\t\t\t (FASTQ: .fastq or .fastq.gz) \n
\t -r | --reference\t reference sequence to align reads against \n
\t\t\t\t (FASTA nucleotide file: .fna) \n
\t -d | --demo\t\t run the script with demonstration inputs\n
\n
optional arguments: \n
\t -o | --output\t\t optional: name of final output file \n
\t\t\t\t (default: 'reference_seq_vs_reads_var.vcf', or \n
\t\t\t\t 'S_cere_DRR237290_var.vcf' if demo is used). \n
\t -l | --log\t\t redirect terminal output to a log file in the \n
\t\t\t\t directory bioinfo-notebook/results/ \n
\t -p | --processors\t optional: set the number (n) of processors to \n
\t\t\t\t use (default: 1) \n
"

# Defaults for all options
MAKELOG=false
PROCESSORS=1
DEMO=false
ONE=""
TWO=""
REFERENCE=""
OUTPUT=""

# Iterating through the input arguments with a while loop.
# The final '*)' case is required: without it, a positional (non-flag)
# argument would never be shifted off and this loop would spin forever.
while (( "$#" )); do
    case "$1" in
        -h|--help)
            echo -e "$usage"
            exit 0
            ;;
        -1|--one)
            ONE=$2
            shift 2
            ;;
        -2|--two)
            TWO=$2
            shift 2
            ;;
        -r|--reference)
            REFERENCE=$2
            shift 2
            ;;
        -o|--output)
            OUTPUT=$2
            shift 2
            ;;
        -d|--demo)
            DEMO=true
            shift 1
            ;;
        -l|--log)
            MAKELOG=true
            shift 1
            ;;
        -p|--processors)
            PROCESSORS=$2
            shift 2
            ;;
        --) # end argument parsing
            shift
            break
            ;;
        -*) # unsupported flags ('-*' also matches long '--' options)
            echo -e "ERROR: $1 is an invalid option. \n" >&2
            echo -e "$usage"
            exit 1
            ;;
        *) # positional arguments are not supported
            echo -e "ERROR: $1 is an invalid argument. \n" >&2
            echo -e "$usage"
            exit 1
            ;;
    esac
done

# In non-demo mode, the reads and reference sequence are required; failing
# early here avoids running the pipeline below with empty file arguments.
if ! $DEMO && { [ -z "$ONE" ] || [ -z "$TWO" ] || [ -z "$REFERENCE" ]; }; then
    echo -e "ERROR: the arguments -1, -2 and -r are required unless -d is used. \n" >&2
    echo -e "$usage"
    exit 1
fi

cd ~/bioinfo-notebook/data/ || exit 1

if $MAKELOG ; then
    # Creating results directory, if it does not already exist
    if [ ! -d ../results ]; then
        mkdir ../results
    fi
    # CREATING LOG FILE
    # Terminal output directed to the file 'snp_calling_[date]_[time].log';
    # fds 3/4 save the originals, restored by the trap on exit/interrupt.
    exec 3>&1 4>&2
    trap 'exec 2>&4 1>&3' 0 1 2 3
    exec 1>../results/snp_calling_$(date +%Y%m%d_%H%M).log 2>&1
fi

echo "$(date +%Y/%m/%d\ %H:%M) Beginning SNP calling script."

if $DEMO ; then
    echo "Downloading reads..."
    # Retry loop: NCBI downloads fail intermittently, so keep trying
    # until fastq-dump succeeds.
    until fastq-dump --gzip --skip-technical --readids --read-filter pass \
    --dumpbase --split-files --clip DRR237290; do
        echo "fastq-dump failed, retrying in 10 seconds..."
        sleep 10s
    done

    echo "Downloading reference sequence..."
    curl -s --remote-name --remote-time ftp://ftp.ncbi.nlm.nih.gov/genomes/all/\
GCF/000/146/045/GCF_000146045.2_R64/GCF_000146045.2_R64_genomic.fna.gz

    echo "Decompressing reference sequence..."
    gunzip GCF_000146045.2_R64_genomic.fna.gz

    echo "Indexing reference sequence for bowtie2..."
    bowtie2-build GCF_000146045.2_R64_genomic.fna S_cere_ref_seq

    echo "Aligning reads to the reference genome..."
    bowtie2 --no-unal -p "$PROCESSORS" -x S_cere_ref_seq \
        -1 DRR237290_pass_1.fastq.gz -2 DRR237290_pass_2.fastq.gz \
        -S S_cere_DRR237290_alignment.sam

    echo "Converting SAM alignment to sorted BAM alignment..."
    samtools view -@ "$PROCESSORS" -Sb \
        -o S_cere_DRR237290_alignment_unsorted.bam S_cere_DRR237290_alignment.sam

    samtools sort -@ "$PROCESSORS" -O bam -l 9 -o S_cere_DRR237290_alignment.bam \
        S_cere_DRR237290_alignment_unsorted.bam

    echo "Removing redundant alignment files..."
    rm -v S_cere_DRR237290_alignment.sam S_cere_DRR237290_alignment_unsorted.bam

    echo "Indexing reference sequence for SAMtools..."
    samtools faidx GCF_000146045.2_R64_genomic.fna

    echo "Generating genotype variant likelihoods with BCFtools..."
    bcftools mpileup --max-depth 10000 --threads "$PROCESSORS" \
        -f GCF_000146045.2_R64_genomic.fna \
        -o S_cere_DRR237290_full.bcf S_cere_DRR237290_alignment.bam

    echo "Variant calling with BCFtools..."
    bcftools call -O b --threads "$PROCESSORS" -vc --ploidy 1 -p 0.05 \
        -o S_cere_DRR237290_var_unfiltered.bcf S_cere_DRR237290_full.bcf

    echo "Removing redundant BCF file..."
    rm -v S_cere_DRR237290_full.bcf

    # "$OUTPUT" is quoted: unquoted, [ -z $OUTPUT ] would break if the
    # value contained whitespace.
    if [ -z "$OUTPUT" ]; then
        echo "Variant filtering with BCFtools filter..."
        bcftools filter --threads "$PROCESSORS" -i '%QUAL>=20' -O v \
            -o S_cere_DRR237290_var.vcf S_cere_DRR237290_var_unfiltered.bcf

        echo "Head of VCF file..."
        head S_cere_DRR237290_var.vcf

        echo "Tail of VCF file..."
        tail S_cere_DRR237290_var.vcf
    else
        echo "Variant filtering with BCFtools filter..."
        bcftools filter --threads "$PROCESSORS" -i '%QUAL>=20' -O v \
            -o "$OUTPUT.vcf" S_cere_DRR237290_var_unfiltered.bcf

        echo "Head of VCF file..."
        head "$OUTPUT.vcf"

        echo "Tail of VCF file..."
        tail "$OUTPUT.vcf"
    fi

    echo "$(date +%Y/%m/%d\ %H:%M) Script finished."
    # Exit here so a demo run does not fall through to the generic
    # pipeline below with empty -1/-2/-r arguments.
    exit 0
fi

echo "Indexing reference sequence for bowtie2..."
bowtie2-build "$REFERENCE" reference_seq

echo "Aligning reads to the reference genome..."
bowtie2 --no-unal -p "$PROCESSORS" -x reference_seq \
    -1 "$ONE" -2 "$TWO" -S reference_seq_vs_reads_alignment.sam

echo "Converting SAM alignment to sorted BAM alignment..."
samtools view -@ "$PROCESSORS" -Sb \
    -o reference_seq_vs_reads_alignment_unsorted.bam \
    reference_seq_vs_reads_alignment.sam

samtools sort -@ "$PROCESSORS" -O bam -l 9 \
    -o reference_seq_vs_reads_alignment.bam \
    reference_seq_vs_reads_alignment_unsorted.bam

echo "Removing redundant alignment files..."
rm -v reference_seq_vs_reads_alignment.sam \
    reference_seq_vs_reads_alignment_unsorted.bam

echo "Indexing reference sequence for SAMtools..."
samtools faidx "$REFERENCE"

echo "Generating genotype variant likelihoods with BCFtools..."
bcftools mpileup --max-depth 10000 --threads "$PROCESSORS" \
    -f "$REFERENCE" -o reference_seq_vs_reads_full.bcf \
    reference_seq_vs_reads_alignment.bam

echo "Variant calling with BCFtools..."
bcftools call -O b --threads "$PROCESSORS" -vc --ploidy 1 -p 0.05 \
    -o reference_seq_vs_reads_var_unfiltered.bcf reference_seq_vs_reads_full.bcf

echo "Removing redundant BCF file..."
rm reference_seq_vs_reads_full.bcf

if [ -z "$OUTPUT" ]; then
    echo "Variant filtering with BCFtools filter..."
    bcftools filter --threads "$PROCESSORS" -i '%QUAL>=20' -O v \
        -o reference_seq_vs_reads_var.vcf reference_seq_vs_reads_var_unfiltered.bcf

    echo "Head of VCF file..."
    head reference_seq_vs_reads_var.vcf

    echo "Tail of VCF file..."
    tail reference_seq_vs_reads_var.vcf
else
    echo "Variant filtering with BCFtools filter..."
    bcftools filter --threads "$PROCESSORS" -i '%QUAL>=20' -O v \
        -o "$OUTPUT.vcf" reference_seq_vs_reads_var_unfiltered.bcf

    echo "Head of VCF file..."
    head "$OUTPUT.vcf"

    echo "Tail of VCF file..."
    tail "$OUTPUT.vcf"
fi

echo "$(date +%Y/%m/%d\ %H:%M) Script finished."
--------------------------------------------------------------------------------