├── .github └── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── _config.yml ├── assets └── bioinfo-notebook_logo.svg ├── data ├── design_table.csv ├── example_genome_annotation.gtf ├── example_nucleotide_sequence.fasta └── featCounts_S_cere_20200331.csv ├── docs ├── DE_analysis_edgeR_script.md ├── DE_analysis_edgeR_script.pdf ├── SPAdes.md ├── UniProt_downloader.md ├── annotated_snps_filter.md ├── annotating_snps.md ├── augustus.md ├── bcftools.md ├── blast.md ├── bowtie.md ├── bowtie2.md ├── cl_intro.md ├── cl_solutions.md ├── combining_featCount_tables.md ├── conda.md ├── fasterq-dump.md ├── fastq-dump.md ├── fastq-dump_to_featureCounts.md ├── featureCounts.md ├── file_formats.md ├── genome_annotation_SwissProt_CDS.md ├── htseq-count.md ├── linux_setup.md ├── orthofinder.md ├── part1.md ├── part2.md ├── part3.md ├── report_an_issue.md ├── samtools.md ├── sgRNAcas9.md ├── snp_calling.md ├── ubuntu_virtualbox.md └── wsl.md ├── envs ├── augustus.yml ├── bioinfo-notebook.txt ├── bioinfo-notebook.yml ├── orthofinder.yml └── sgRNAcas9.yml └── scripts ├── DE_analysis_edgeR_script.R ├── UniProt_downloader.sh ├── annotated_snps_filter.R ├── annotating_snps.R ├── combining_featCount_tables.py ├── fastq-dump_to_featureCounts.sh ├── genome_annotation_SwissProt_CDS.sh ├── linux_setup.sh └── snp_calling.sh /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 
22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Smartphone (please complete the following information):** 32 | - Device: [e.g. iPhone6] 33 | - OS: [e.g. iOS8.1] 34 | - Browser [e.g. stock browser, safari] 35 | - Version [e.g. 22] 36 | 37 | **Additional context** 38 | Add any other context about the problem here. 39 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 
21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | data/*.fastq 2 | data/bowtie2_example_index* 3 | data/*.sam 4 | data/*.bam 5 | data/*.bai 6 | data/*.txt 7 | data/*.summary 8 | data/*.gz 9 | data/*.bt2 10 | data/S_cere_GCF_000146045.2_R64_genomic.* 11 | data/*.tsv 12 | data/*.log 13 | temp/ 14 | results/ 15 | .temp/ 16 | results/* 17 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | python: 4 | # We don't actually use the Travis Python, but this keeps it organized. 5 | #- "2.7" 6 | #- "3.5" 7 | #- "3.6" 8 | - "3.7" 9 | 10 | install: 11 | - sudo apt-get update 12 | # We do this conditionally because it saves us some downloading if the 13 | # version is the same. 14 | - if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then 15 | wget https://repo.continuum.io/miniconda/Miniconda2-latest-Linux-x86_64.sh -O miniconda.sh; 16 | else 17 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; 18 | fi 19 | - bash miniconda.sh -b -p $HOME/miniconda 20 | - source "$HOME/miniconda/etc/profile.d/conda.sh" 21 | - hash -r 22 | - conda config --set always_yes yes --set changeps1 no 23 | - conda update -q conda 24 | # Useful for debugging any issues with conda 25 | - conda info -a 26 | 27 | # Creating conda environment using envs/bioinfo-notebook.txt 28 | - conda create --name bioinfo-notebook-explicit --file envs/bioinfo-notebook.txt 29 | - conda activate bioinfo-notebook-explicit 30 | - conda deactivate 31 | 32 | # Creating conda environment using envs/bioinfo-notebook.yml 33 | - conda env create --name bioinfo-notebook-yml --file envs/bioinfo-notebook.yml 34 | - conda activate bioinfo-notebook-yml 35 | - conda deactivate 36 | 37 | # Creating conda environment 
using envs/augustus.yml 38 | - conda env create --name augustus-yml --file envs/augustus.yml 39 | - conda activate augustus-yml 40 | - conda deactivate 41 | 42 | # Creating conda environment using envs/orthofinder.yml 43 | - conda env create --name orthofinder-yml --file envs/orthofinder.yml 44 | - conda activate orthofinder-yml 45 | - conda deactivate 46 | 47 | # Creating conda environment using envs/sgRNAcas9.yml 48 | - conda env create --name sgRNAcas9-yml --file envs/sgRNAcas9.yml 49 | - conda activate sgRNAcas9-yml 50 | - conda deactivate 51 | 52 | script: 53 | # Confirming that programs work in conda environments 54 | # bioinfo-notebook-explicit 55 | - conda activate bioinfo-notebook-explicit 56 | - bowtie2 --version 57 | - samtools --version 58 | - fastq-dump --version 59 | - conda deactivate 60 | 61 | # bioinfo-notebook-yml 62 | - conda activate bioinfo-notebook-yml 63 | - bowtie2 --version 64 | - samtools --version 65 | - fastq-dump --version 66 | - conda deactivate 67 | 68 | # augustus-yml 69 | - conda activate augustus-yml 70 | - augustus --help 71 | - conda deactivate 72 | 73 | # orthofinder-yml 74 | - conda activate orthofinder-yml 75 | - orthofinder --help 76 | - conda deactivate 77 | 78 | # sgRNAcas9-yml 79 | - conda activate sgRNAcas9-yml 80 | - java --version 81 | - conda deactivate 82 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Ronan Harrington 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the 
following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Home 4 | nav_order: 1 5 | description: "Quick start guides for bioinformatics programs, with video demonstrations and scripts." 6 | permalink: / 7 | --- 8 | 9 | 10 | # [Bioinformatics Notebook](https://github.com/rnnh/bioinfo-notebook.git) 11 | 12 | by [Ronan Harrington](https://github.com/rnnh) 13 | 14 | [![Build Status](https://travis-ci.com/rnnh/bioinfo-notebook.svg?branch=master)](https://travis-ci.com/rnnh/bioinfo-notebook) 15 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 16 | ![GitHub issues](https://img.shields.io/github/issues/rnnh/bioinfo-notebook) 17 | ![GitHub repo size](https://img.shields.io/github/repo-size/rnnh/bioinfo-notebook) 18 | ![Website](https://img.shields.io/website?url=https%3A%2F%2Frnnh.github.io%2Fbioinfo-notebook) 19 | [![DOI](https://zenodo.org/badge/243280413.svg)](https://zenodo.org/badge/latestdoi/243280413) 20 | 21 | This project provides introductions to various bioinformatics tools with short guides, video demonstrations, and scripts that tie these tools together. 
22 | The documents in this project can be read locally in a plain-text editor, or viewed online at <https://rnnh.github.io/bioinfo-notebook/>. 23 | If you are not familiar with using programs from the command line, begin with the page "[Introduction to the command line](docs/cl_intro.md)". 24 | If you have any questions, or spot any mistakes, [please submit an issue on GitHub](https://github.com/rnnh/bioinfo-notebook/issues). 25 | 26 | - [Pipeline examples](#pipeline-examples) 27 | - [Contents](#contents) 28 | - [Installation instructions](#installation-instructions) 29 | - [Repository structure](#repository-structure) 30 | 31 | ## Pipeline examples 32 | 33 | These bioinformatics pipelines can be carried out using scripts and tools described in this project. 34 | Input files for some of these scripts can be specified in the command line; other scripts will need to be altered to fit the given input data. 35 | 36 | ### SNP analysis 37 | 38 | - [FASTQ](docs/file_formats.md#fastq) reads from whole genome sequencing (WGS) can be assembled using [SPAdes](docs/SPAdes.md). 39 | - Sequencing reads can be aligned to this assembled genome using [bowtie2](docs/bowtie2.md). 40 | - The script [snp_calling.sh](docs/snp_calling.md) aligns sequencing reads to an assembled genome and detects single nucleotide polymorphisms (SNPs). This will produce a [Variant Call Format (VCF) file](docs/file_formats.md#vcf). 41 | - The proteins in the assembled reference genome- the genome to which the reads are aligned- can be annotated using [genome_annotation_SwissProt_CDS.sh](docs/genome_annotation_SwissProt_CDS.md). 42 | - The genome annotation [GFF](docs/file_formats.md#gff) file can be cross-referenced with the VCF file using [annotating_snps.R](docs/annotating_snps.md). This will produce an [annotated SNP format](docs/annotating_snps.md#annotated-snp-format) file. 43 | - Annotated SNP format files can be cross-referenced using [annotated_snps_filter.R](docs/annotated_snps_filter.md). 
For two annotated SNP files, this script will produce a file with annotated SNPs unique to the first file, and a file with annotated SNPs unique to the second file. 44 | 45 | ### RNA-seq analysis 46 | 47 | - [fastq-dump_to_featureCounts.sh](docs/fastq-dump_to_featureCounts.md) can be used to download RNA-seq reads from NCBI's Sequence Read Archive (SRA) and align them to a reference genome. This script uses [fastq-dump](docs/fastq-dump.md) or [fasterq-dump](docs/fasterq-dump.md) to download the sequencing reads as [FASTQ](docs/file_formats.md#fastq), and [featureCounts](docs/featureCounts.md) to align them to a reference [FASTA nucleotide file.](docs/file_formats.md#fasta) 48 | - Running [fastq-dump_to_featureCounts.sh](docs/fastq-dump_to_featureCounts.md) will produce feature count tables. These feature count tables can be combined using [combining_featCount_tables.py](docs/combining_featCount_tables.md). 49 | - These combined feature count tables can be used for differential expression (DE) analysis. An example DE analysis script is included in this project: [DE_analysis_edgeR_script.R](docs/DE_analysis_edgeR_script.md). This script uses the [R programming language](https://cran.r-project.org/) with the [edgeR](https://bioconductor.org/packages/release/bioc/html/edgeR.html) library. 50 | 51 | ### Detecting orthologs between genomes 52 | 53 | - [Augustus](docs/augustus.md) can be used to predict genes from [FASTA nucleotide files](docs/file_formats.md#fasta). 54 | - Once the FASTA amino acid sequences have been [extracted from the Augustus annotations](docs/augustus.md#extracting-the-fasta-amino-acid-sequences-of-predicted-genes-from-an-augustus-annotation), you can search for orthologs using [OrthoFinder](docs/orthofinder.md). 55 | - To find a specific gene of interest, search the amino acid sequences of the predicted genes using [BLAST](docs/blast.md). 56 | 57 | ## Contents 58 | 59 | ### [1. 
General guides](docs/part1.md) 60 | 61 | - [Introduction to the command line](docs/cl_intro.md) 62 | - [Windows Subsystem for Linux](docs/wsl.md) 63 | - [Using Ubuntu through a Virtual Machine](docs/ubuntu_virtualbox.md) 64 | - [File formats used in bioinformatics](docs/file_formats.md) 65 | 66 | ### [2. Program guides](docs/part2.md) 67 | 68 | - [Augustus](docs/augustus.md) 69 | - [Bcftools](docs/bcftools.md) 70 | - [BLAST](docs/blast.md) 71 | - [Bowtie](docs/bowtie.md) 72 | - [Bowtie2](docs/bowtie2.md) 73 | - [Conda](docs/conda.md) 74 | - [Fasterq-dump](docs/fasterq-dump.md) 75 | - [Fastq-dump](docs/fastq-dump.md) 76 | - [FeatureCounts](docs/featureCounts.md) 77 | - [Htseq-count](docs/htseq-count.md) 78 | - [OrthoFinder](docs/orthofinder.md) 79 | - [SAMtools](docs/samtools.md) 80 | - [sgRNAcas9](docs/sgRNAcas9.md) 81 | - [SPAdes](docs/SPAdes.md) 82 | 83 | ### [3. Scripts](docs/part3.md) 84 | 85 | - [Annotated SNPs filter](docs/annotated_snps_filter.md) 86 | - [Annotating SNPs](docs/annotating_snps.md) 87 | - [Combining featCount tables.py](docs/combining_featCount_tables.md) 88 | - [DE_analysis_edgeR_script.R](docs/DE_analysis_edgeR_script.md) 89 | - [Fastq-dump to featureCounts](docs/fastq-dump_to_featureCounts.md) 90 | - [Genome annotation script](docs/genome_annotation_SwissProt_CDS.md) 91 | - [Linux setup script](docs/linux_setup.md) 92 | - [SNP calling script](docs/snp_calling.md) 93 | - [UniProt downloader](docs/UniProt_downloader.md) 94 | 95 | ## Installation instructions 96 | 97 | After following these instructions, there will be a copy of the [bioinfo-notebook GitHub repo](https://www.github.com/rnnh/bioinfo-notebook/) on your system in the `~/bioinfo-notebook/` directory. 98 | This means there will be a copy of all the documents and scripts in this project on your computer. 
99 | If you are using Linux and run the [Linux setup script](docs/linux_setup.sh), the `bioinfo-notebook` virtual environment- which includes the majority of the command line programs covered in this project- will also be installed using [conda](docs/conda.md). 100 | 101 | **1.** This project is written to be used through a UNIX (Linux or Mac with macOS Mojave or later) operating system. 102 | If you are using a Windows operating system, begin with these pages on setting up Ubuntu (a Linux operating system): 103 | 104 | - [Windows Subsystem for Linux](docs/wsl.md) 105 | - [Using Ubuntu through a Virtual Machine](docs/ubuntu_virtualbox.md) 106 | 107 | Once you have an Ubuntu system set up, run the following command to update the lists of available software: 108 | 109 | ```bash 110 | $ sudo apt-get update # Updates lists of software that can be installed 111 | ``` 112 | 113 | **2.** Run the following command in your home directory (`~`) to download this project: 114 | 115 | ```bash 116 | $ git clone https://github.com/rnnh/bioinfo-notebook.git 117 | ``` 118 | 119 | **3.** If you are using Linux, run the [Linux setup script](docs/linux_setup.md) with this command after downloading the project: 120 | 121 | ```bash 122 | $ bash ~/bioinfo-notebook/scripts/linux_setup.sh 123 | ``` 124 | 125 | ### Video demonstration of installation 126 | 127 | [![asciicast](https://asciinema.org/a/314853.svg)](https://asciinema.org/a/314853?autoplay=1) 128 | 129 | ## Repository structure 130 | 131 | ``` 132 | bioinfo-notebook/ 133 | ├── assets/ 134 | │   └── bioinfo-notebook_logo.svg 135 | ├── data/ 136 | │   ├── blastx_SwissProt_example_nucleotide_sequence.fasta.tsv 137 | │   ├── blastx_SwissProt_S_cere.tsv 138 | │   ├── design_table.csv 139 | │   ├── example_genome_annotation.gtf 140 | │   ├── example_nucleotide_sequence.fasta 141 | │   └── featCounts_S_cere_20200331.csv 142 | ├── docs/ 143 | │   ├── annotated_snps_filter.md 144 | │   ├── annotating_snps.md 145 | │   ├── augustus.md 146 
| │   ├── blast.md 147 | │   ├── bowtie2.md 148 | │   ├── bowtie.md 149 | │   ├── cl_intro.md 150 | │   ├── cl_solutions.md 151 | │   ├── combining_featCount_tables.md 152 | │   ├── conda.md 153 | │   ├── DE_analysis_edgeR_script.md 154 | │   ├── DE_analysis_edgeR_script.pdf 155 | │   ├── fasterq-dump.md 156 | │   ├── fastq-dump.md 157 | │   ├── fastq-dump_to_featureCounts.md 158 | │   ├── featureCounts.md 159 | │   ├── file_formats.md 160 | │   ├── genome_annotation_SwissProt_CDS.md 161 | │   ├── htseq-count.md 162 | │   ├── linux_setup.md 163 | │   ├── orthofinder.md 164 | │   ├── part1.md # Navigation page for website 165 | │   ├── part2.md # Navigation page for website 166 | │   ├── part3.md # Navigation page for website 167 | │   ├── report_an_issue.md 168 | │   ├── samtools.md 169 | │   ├── sgRNAcas9.md 170 | │   ├── snp_calling.md 171 | │   ├── SPAdes.md 172 | │   ├── ubuntu_virtualbox.md 173 | │   ├── UniProt_downloader.md 174 | │   └── wsl.md 175 | ├── envs/ # conda environment files 176 | │   ├── augustus.yml # environment for Augustus 177 | │   ├── bioinfo-notebook.txt 178 | │   ├── bioinfo-notebook.yml 179 | │   ├── orthofinder.yml # environment for OrthoFinder 180 | │   └── sgRNAcas9.yml # environment for sgRNAcas9 181 | ├── scripts/ 182 | │   ├── annotated_snps_filter.R 183 | │   ├── annotating_snps.R 184 | │   ├── combining_featCount_tables.py 185 | │   ├── DE_analysis_edgeR_script.R 186 | │   ├── fastq-dump_to_featureCounts.sh 187 | │   ├── genome_annotation_SwissProt_CDS.sh 188 | │   ├── linux_setup.sh 189 | │   ├── snp_calling.sh 190 | │   └── UniProt_downloader.sh 191 | ├── _config.yml # Configures github.io project website 192 | ├── .gitignore 193 | ├── LICENSE 194 | ├── README.md 195 | └── .travis.yml # Configures Travis CI testing for GitHub repo 196 | ``` 197 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | 
remote_theme: pmarsceill/just-the-docs 2 | baseurl: "/bioinfo-notebook" # the subpath of your site, e.g. /blog 3 | url: "https://rnnh.github.io" # the base hostname & protocol for your site, e.g. http://example.com 4 | title: Bioinformatics Notebook 5 | logo: assets/bioinfo-notebook_logo.svg 6 | search_enabled: true 7 | search_tokenizer_separator: /[\s/]+/ 8 | aux_links: 9 | "Bioinformatics Notebook on GitHub": 10 | - "//github.com/rnnh/bioinfo-notebook" 11 | heading_anchors: true 12 | -------------------------------------------------------------------------------- /assets/bioinfo-notebook_logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 19 | 21 | 40 | 42 | 43 | 45 | image/svg+xml 46 | 48 | 49 | 50 | 51 | 52 | 57 | 63 | 67 | 71 | 75 | 79 | 83 | 87 | 91 | 95 | 99 | 103 | 107 | 111 | 115 | 119 | 123 | 127 | 131 | 135 | 139 | 143 | 147 | 151 | 152 | 153 | 154 | -------------------------------------------------------------------------------- /data/design_table.csv: -------------------------------------------------------------------------------- 1 | run,name,condition 2 | SRR8933532,SCEhightemp3,high_temp 3 | SRR8933534,SCEhightemp1,high_temp 4 | SRR8933509,SCEkcl3,osmotic_pressure 5 | SRR8933530,SCElowPH2,low_pH 6 | SRR8933511,SCEanaer2,anaerobic 7 | SRR8933533,SCEhightemp2,high_temp 8 | SRR8933537,SCEstan1,standard 9 | SRR8933506,SCEanaer3,anaerobic 10 | SRR8933531,SCElowPH1,low_pH 11 | SRR8933538,SCEkcl1,osmotic_pressure 12 | SRR8933512,SCEanaer1,anaerobic 13 | SRR8933510,SCEkcl2,osmotic_pressure 14 | SRR8933535,SCEstan3,standard 15 | SRR8933536,SCEstan2,standard 16 | SRR8933539,SCElowPH3,low_pH 17 | -------------------------------------------------------------------------------- /docs/DE_analysis_edgeR_script.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/rnnh/bioinfo-notebook/aa1c8f5318d40c4105a50108ea1a6102433be8a0/docs/DE_analysis_edgeR_script.pdf -------------------------------------------------------------------------------- /docs/SPAdes.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: SPAdes 4 | parent: 2. Program guides 5 | --- 6 | 7 | # SPAdes 8 | 9 | SPAdes [is an assembly toolkit containing various assembly pipelines](https://github.com/ablab/spades/blob/spades_3.14.1/README.md). 10 | 11 | ## Assembling a genome from Illumina paired-end reads using `SPAdes` 12 | 13 | `SPAdes` can be used to assemble paired-end reads as follows: 14 | 15 | ```bash 16 | $ spades -1 reads_1.fq.gz -2 reads_2.fq.gz -t 5 -m 200 -o results/directory/ 17 | ``` 18 | 19 | In this command... 20 | 21 | 1. **`-1`** is the file with forward reads. 22 | 2. **`-2`** is the file with reverse reads. 23 | 3. **`-t`** or **`--threads`** sets the number of processors/threads to use. The default is 16. 24 | 4. **`-m`** or **`--memory`** is memory the limit in Gb. SPAdes terminates if it reaches this limit. The default value is 250Gb. 25 | 5. **`-o`** or **`--outdir`** is the output directory to use. The default is the current directory. 26 | 27 | SPAdes supports uncompressed (**`.fastq`** or **`.fq`**) or compressed (**`.fastq.gz`** or **`.fq.gz`**) sequencing read inputs. 28 | In the output directory, the assembled genome will be available as contigs (**`contigs.fasta`**) and scaffolds (**`scaffolds.fasta`**), both of which are FASTA nucleotide files. 
29 | 30 | ## See also 31 | 32 | - [conda](conda.md): The `bioinfo-notebook` conda environment includes SPAdes 33 | - [File formats used in bioinformatics](file_formats.md) 34 | 35 | ## Further reading 36 | 37 | - [SPAdes on GitHub](https://github.com/ablab/spades/) 38 | -------------------------------------------------------------------------------- /docs/UniProt_downloader.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: UniProt downloader 4 | parent: 3. Scripts 5 | --- 6 | 7 | # UniProt downloader 8 | 9 | [UniProt_downloader.sh](../scripts/UniProt_downloader.sh) is a `bash` shell script for downloading [UniProt](https://www.uniprot.org/) protein sequences to a FASTA amino acid ([.faa](file_formats.md)) file. 10 | It takes a list of UniProt accession numbers as input, and then pipes each one into a `curl` command to download the corresponding protein. 11 | This is essentially a [one-line program](https://en.wikipedia.org/wiki/One-liner_program) wrapped in a shell script to make downloading UniProt sequences easier. 12 | 13 | ## Usage 14 | 15 | ``` 16 | UniProt_downloader.sh [-h|--help] [-p|--processors n -o|--output] -i|--input 17 | 18 | This script takes a list of UniProt primary accession numbers (*.list), and 19 | downloads the corresponding protein sequences from UniProt as a FASTA amino 20 | acid (.faa) file. 21 | 22 | This list can be generated by searching UniProtKB for a desired term (e.g. 23 | 'taxonomy:147537' for the Saccharomycotina subphylum), selecting 'Download' 24 | and 'Format: List' to download the accession numbers of the corresponding 25 | results. 
26 | 27 | arguments: 28 | -h | --help show this help text and exit 29 | -i | --input the list of UniProt proteins to download 30 | -p | --processors optional: set the number (n) of processors to 31 | use (default: 1) 32 | -o | --output optional: name of the output .faa file 33 | (default: uniprot_{date}.faa) 34 | ``` 35 | 36 | ## See also 37 | 38 | - [UniProt_downloader.sh on GitHub](https://github.com/rnnh/bioinfo-notebook/blob/master/scripts/UniProt_downloader.sh) 39 | - [File formats used in bioinformatics](file_formats.md) 40 | - [UniProt](https://www.uniprot.org/) 41 | -------------------------------------------------------------------------------- /docs/annotated_snps_filter.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Annotated SNPs filter 4 | parent: 3. Scripts 5 | --- 6 | 7 | # Annotated SNPs filter 8 | 9 | [annotated_snps_filter.R](../scripts/annotated_snps_filter.R) is an `R` script cross-references annotated SNP files created using [annotating_snps.R](annotating_snps.md). 10 | It takes two files created using this script, and returns unique SNPs for each file. 11 | If a SNP in File 1 is not found at the same position on the same sequence as File 2, it is returned as a unique SNP, and vice versa. 12 | These unique SNPs are then written to new `.tsv` files. 13 | 14 | ## Usage 15 | 16 | To use this script, variables need to be defined on lines 21 and 22 of the script: 17 | 18 | - Assign the name of the first annotated SNP file to be filtered to 'annotated_SNP_file_1'. 19 | - Assign the name of the second annotated SNP file to be filtered to 'annotated_SNP_file_2'. 20 | - These files should be in the `~/bioinfo-notebook/data/` directory. 21 | - Optional: the name of the output files can be assigned on lines 109 and 115 respectively. 
22 | 23 | ## See also 24 | 25 | - [annotated_snps_filter.R on GitHub](https://github.com/rnnh/bioinfo-notebook/blob/master/scripts/annotated_snps_filter.R) 26 | - [annotating_snps.R](annotating_snps.md) 27 | - [File formats used in bioinformatics](file_formats.md) 28 | - [snp_calling.sh](snp_calling.md), a script for generating VCF files of SNPs. 29 | - [genome_annotation_SwissProt_CDS.sh](genome_annotation_SwissProt_CDS.md), a script for generating genome annotation GFF files. 30 | -------------------------------------------------------------------------------- /docs/annotating_snps.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Annotating SNPs 4 | parent: 3. Scripts 5 | --- 6 | 7 | # Annotating SNPs 8 | 9 | [annotating_snps.R](../scripts/annotating_snps.R) is an `R` script that cross-references annotations of genome assemblies with VCF files containing SNPs of sequencing reads aligned against 10 | those genome assemblies. 11 | If a SNP falls within- or upstream of- an annotated genome feature (start codon, stop codon, CDS, etc.), the script will return that feature along with the SNP. 12 | For this script to work, these files need to use the same sequence names: e.g. if the first sequence in the VCF is called "chrI", there should be a corresponding sequence called "chrI" in the GFF file. 13 | 14 | ## Usage 15 | 16 | To use this script, variables need to be defined on lines 28 to 32 of the script: 17 | 18 | - The GFF file name should be assigned to the variable `GFF_file`. 19 | - The VCF file name should be assigned to the variable `VCF_file`. 20 | - The VCF and GFF files should be in the directory `~/bioinfo-notebook/data/`. 21 | - The number of lines in the VCF file header should be specified in the `VCF_header.int` variable. This is the number of lines that begin with `#` in the VCF file. 
22 | - The variable `upstream.int` is used to determine how far upstream from an annotated feature a SNP can be. This can be set to 0 if you do not want upstream SNPs to be considered. Setting it to 1000 will mean that SNPs up to 1,000 bases/1kb upstream from a feature will be annotated. 23 | - The variable 'output_name' is used to specify the name of the output file, which should end in '.tsv' as it will be a tab-separated values text file. 24 | 25 | ## Annotated SNP format 26 | 27 | The `.tsv` files created by this script have a combination of columns from the [GFF and VCF formats](file_formats.md) as follows... 28 | 29 | 1. `sequence` The name of the sequence where the feature is located. 30 | 2. `source` Keyword identifying the source of the feature, like a program (e.g. Augustus) or an organization (e.g. [SGD](https://www.yeastgenome.org/)). 31 | 3. `feature` The feature type name, like `gene` or `exon`. In a well-structured GFF file, all the children features always follow their parents in a single block (so all exons of a transcript are put after their parent `transcript` feature line and before any other parent transcript line). 32 | 4. `start` Genomic start of the feature, with a 1-base offset. 33 | 5. `end` Genomic end of the feature, with a 1-base offset. 34 | 6. `score` Numeric value that generally indicates the confidence of the source in the annotated feature. A value of `.` (a dot) is used to define a null value. 35 | 7. `strand` Single character that indicates the strand of the feature; it can assume the values of `+` (positive, or `5'->3'`), `-`, (negative, or `3'->5'`), `.` (undetermined). 36 | 8. `phase` Phase of coding sequence (CDS) features, indicating where the feature starts in relation to the reading frame. It can be either one of `0`, `1`, `2` (for CDS features) or `.` (for everything else). 37 | 9. `attributes` All the other information pertaining to this feature. 
The format, structure and content of this field is the one which varies the most between GFF formats. 38 | 10. `POS` The 1-based position of the variation on the given sequence. 39 | 11. `REF` The reference base (or bases in the case of an indel) at the given position on the given reference sequence. 40 | 12. `ALT` The list of alternative alleles at this position. 41 | 13. `QUAL` A quality score associated with the inference of the given alleles. 42 | 14. `FILTER` A flag indicating which of a given set of filters the variation has passed. 43 | 15. `INFO` An extensible list of key-value pairs (fields) describing the variation. Multiple fields are separated by semicolons with optional values in the format: =[,data]. 44 | 16. `SAMPLE` For each (optional) sample described in the file, values are given for the fields listed in FORMAT. If multiple samples have been aligned to the reference sequence, each sample will have its own column. 45 | 46 | 47 | ## See also 48 | 49 | - [annotating_snps.R on GitHub](https://github.com/rnnh/bioinfo-notebook/blob/master/scripts/annotating_snps.R) 50 | - [annotated_snps_filter.R](annotated_snps_filter.md) 51 | - [File formats used in bioinformatics](file_formats.md) 52 | - [snp_calling.sh](snp_calling.md), a script for generating VCF files of SNPs. 53 | - [genome_annotation_SwissProt_CDS.sh](genome_annotation_SwissProt_CDS.md), a script for generating genome annotation GFF files. 54 | -------------------------------------------------------------------------------- /docs/augustus.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Augustus 4 | parent: 2. Program guides 5 | --- 6 | 7 | # Augustus 8 | 9 | Augustus is a program that predicts genes in eukaryotic genomic sequences. 
10 | It can be run online, with a server for [smaller files](http://bioinf.uni-greifswald.de/augustus/submission.php) and one for [larger files](http://bioinf.uni-greifswald.de/webaugustus/), or locally. 11 | The local version of Augustus can be installed through [conda](conda.md). 12 | This project includes an example [augustus conda environment](../envs/augustus.yml). 13 | 14 | ## Predicting genes in a eukaryotic FASTA nucleic acid file using `augustus` 15 | 16 | `augustus` can be used to predict genes as follows: 17 | 18 | ```bash 19 | $ augustus --species=species_name input_file.fna > output_file.gff 20 | ``` 21 | 22 | In this command... 23 | 24 | 1. `--species` is used to specify the target species for gene predictions (`species_name`). 25 | 2. `input_file.fna` is the input FASTA nucleic acid file ([.fna](file_formats.md#fasta)). 26 | 3. `output_file.gff` is the general feature format ([GFF](file_formats.md#generic-feature-formats)) genome annotation output file. 27 | Lines beginning with `#` are Augustus comments: these lines do not follow the GFF structure. 28 | 29 | The following command gives the list of valid species names for use with Augustus: 30 | 31 | ```bash 32 | $ augustus --species=help 33 | ``` 34 | 35 | ## Extracting the FASTA amino acid sequences of predicted genes from an Augustus annotation 36 | 37 | The genome annotation file produced by `augustus` (`output_file.gff`) contains the amino acid sequences of predicted genes in comment lines. 38 | These amino acid sequences can be extracted to a FASTA file with the following command: 39 | 40 | ```bash 41 | $ getAnnoFasta.pl output_file.gff 42 | ``` 43 | 44 | The amino acid sequences will be written to `output_file.aa`. 45 | This is a FASTA amino acid ([.faa](file_formats.md#fasta)). 
46 | The extension of this file can be changed from ".aa" to ".faa" with the following command: 47 | 48 | ```bash 49 | $ mv output_file.aa output_file.faa 50 | ``` 51 | 52 | ## Removing comments from Augustus annotations 53 | 54 | Genome annotations produced by Augustus follow the [Generic Feature Format](file_formats.md#generic-feature-formats), with the addition of comment lines for amino acid sequences. 55 | These are the same FASTA amino acid sequences that are extracted using `getAnnoFasta.pl`. 56 | These lines begin with the character `#`, and removing them results a standard GFF file. 57 | 58 | Here is one method for removing these amino acid lines, using `grep -v` to select lines which do not contain the `#` character: 59 | 60 | ```bash 61 | $ grep -v "#" augustus_annotation.gff > clean_augustus_annotation.gff 62 | ``` 63 | 64 | ## Demonstration 65 | 66 | In this video, `augustus` is used to predict genes in `example_nucleotide_sequence.fasta`. 67 | This results in a genome annotation file: `augustus_example.gff`. 68 | The script `getAnnoFasta.pl` is used to extract the amino acid sequences in this genome annotation file to a new FASTA amino acid file: `augustus_example.aa`. 69 | The `mv` command is used to change the extension of this file from ".aa" to ".faa". 70 | 71 | [![asciicast](https://asciinema.org/a/346541.svg)](https://asciinema.org/a/346541?autoplay=1) 72 | 73 | ## See also 74 | 75 | - [conda](conda.md) 76 | - [augustus conda environment](../envs/augustus.yml) 77 | - [File formats used in bioinformatics](file_formats.md) 78 | 79 | ## References 80 | 81 | - [The Augustus website](http://bioinf.uni-greifswald.de/augustus/) 82 | - [GNU grep](https://www.gnu.org/software/grep/manual/grep.html) 83 | -------------------------------------------------------------------------------- /docs/bcftools.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Bcftools 4 | parent: 2. 
Program guides 5 | --- 6 | 7 | # Bcftools 8 | 9 | Bcftools are a set of [utilities for variant calling and manipulating VCFs and BCFs](https://samtools.github.io/bcftools/bcftools.html). 10 | 11 | ## Generating genotype likelihoods for alignment files using `bcftools mpileup` 12 | 13 | `bcftools mpileup` can be used to generate VCF or BCF files containing genotype likelihoods for one or multiple alignment (BAM or CRAM) files as follows: 14 | 15 | ```bash 16 | $ bcftools mpileup --max-depth 10000 --threads n -f reference.fasta -o genotype_likelihoods.bcf reference_sequence_alignment.bam 17 | ``` 18 | 19 | In this command... 20 | 21 | 1. **`--max-depth`** or **`-d`** sets the reads per input file for each position in the alignment. In this case, it is set to 10000. 22 | 2. **`--threads`** sets the number (*n*) of processors/threads to use. 23 | 3. **`--fasta-ref`** or **`-f`** is used to select the [faidx-indexed FASTA](samtools.md#indexing-a-fasta-file-using-samtools-faidx) nucleotide reference file (*reference.fasta*) used for the alignment. 24 | 4. **`--output`** or **`-o`** is used to name the output file (*genotype_likelihoods.bcf*). 25 | 5. The final argument given is the input BAM alignment file (*reference_sequence_alignment.bam*). Multiple input files can be given here. 26 | 27 | ## Variant calling using `bcftools call` 28 | 29 | `bcftools call` can be used to call SNP/indel variants from a BCF file as follows: 30 | 31 | ```bash 32 | $ bcftools call -O b --threads n -vc --ploidy 1 -p 0.05 -o variants_unfiltered.bcf genotype_likelihoods.bcf 33 | ``` 34 | 35 | In this command... 36 | 37 | 1. **`--output-type`** or **`-O`** is used to select the output format. In this case, *b* for BCF. 38 | 2. **`--threads`** sets the number (*n*) of processors/threads to use. 39 | 3. **`-vc`** specifies that we want the output to contain variants only, using the original [SAMtools](samtools.md) consensus caller. 40 | 4. **`--ploidy`** specifies the ploidy of the assembly. 
41 | 5. **`--pval-threshold`** or **`-p`** is used to set the p-value threshold for variant sites (*0.05*). 42 | 6. **`--output`** or **`-o`** is used to name the output file (*variants_unfiltered.bcf*). 43 | 7. The final argument is the input BCF file (*genotype_likelihoods.bcf*). 44 | 45 | ## Filtering variants using `bcftools filter` 46 | 47 | `bcftools filter` can be used to filter variants from a BCF file as follows... 48 | 49 | ```bash 50 | $ bcftools filter --threads n -i '%QUAL>=20' -O v -o variants_filtered.vcf variants_unfiltered.bcf 51 | ``` 52 | 53 | In this command... 54 | 55 | 1. **`--threads`** sets the number (*n*) of processors/threads to use. 56 | 2. **`--include`** or **`-i`** is used to define the expression used to filter sites. In this case, *`%QUAL>=20`* results in sites with a quality score greater than or equal to 20. 57 | 3. **`--output-type`** or **`-O`** is used to select the output format. In this case, *v* for VCF. 58 | 4. **`--output`** or **`-o`** is used to name the output file (*variants_filtered.vcf*). 59 | 5. The final argument is the input BCF file (*variants_unfiltered.bcf*). 60 | 61 | ## See also 62 | 63 | - [File formats used in bioinformatics](file_formats.md) 64 | - [SNP calling script](snp_calling.md) 65 | 66 | ## Further reading 67 | 68 | - [bcftools documentation](https://samtools.github.io/bcftools/bcftools.html) 69 | -------------------------------------------------------------------------------- /docs/blast.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: BLAST 4 | parent: 2. Program guides 5 | --- 6 | 7 | # BLAST 8 | 9 | The Basic Local Alignment Search Tool (BLAST) is an algorithm and program for comparing primary biological sequence information, such as the amino-acid sequences of proteins or the nucleotides of DNA and/or RNA sequences. 
10 | BLAST is one of the most widely used tools in bioinformatics; it can be applied to different problems or projects in a myriad ways. 11 | 12 | ## Contents 13 | 14 | - [How BLAST works](#how-blast-works) 15 | - [The command line version of BLAST](#the-command-line-version-of-blast) 16 | - [Types of BLAST search](#types-of-blast-search) 17 | - [E-value and Bit-score](#e-value-and-bit-score) 18 | - [Creating a BLAST database using `makeblastdb`](#creating-a-blast-database-using-makeblastdb) 19 | - [Creating local BLAST database from Swiss-Prot](#downloading-swiss-prot-fasta-sequences-and-creating-a-blast-protein-database) 20 | - [Searching against a BLAST nucleotide database using `blastn`](#searching-against-a-blast-nucleotide-database-using-blastn) 21 | - [BLAST `-outfmt 6` results](#blast--outfmt-6-results) 22 | - [Video demonstration](#video-demonstration) 23 | - [See also](#see-also) 24 | - [References](#references) 25 | 26 | ## How BLAST works 27 | 28 | There are two main steps in BLAST: 29 | 30 | 1. A list of "words" (sets of characters/residues) of length *k* is created for the query sequence. By default, *k* = 3 for amino acid sequences, and *k* = 11 for nucleotide sequences. 31 | 2. An alignment is made for database (subject) sequences that share many words with the query sequence. This is a local alignment in which only High-scoring Segment Pairs (HSPs) are reported. In other words, BLAST finds islands of similarity between sequences. 32 | 33 | ![An outline of the BLAST algorithm](https://www.ncbi.nlm.nih.gov/books/NBK62051/bin/blast_glossary-Image001.jpg "An outline of the BLAST algorithm") 34 | 35 | ## The command line version of BLAST 36 | 37 | BLAST can be used online, or through the command line. 38 | Most biologists are familiar with [NCBI's web application for BLAST](). 39 | If you use this web application regularly, the command line BLAST program is worth your consideration. 
40 | The command line version of BLAST has several advantages over the web version: 41 | 42 | 1. BLAST on the command line can be used to run *local searches*, i.e. searches which use files that are on your computer, instead of files that are on an NCBI database. 43 | 2. BLAST searches on the command line can be made more specific by adding additional arguments. 44 | 3. BLAST searches carried out on the command line can be automated, and incorporated into larger scripts. 45 | 4. The command line BLAST program can output search results in various structured text formats. 46 | 47 | The command line version of BLAST can be downloaded via [conda](conda.md) using the following command: 48 | 49 | ```bash 50 | $ conda install -c bioconda blast 51 | ``` 52 | 53 | This program is included in the [bioinfo-notebook conda environment](../envs/bioinfo-notebook.txt). 54 | 55 | ## Types of BLAST search 56 | 57 | There are five main types of BLAST search: 58 | 59 | 1. **BLASTp** searches a protein database with a protein query sequence. 60 | 2. **BLASTn** searches a nucleic acid database with nucleic acid query sequence. 61 | 3. **BLASTx** searches a protein database with nucleic acid query sequence, which is translated into an amino acid sequence. 62 | 4. **tBLASTx** searches a nucleic acid database with nucleic acid query sequence. In this case, both the database (subject) sequences and query sequence are translated into amino acid sequences. 63 | 5. **tBLASTn** searches a nucleic acid database with protein query sequence. In this case, the nucleic acid database is translated into a set of amino acid sequences. 64 | 65 | While the type of query and subject sequences required for each of these BLAST searches differs, the command line arguments that can be used for these BLAST searches are interchangeable. 66 | 67 | ## E-value and Bit-score 68 | 69 | Two important variables when interpreting BLAST results are *E-value* and *bit-score*. 
70 | These are both derived from the *raw alignment score (S)*, which is based on the number of residues (i.e. individual amino/nucleic acids) that two sequences have in common. 71 | The more identical residues that two sequences have at the same position in an alignment, the higher the alignment score. 72 | 73 | - **Bit-score (S')** is the raw alignment score (S) normalised with respect to the scoring system used for the alignment. 74 | - **E-value** or Expectation value is the number of different alignments with scores equivalent to or better than S that is expected to occur in a database search by chance. The lower the E value, the more significant the score and the alignment. An exact match between query and subject sequences results in an E-value of zero. 75 | 76 | While bit-scores are comparable between searches, as they are normalised, they do not take the size of the database into account. 77 | E-values, however, do account for the size of the database. 78 | The lower the E-value and the higher the bit-score, the better the BLAST result. 79 | 80 | ## Creating a BLAST database using `makeblastdb` 81 | 82 | To search against a set of nucleotide or amino acid sequences using BLAST, a database must be created. 83 | This can be done using the `makeblastdb` command. 84 | 85 | ```bash 86 | $ makeblastdb -dbtype prot/nucl -in input_file -out database_name 87 | ``` 88 | 89 | In this command... 90 | 91 | 1. `-dbtype` specifies the type of sequences used to create the database. For amino acid (protein) sequences, `prot` is used ("`-dbtype prot`"). For nucleic acid sequences, `nucl` is used ("`-dbtype nucl`"). 92 | 2. `-in` is used to specify the input file. The database created can be used to search against the sequences in this file. 93 | 3. `-out` is used to name the database that will be created from the input file. 
94 | 95 | ## Downloading Swiss-Prot FASTA sequences and creating a BLAST protein database 96 | 97 | In this video, the FASTA amino acid sequences of Swiss-Prot are downloaded, and a BLAST protein database is created from these sequences using `makeblastdb`. 98 | [UniProtKB/Swiss-Prot](https://en.wikipedia.org/wiki/UniProt#UniProtKB.2FSwiss-Prot) is a manually annotated, non-redundant protein sequence database. 99 | As it is well-annotated and curated, the Swiss-Prot database gives informative results when searched locally using `blastp` and `blastx`. 100 | The link used in the `wget` command is copied and pasted from the [UniProt downloads page](https://www.uniprot.org/downloads). 101 | The compressed FASTA sequences of the Swiss-Prot database are hosted on `ftp.uniprot.org`. 102 | 103 | These FASTA amino acid sequences are compressed into a `.gz` (gzip) file. 104 | Before using the `makeblastdb` command, this FASTA file is uncompressed using `gunzip`, turning `uniprot_sprot`**`.fasta.gz`** into `uniprot_sprot`**`.fasta`**. 105 | Once the FASTA file is downloaded and uncompressed, `makeblastdb` is used to create a BLAST protein database of the amino acid sequences in this FASTA file. 106 | This BLAST protein database is named `swissprot`, and consists of three binary files. 107 | 108 | Once the BLAST protein database is created, `blastp` and `blastx` can be used to search sequences against it. 109 | This database can be selected using the argument `-db swissprot` with `blastp` or `blastx` (the path to the `swissprot` database will need to be given if the command is run from a different directory). 110 | 111 | [![asciicast](https://asciinema.org/a/338534.svg)](https://asciinema.org/a/338534?autoplay=1) 112 | 113 | ## Searching against a BLAST nucleotide database using `blastn` 114 | 115 | The program `blastn` is used for searching nucleotide databases with a nucleotide query. 
116 | 117 | ```bash 118 | $ blastn -query query_file.fna -db nucl_database_name -out results_file.tsv -outfmt 6 -evalue x -max_hsps y -num_threads n 119 | ``` 120 | 121 | In this command... 122 | 123 | 1. `-query` is used to select the FASTA nucleic acids file you want to search against the BLAST database (the `query_file.fna`). 124 | 2. `-db` is used to select the BLAST nucleotide database you want to search against (`nucl_database_name`). 125 | 3. `-out` is used to direct the results to an output file (`results_file.tsv`). 126 | 4. `-outfmt` is used to specify how this results file should be formatted. In this case, as `-outfmt` is `6`, the results will be written to a file as tab-separated values: this is why `results_file.tsv` has a `.tsv` extension. 127 | 5. `-evalue` is used to set an E-value threshold (`x`). Results which have an E-value greater than this threshold will not be written to the results file. 128 | 6. `-max_hsps` is used to set a High-scoring Segment Pairs (HSPs) threshold (`y`). When given, no more than `y` HSPs (alignments) for each query-subject pair will be written to the results file. 129 | 7. `-num_threads` is used to set the number (*`n`*) of threads/processors to use (default 1). 130 | 131 | The last two arguments given in this command — `-evalue` and `-max_hsps` — are optional, but they are useful as they allow the results to be filtered before being written to the file. 132 | Using these arguments will result in more specific results, and will reduce the need to manually filter results later. 133 | 134 | ## BLAST `-outfmt 6` results 135 | 136 | These BLAST results are taken from the [video demonstration](#video-demonstration) and are in BLAST output format 6. 
137 | 138 | ``` 139 | gi|242120357|gb|FJ461870.1| NC_001144.5 93.252 163 11 0 196 358 454921 454759 7.57e-63 241 140 | gi|242120357|gb|FJ461870.1| NC_001144.5 93.252 163 11 0 196 358 464058 463896 7.57e-63 241 141 | gi|242120357|gb|FJ461870.1| CP036478.1 93.252 163 11 0 196 358 454829 454667 7.57e-63 241 142 | gi|242120357|gb|FJ461870.1| CP036478.1 93.252 163 11 0 196 358 463966 463804 7.57e-63 241 143 | gi|242120357|gb|FJ461870.1| CP024006.1 93.252 163 11 0 196 358 453978 453816 7.57e-63 241 144 | ``` 145 | 146 | These results are tab-separated values, meaning each column in the results is separated by a `Tab` character. 147 | These columns always appear in the same order: 148 | 149 | ``` 150 | query_id subject_id per_identity aln_length mismatches gap_openings q_start q_end s_start s_end e-value bit_score 151 | ``` 152 | 153 | In this format... 154 | 155 | 1. `query_id` is the FASTA header of the sequence being searched against the database (the query sequence). 156 | 2. `subject_id` is the FASTA header of the sequence in the database that the query sequence has been aligned to (the subject sequence). 157 | 3. `per_identity` is the percentage identity- the extent to which the query and subject sequences have the same residues at the same positions. 158 | 4. `aln_length` is the alignment length. 159 | 5. `mismatches` is the number of mismatches. 160 | 6. `gap_openings` is the number of gap openings in the alignment. 161 | 7. `q_start` is the start of the alignment in the query sequence. 162 | 8. `q_end` is the end of the alignment in the query sequence. 163 | 9. `s_start` is the start of the alignment in the subject sequence. 164 | 10. `s_end` is the end of the alignment in the subject sequence. 165 | 11. `e_value` is the expect value (E-value) for the alignment. 166 | 12. `bit_score` is the bit-score of the alignment. 167 | 168 | All BLAST output formats above 4 (i.e. `--outfmt > 4`) use this tabular layout, formatted in different ways. 
For example, `-outfmt 10` gives the same information in a comma-separated values (`.csv`) file instead of a tab-separated values (`.tsv`) file. 170 | 171 | ## Video demonstration 172 | 173 | In this demonstration, `makeblastdb` is used to create a BLAST database from the file `S_cere_genomes.fna`. 174 | This FASTA nucleic acids (`.fna`) file was created by concatenating the following *Saccharomyces cerevisiae* genome assemblies, which were downloaded from NCBI: [GCA_003086655.1](https://www.ncbi.nlm.nih.gov/assembly/GCA_003086655.1), [GCA_003709285.1](https://www.ncbi.nlm.nih.gov/assembly/GCA_003709285.1) and [GCA_004328465.1](https://www.ncbi.nlm.nih.gov/assembly/GCA_004328465.1). 175 | 176 | The program `blastn` is then used to query `23S_rRNA_gene.fna` against this database. 177 | This file is a copy of the [*Scutellospora fulgida* isolate NC303A 25S ribosomal RNA gene](https://www.ncbi.nlm.nih.gov/nuccore/FJ461870.1?report=fasta) from NCBI. 178 | 179 | The program `tblastn` is also used to query `YPK1.faa` against this database multiple times. 180 | This FASTA amino acid (`.faa`) file is a copy of the [serine/threonine-protein kinase YPK1](https://www.uniprot.org/uniprot/P12688) from UniProt. 181 | This search is carried out multiple times with additional parameters: the flag `-evalue` is used to set an E-value threshold, and the flag `-max_hsps` is used to set a maximum number of High-scoring Segment Pairs (HSPs). 182 | 183 | The results from these BLAST searches are written to tab-separated values (`.tsv`) files. 184 | This output format is specified with the flag `-outfmt 6`. 
185 | 186 | [![asciicast](https://asciinema.org/a/327279.svg)](https://asciinema.org/a/327279?autoplay=1) 187 | 188 | ## See also 189 | 190 | - [File formats used in bioinformatics](file_formats.md) 191 | - [Introduction to the command line](cl_intro.md) 192 | - [conda](conda.md) 193 | - [NCBI's web application for BLAST]() 194 | 195 | ## References 196 | 197 | - [BLAST® Command Line Applications User Manual](https://www.ncbi.nlm.nih.gov/books/NBK279690/) 198 | - [BLAST Glossary](https://www.ncbi.nlm.nih.gov/books/NBK62051/) 199 | -------------------------------------------------------------------------------- /docs/bowtie.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Bowtie 4 | parent: 2. Program guides 5 | --- 6 | 7 | # Bowtie 8 | 9 | `bowtie` can be used to: 10 | - index reference FASTA nucleotide genomes/sequences 11 | - align FASTQ sequencing reads to those genomes/sequences 12 | 13 | If you want to align short reads (50bp or less), [bowtie is more suitable than bowtie2](bowtie2.md#differences-between-bowtie-and-bowtie2). 14 | 15 | ## Indexing a reference genome/sequence using `bowtie-build` 16 | 17 | Before aligning reads to a reference genome with `bowtie`, it must be indexed using 18 | `bowtie-build`. 19 | This command will create six files with the extensions `.1.ebwt`, `.2.ebwt`, `.3.ebwt`, `.4.ebwt`, `.rev.1.ebwt`, and `.rev.2.ebwt`. 20 | These six files together are the index. 21 | Once an index has been created, the original reference genome/sequence is no longer needed to align reads. 22 | Here's an example `bowtie-build` command: 23 | 24 | ``` 25 | $ bowtie-build reference_sequence.fasta index_name 26 | ``` 27 | 28 | In this command, the `reference_sequence.fasta` is the nucleotide FASTA sequence we want to index, and `index_name` is the name of the index. 
29 | There will be six files beginning with the `index_name` in the output directory: `index_name.1.ebwt`, `index_name.2.ebwt`, `index_name.3.ebwt`, `index_name.4.ebwt`, `index_name.rev.1.ebwt`, and `index_name.rev.2.ebwt`. 30 | There's no need to specify any of these files individually in subsequent `bowtie` commands, the `index_name` alone is enough to refer to the entire index. 31 | 32 | ## Aligning reads to an indexed genome/sequence using `bowtie` 33 | 34 | Now that the genome has been indexed, FASTQ sequencing reads can be aligned to it. 35 | This is done using the `bowtie` command. 36 | Here is an example `bowtie` command: 37 | 38 | ``` 39 | $ bowtie --no-unal --threads n --sam index_name -1 reads_1.fastq -2 reads_2.fastq output.sam 40 | ``` 41 | 42 | In this command... 43 | 44 | 1. **`--no-unal`** is an optional argument, meaning reads that do not align to the reference genome will not be written to `sam` output 45 | 2. **`--threads`** is the number (*n*) of processors/threads used 46 | 3. **`--sam`** specifies that the output should be written in the [SAM format](file_formats.md#sam) 47 | 4. **`index_name`** is the name of the genome index 48 | 5. **`-1`** is the file(s) containing mate 1 reads ([`reads_1.fastq`](file_formats.md#fastq)) 49 | 6. **`-2`** is the file(s) containing mate 2 reads ([`reads_2.fastq`](file_formats.md#fastq)) 50 | 7. **`output.sam`** is the output alignment in `sam` format 51 | 52 | ## Demonstration 53 | 54 | In this video, `bowtie-build` is used to index `S_cere_GCF_000146045.2_R64_genomic.fna`, which is a copy of the [*Saccharomyces cerevisiae* S288C genome from RefSeq](https://www.ncbi.nlm.nih.gov/assembly/GCF_000146045.2). 55 | The `bowtie` command is then used to align [*Saccharomyces cerevisiae* RNAseq reads](https://www.ncbi.nlm.nih.gov/sra/SRR11462797) to this bowtie index. 56 | 57 | [![asciicast](https://asciinema.org/a/316272.svg)](https://asciinema.org/a/316272?autoplay=1) 58 | 59 | ## Further reading 60 | 61 | 1. 
The `bowtie` manual: 62 | -------------------------------------------------------------------------------- /docs/bowtie2.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Bowtie2 4 | parent: 2. Program guides 5 | --- 6 | 7 | # Bowtie2 8 | 9 | From the manual: [*"Bowtie 2 is an ultrafast and memory-efficient tool for aligning sequencing reads to long reference sequences"*](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml). 10 | 11 | `bowtie2` can be used to: 12 | - index reference FASTA nucleotide genomes/sequences 13 | - align FASTQ sequencing reads to those genomes/sequences 14 | 15 | ## Differences between `bowtie` and `bowtie2` 16 | 17 | - `bowtie2` has no upper limit on read length 18 | - `bowtie2` can make gapped alignments 19 | - `bowtie2` is more flexible for paired-end alignment 20 | - `bowtie2` is faster and more memory efficient 21 | - `bowtie` is advantageous over `bowtie2` for relatively short sequencing reads (50bp or less) 22 | 23 | ## Indexing a reference genome/sequence using `bowtie2-build` 24 | 25 | Before aligning reads to a reference genome with `bowtie2`, it must be indexed using `bowtie2-build`. 26 | This command will create six files with the extensions `.1.bt2`, `.2.bt2`, `.3.bt2`, `.4.bt2`, `.rev.1.bt2`, and `.rev.2.bt2`. 27 | These six files together are the index. 28 | Once an index has been created, the original reference genome/sequence is no longer needed to align reads. 29 | Here's an example `bowtie2-build` command: 30 | 31 | ``` 32 | $ bowtie2-build reference_sequence.fasta index_name 33 | ``` 34 | 35 | In this command, the `reference_sequence.FASTA` is the nucleotide FASTA sequence we want to index, and `index_name` is the name of the index. 
36 | There will be six files beginning with the `index_name` in the output directory: `index_name.1.bt2`, `index_name.2.bt2`, `index_name.3.bt2`, `index_name.4.bt2`, `index_name.rev.1.bt2`, and `index_name.rev.2.bt2`. 37 | There's no need to specify any of these files individually, just the `index_name` alone is enough to refer to the entire index. 38 | 39 | ## Aligning reads to an indexed genome/sequence using `bowtie2` 40 | 41 | Now that the genome has been indexed, FASTQ sequencing reads can be aligned to it. 42 | This is done using the `bowtie2` command. 43 | Here's an example `bowtie2` command: 44 | 45 | ``` 46 | $ bowtie2 --no-unal -p n -x index_name -1 reads_1.fastq -2 reads_2.fastq -S output.sam 47 | ``` 48 | 49 | In this command... 50 | 51 | 1. **`--no-unal`** is an optional argument, meaning reads that do not align to the reference genome will not be written to `sam` output 52 | 2. **`-p`** is the number (*n*) of processors/threads used 53 | 3. **`-x`** is the genome index 54 | 4. **`-1`** is the file(s) containing mate 1 reads 55 | 5. **`-2`** is the file(s) containing mate 2 reads 56 | 6. **`-S`** is the output alignment in `sam` format 57 | 58 | ## Demonstration 59 | 60 | In this video, `bowtie2-build` is used to index `example_nucleotide_sequence.fasta`, and the command `bowtie2` is used to align reads to this bowtie2 index. 61 | 62 | [![asciicast](https://asciinema.org/a/306546.svg)](https://asciinema.org/a/306546?autoplay=1) 63 | 64 | ## Further reading 65 | 66 | 1. The `bowtie2` manual: 67 | -------------------------------------------------------------------------------- /docs/cl_solutions.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Command line exercise solutions 4 | nav_exclude: true 5 | --- 6 | 7 | # Command line exercise solutions 8 | 9 | The `pwd` commands in these solutions are added to clarify the working directories used. 
10 | 11 | **1.** Change the working directory from `bioinfo-notebook/` to `bioinfo-notebook/data/`. 12 | 13 | ```bash 14 | ronan@dell:~/bioinfo-notebook$ pwd 15 | /home/ronan/bioinfo-notebook 16 | ronan@dell:~/bioinfo-notebook$ cd data/ 17 | ronan@dell:~/bioinfo-notebook/data$ pwd 18 | /home/ronan/bioinfo-notebook/data 19 | ``` 20 | 21 | **2.** Change the working directory from `bioinfo-notebook/data` to `bioinfo-notebook/docs`, using `../` in your command. 22 | 23 | ```bash 24 | ronan@dell:~/bioinfo-notebook/data$ pwd 25 | /home/ronan/bioinfo-notebook/data 26 | ronan@dell:~/bioinfo-notebook/data$ cd ../docs/ 27 | ronan@dell:~/bioinfo-notebook/docs$ pwd 28 | /home/ronan/bioinfo-notebook/docs 29 | ``` 30 | 31 | **3.** List the files in the `bioinfo-notebook/docs/` directory. 32 | 33 | ```bash 34 | ronan@dell:~/bioinfo-notebook/docs$ pwd 35 | /home/ronan/bioinfo-notebook/docs 36 | ronan@dell:~/bioinfo-notebook/docs$ ls 37 | bowtie2.md file_formats.md 38 | bowtie.md htseq-count.md 39 | cl_intro.md linux_setup.md 40 | cl_solutions.md part1.md 41 | combining_featCount_tables.md part2.md 42 | conda.md part3.md 43 | fasterq-dump.md samtools.md 44 | fastq-dump.md to_do.md 45 | fastq-dump_to_featureCounts.md ubuntu_virtualbox.md 46 | featureCounts.md wsl.md 47 | ``` 48 | 49 | **4.** Select a file in the `bioinfo-notebook/docs/` directory, and display the first 6 lines of it using the `head` command. 50 | 51 | ```bash 52 | ronan@dell:~/bioinfo-notebook/docs$ pwd 53 | /home/ronan/bioinfo-notebook/docs 54 | ronan@dell:~/bioinfo-notebook/docs$ head cl_solutions.md 55 | --- 56 | layout: default 57 | title: Command line exercise solutions 58 | nav_exclude: true 59 | --- 60 | 61 | # Command line exercise solutions 62 | 63 | 1. Change the working directory from `bioinfo-notebook/` to `bioinfo-notebook/data/`. 64 | ``` 65 | 66 | **5.** Display the last 2 lines of all the files in the `bioinfo-notebook/docs/` directory, using the `tail` command. 
67 | 68 | ```bash 69 | ronan@dell:~/bioinfo-notebook/docs$ pwd 70 | /home/ronan/bioinfo-notebook/docs 71 | ronan@dell:~/bioinfo-notebook/docs$ tail -n 2 * 72 | ``` 73 | ``` 74 | ==> bowtie2.md <== 75 | 76 | 1. The `bowtie2` manual: 77 | 78 | ==> bowtie.md <== 79 | 80 | 1. The `bowtie` manual: 81 | 82 | ==> cl_intro.md <== 83 | - [File formats used in bioinformatics](file_formats.md) 84 | - [The DataCamp "Introduction to Shell" interactive course](https://www.datacamp.com/courses/introduction-to-shell-for-data-science) 85 | 86 | ==> cl_solutions.md <== 87 | 5. Display the last 2 lines of all the files in the `bioinfo-notebook/docs/` directory, using the `tail` command. 88 | 6. From the `bioinfo-notebook/docs/` directory, list the files in the `bioinfo-notebook/envs/` directory. 89 | 90 | ==> combining_featCount_tables.md <== 91 | 92 | - [fastq-dump_to_featureCounts.sh](fastq-dump_to_featureCounts.md) 93 | 94 | ==> conda.md <== 95 | 2. Conda packages: 96 | 3. Conda environments: 97 | 98 | ==> fasterq-dump.md <== 99 | 100 | 1. [How to use fasterq-dump from the sra-tools wiki on GitHub](https://github.com/ncbi/sra-tools/wiki/HowTo:-fasterq-dump) 101 | 102 | ==> fastq-dump.md <== 103 | 104 | 1. Rob Edward's notes on `fastq-dump`: 105 | 106 | ==> fastq-dump_to_featureCounts.md <== 107 | 108 | 1. [fastq-dump_to_featureCounts.sh on GitHub](https://github.com/rnnh/bioinfo-notebook/blob/master/scripts/fastq-dump_to_featureCounts.sh) 109 | 110 | ==> featureCounts.md <== 111 | 1. The `subread` user guide: 112 | 2. The `featureCounts` paper: 113 | 114 | ==> file_formats.md <== 115 | - [GTF2.2: A Gene Annotation Format (Revised Ensembl GTF)](http://mblab.wustl.edu/GTF22.html) 116 | - [GFF3 Specification](https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md) 117 | 118 | ==> htseq-count.md <== 119 | 120 | 1. 
The `htseq-count` manual: 121 | 122 | ==> linux_setup.md <== 123 | - [Using Ubuntu through a Virtual Machine](ubuntu_virtualbox.md) 124 | - [Windows Subsystem for Linux](wsl.md) 125 | 126 | ==> part1.md <== 127 | 128 | These are general guides for installing Ubuntu, using the command line, and the types of files used in bioinformatics. 129 | 130 | ==> part2.md <== 131 | 132 | These are guides to individual programs. 133 | 134 | ==> part3.md <== 135 | 136 | These are scripts that use the programs discussed in this project. 137 | 138 | ==> samtools.md <== 139 | - [Alignment formats](file_formats.md#alignment-formats) 140 | - The `samtools` manual: 141 | 142 | ==> to_do.md <== 143 | - Add page on `trimmomatic` 144 | - Entry on BED/bigWig 145 | 146 | ==> ubuntu_virtualbox.md <== 147 | - [What is a Virtual Machine?](https://azure.microsoft.com/en-us/overview/what-is-a-virtual-machine/) 148 | - [How to Install Ubuntu on VirtualBox](https://www.wikihow.com/Install-Ubuntu-on-VirtualBox) 149 | 150 | ==> wsl.md <== 151 | - [Using Ubuntu through a Virtual Machine](ubuntu_virtualbox.md) 152 | - [conda](conda.md) 153 | ``` 154 | 155 | **6.** From the `bioinfo-notebook/docs/` directory, list the files in the `bioinfo-notebook/envs/` directory. 156 | 157 | ```bash 158 | ronan@dell:~/bioinfo-notebook/docs$ pwd 159 | /home/ronan/bioinfo-notebook/docs 160 | ronan@dell:~/bioinfo-notebook/docs$ ls ../envs/ 161 | bioinfo-notebook.txt bioinfo-notebook.yml 162 | ``` 163 | -------------------------------------------------------------------------------- /docs/combining_featCount_tables.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Combining featCount tables.py 4 | parent: 3. Scripts 5 | --- 6 | 7 | 8 | # Combining featCount tables.py 9 | 10 | This is a Python script that creates a single CSV feature count table from the featureCounts output tables in the target directory. 
11 | This combined feature count table can be used for differential expression analysis (e.g. using DESeq2 or edgeR in R). 12 | 13 | ## Demonstration 14 | 15 | This is a video demonstration of [combining_featCount_tables.py](../scripts/combining_featCount_tables.py). 16 | 17 | [![asciicast](https://asciinema.org/a/311771.svg)](https://asciinema.org/a/311771?autoplay=1) 18 | 19 | In this video, `combining_featCount_tables.py` is used to combine the following [featureCounts](featureCounts.md) tables: 20 | 21 | ``` 22 | feature_counts_SRR8933506_S_cere_GCF_000146045.2_R64_genomic.fna.tsv 23 | feature_counts_SRR8933509_S_cere_GCF_000146045.2_R64_genomic.fna.tsv 24 | feature_counts_SRR8933510_S_cere_GCF_000146045.2_R64_genomic.fna.tsv 25 | feature_counts_SRR8933511_S_cere_GCF_000146045.2_R64_genomic.fna.tsv 26 | feature_counts_SRR8933512_S_cere_GCF_000146045.2_R64_genomic.fna.tsv 27 | feature_counts_SRR8933530_S_cere_GCF_000146045.2_R64_genomic.fna.tsv 28 | feature_counts_SRR8933531_S_cere_GCF_000146045.2_R64_genomic.fna.tsv 29 | feature_counts_SRR8933532_S_cere_GCF_000146045.2_R64_genomic.fna.tsv 30 | feature_counts_SRR8933533_S_cere_GCF_000146045.2_R64_genomic.fna.tsv 31 | feature_counts_SRR8933534_S_cere_GCF_000146045.2_R64_genomic.fna.tsv 32 | feature_counts_SRR8933535_S_cere_GCF_000146045.2_R64_genomic.fna.tsv 33 | feature_counts_SRR8933536_S_cere_GCF_000146045.2_R64_genomic.fna.tsv 34 | feature_counts_SRR8933537_S_cere_GCF_000146045.2_R64_genomic.fna.tsv 35 | feature_counts_SRR8933538_S_cere_GCF_000146045.2_R64_genomic.fna.tsv 36 | feature_counts_SRR8933539_S_cere_GCF_000146045.2_R64_genomic.fna.tsv 37 | ``` 38 | 39 | These featureCounts results were generated using the following [fastq-dump_to_featureCounts.sh](fastq-dump_to_featureCounts.md) command: 40 | 41 | ```bash 42 | $ bash ../scripts/fastq-dump_to_featureCounts.sh -a S_cere_GCF_000146045.2_R64_genomic.gtf -f S_cere_GCF_000146045.2_R64_genomic.fna --verbose -p 3 SRR8933506 SRR8933509 SRR8933510 SRR8933511
SRR8933512 SRR8933530 SRR8933531 SRR8933532 SRR8933533 SRR8933534 SRR8933535 SRR8933536 SRR8933537 SRR8933538 SRR8933539 43 | ``` 44 | 45 | In this command, the full genome sequence (`S_cere_GCF_000146045.2_R64_genomic.fna`) and genome annotation (`S_cere_GCF_000146045.2_R64_genomic.gtf`) for [*Saccharomyces cerevisiae* S288C](https://www.ncbi.nlm.nih.gov/assembly/GCF_000146045.2) are used. 46 | 47 | These featureCounts results were then combined using the following command: 48 | 49 | ```bash 50 | $ python ../scripts/combining_featCount_tables.py 51 | ``` 52 | 53 | Running this script combines all the featureCounts results in a directory into a single CSV file. 54 | If a custom name for this file is not given, it will be given a name using this scheme: `featCounts_{species}_{date}.csv`. 55 | 56 | ## Usage 57 | 58 | ``` 59 | usage: combining_featCount_tables.py [-h] [-d PATH] [-o CUSTOM_FILENAME] 60 | 61 | Combines the featureCounts output tables in the target directory. 62 | 63 | optional arguments: 64 | -h, --help show this help message and exit 65 | -d PATH, --directory PATH 66 | path to target directory. Default: current directory 67 | -o CUSTOM_FILENAME, --output CUSTOM_FILENAME 68 | output filename. Default: 69 | featCounts_{species}_{date}.csv 70 | ``` 71 | 72 | ## See also 73 | - [combining_featCount_tables.py on GitHub](https://github.com/rnnh/bioinfo-notebook/blob/master/scripts/combining_featCount_tables.py) 74 | - [fastq-dump_to_featureCounts.sh](fastq-dump_to_featureCounts.md) 75 | -------------------------------------------------------------------------------- /docs/conda.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Conda 4 | parent: 2. Program guides 5 | --- 6 | 7 | # Conda 8 | 9 | From the website, `conda` provides ["Package, dependency and environment management for any language"](https://docs.conda.io/en/latest/). 
10 | 11 | Conda is a package manager that allows specific versions of programs to be installed, alongside their dependencies. 12 | Different sets of programs can be installed to different [virtual environments](https://www.anaconda.com/moving-conda-environments/). 13 | A virtual environment is basically a set of programs. 14 | 15 | ## Installing `conda` 16 | 17 | Conda is part of [Anaconda](https://www.anaconda.com/distribution/), which is available for free. 18 | Conda is also available through [Miniconda](https://docs.conda.io/en/latest/miniconda.html), a free minimal installer for conda. 19 | 20 | Conda can be installed on a 64-bit Linux system with the following commands... 21 | 22 | ```bash 23 | # Downloading miniconda 24 | $ wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh 25 | # Installing miniconda 26 | $ bash miniconda.sh -b -p $HOME/miniconda 27 | # Updating conda 28 | $ conda update -q conda 29 | ``` 30 | 31 | ## Cloning and activating a `conda` environment 32 | 33 | Conda virtual environments can be shared, either as a `.yml` file or a `.txt` file. 34 | A `.yml` copy of a conda environment can be used to recreate that environment on another machine, regardless of the operating system platform used. 35 | A `.txt` copy of a conda environment is more explicit: it can be used to create an identical copy of a conda environment using the same operating system platform as the original machine. 36 | A conda virtual environment is used throughout this project: a [`.yml` copy](../envs/bioinfo-notebook.yml) and an [explicit `.txt` copy](../envs/bioinfo-notebook.txt) of this conda environment are provided. 37 | 38 | A conda environment can be activated using `$ conda activate name_of_environment`. 39 | Once activated, the programs installed in this environment are available. 40 | Conda can be deactivated using `$ conda deactivate`.
41 | 42 | The `conda` environment used throughout this project can be created from [bioinfo-notebook.txt](../envs/bioinfo-notebook.txt) and activated using the following commands... 43 | 44 | ```bash 45 | # Creating the bioinfo-notebook environment 46 | /bioinfo-notebook $ conda create --name bioinfo-notebook --file envs/bioinfo-notebook.txt 47 | # Activating the bioinfo-notebook environment 48 | $ conda activate bioinfo-notebook 49 | # Once activated, the environment name is at the start of the bash prompt 50 | (bioinfo-notebook) $ 51 | ``` 52 | 53 | ## Demonstration 54 | 55 | In this video demonstration, a conda virtual environment is created using [bioinfo-notebook.txt](../envs/bioinfo-notebook.txt). 56 | This virtual environment is then activated using `conda activate bioinfo-notebook`. 57 | Note that the name of the active conda environment is displayed in brackets at the start of the bash prompt: `(name of active environment) ... $`. 58 | 59 | [![asciicast](https://asciinema.org/a/305992.svg)](https://asciinema.org/a/305992?autoplay=1) 60 | 61 | ## Further reading 62 | 1. Downloading conda: 63 | 2. Conda packages: 64 | 3. Conda environments: 65 | -------------------------------------------------------------------------------- /docs/fasterq-dump.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Fasterq-dump 4 | parent: 2. Program guides 5 | --- 6 | 7 | # Fasterq-dump 8 | 9 | `fasterq-dump` is a tool for downloading sequencing reads from [NCBI's Sequence Read Archive (SRA)](https://www.ncbi.nlm.nih.gov/sra). 10 | These sequence reads will be downloaded as [FASTQ files](file_formats.md#fastq). 11 | `fasterq-dump` is a newer, streamlined alternative to [fastq-dump](fastq-dump.md); both of these programs are a part of [sra-tools](https://anaconda.org/bioconda/sra-tools). 
12 | 13 | ## `fasterq-dump` vs `fastq-dump` 14 | 15 | Here are a few of the differences between `fastq-dump` and `fasterq-dump`: 16 | 17 | 1. In `fastq-dump`, the flag `--split-3` is required to separate paired reads into left and right ends. This is the default setting in `fasterq-dump`. 18 | 2. The `fastq-dump` flag `--skip-technical` is no longer required to skip technical reads in `fasterq-dump`. Instead, the flag `--include-technical` is required to include technical reads when using `fasterq-dump`. 19 | 3. There is no `--gzip` or `--bzip2` flag in `fasterq-dump` to download compressed reads with `fasterq-dump`. However, FASTQ files downloaded using `fasterq-dump` can still be subsequently compressed. 20 | 21 | The following commands are equivalent, but will be executed faster using `fasterq-dump`: 22 | 23 | ``` 24 | $ fastq-dump SRR_ID --split-3 --skip-technical 25 | $ fasterq-dump SRR_ID 26 | ``` 27 | 28 | ## Downloading reads from the SRA using `fasterq-dump` 29 | 30 | In this example, we want to download FASTQ reads for a mate-pair library. 31 | 32 | ``` 33 | fasterq-dump --threads n --progress SRR_ID 34 | ``` 35 | 36 | In this command... 37 | 38 | 1. **`--threads`** specifies the number (*`n`*) of processors/threads to be used. 39 | 2. **`--progress`** is an optional argument that displays a progress bar when the reads are being downloaded. 40 | 3. **`SRR_ID`** is the ID of the run from the SRA to be downloaded. This ID begins with "SRR" and is followed by around seven digits (e.g. `SRR1234567`). 41 | 42 | ## Demonstration 43 | 44 | In this video, `fasterq-dump` is used to download [*Saccharomyces cerevisiae* RNAseq reads](https://www.ncbi.nlm.nih.gov/sra/SRR11462797) from the SRA. 45 | 46 | [![asciicast](https://asciinema.org/a/316273.svg)](https://asciinema.org/a/316273?autoplay=1) 47 | 48 | ## See also 49 | 50 | - [fastq-dump](fastq-dump.md) 51 | 52 | ## References 53 | 54 | 1.
[How to use fasterq-dump from the sra-tools wiki on GitHub](https://github.com/ncbi/sra-tools/wiki/HowTo:-fasterq-dump) 55 | -------------------------------------------------------------------------------- /docs/fastq-dump.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Fastq-dump 4 | parent: 2. Program guides 5 | --- 6 | 7 | # Fastq-dump 8 | 9 | `fastq-dump` is a tool for downloading sequencing reads from [NCBI's Sequence Read Archive (SRA)](https://www.ncbi.nlm.nih.gov/sra). 10 | These sequence reads will be downloaded as FASTQ files. 11 | How these FASTQ files are formatted depends on the `fastq-dump` options used. 12 | 13 | ## Downloading reads from the SRA using `fastq-dump` 14 | 15 | In this example, we want to download FASTQ reads for a mate-pair library. 16 | 17 | ``` 18 | $ fastq-dump --gzip --skip-technical --readids --read-filter pass --dumpbase --split-3 --clip --outdir path/to/reads/ SRR_ID 19 | ``` 20 | 21 | In this command... 22 | 23 | 1. **`--gzip`**: Compress output using gzip. Gzip archived reads can be read directly by [bowtie2](bowtie2.md). 24 | 2. **`--skip-technical`**: Dump only biological reads, skip the technical reads. 25 | 3. **`--readids`** or **`-I`**: Append read ID after spot ID as 'accession.spot.readid'. With this flag, one sequence gets appended the ID `.1` and the other `.2`. Without this option, pair-ended reads will have identical IDs. 26 | 4. **`--read-filter pass`**: Only returns reads that pass filtering (without `N`s). 27 | 5. **`--dumpbase`** or **`-B`**: Formats sequence using base space (default for other than SOLiD). Included to avoid colourspace (in which pairs of bases are represented by numbers). 28 | 6. **`--split-3`** separates the reads into left and right ends. If there is a left end without a matching right end, or a right end without a matching left end, they will be put in a single file. 29 | 7. 
**`--clip`** or **`-W`**: Some of the sequences in the SRA contain tags that need to be removed. This will remove those sequences. 30 | 8. **`--outdir`** or **`-O`**: *(Optional)* Output directory, default is current working directory. 31 | 9. **`SRR_ID`**: This is the ID of the run from the SRA to be downloaded. This ID begins with "SRR" and is followed by around seven digits (e.g. `SRR1234567`). 32 | 33 | Other options that can be used instead of `--split-3`: 34 | 35 | 1. **`--split-files`** splits the FASTQ reads into two files: one file for mate 1s (`..._1`), and another for mate 2s (`..._2`). This option will not put mateless pairs into a third file. 36 | 2. **`--split-spot`** splits the FASTQ reads into two (mate 1s and mate 2s) within one file. `--split-spot` gives you an 8-line fastq format where forward precedes reverse (see ). 37 | 38 | ## Demonstration 39 | 40 | In this demo, `fastq-dump` is used to download compressed FASTQ reads. 41 | 42 | [![asciicast](https://asciinema.org/a/306937.svg)](https://asciinema.org/a/306937?autoplay=1) 43 | 44 | ## Further reading 45 | 46 | 1. Rob Edward's notes on `fastq-dump`: 47 | -------------------------------------------------------------------------------- /docs/fastq-dump_to_featureCounts.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Fastq-dump to featureCounts 4 | parent: 3. Scripts 5 | --- 6 | 7 | # Fastq-dump to featureCounts.sh 8 | 9 | [fastq-dump_to_featureCounts.sh](../scripts/fastq-dump_to_featureCounts.sh) is a `bash` script that... 10 | 11 | 1. Downloads FASTQ reads from NCBI's SRA using [fastq-dump](fastq-dump.md) 12 | 2. Indexes a reference genome and aligns reads to that index using [bowtie2](bowtie2.md) 13 | 3. Converts the alignment file created by bowtie2 to BAM format and sorts it using [samtools](samtools.md) 14 | 4.
Assigns the read alignments to genes in a genome annotation file using [featureCounts](featureCounts.md) 15 | 16 | ## Demonstration 17 | 18 | This is a video demonstration of [fastq-dump_to_featureCounts.sh](../scripts/fastq-dump_to_featureCounts.sh). 19 | 20 | During this demonstration, the full genome sequence and genome annotation for [*Saccharomyces cerevisiae* S288C](https://www.ncbi.nlm.nih.gov/assembly/GCF_000146045.2) are used. The files [example_nucleotide_sequence.fasta](../data/example_nucleotide_sequence.fasta) and [example_genome_annotation.gtf](../data/example_genome_annotation.gtf) are fragments of the nucleotide sequence and annotation for this genome. [RNA-Seq reads for *Saccharomyces cerevisiae* (SRR8933512)](https://www.ncbi.nlm.nih.gov/sra/SRR8933512) are used as the example FASTQ files in this demonstration. 21 | 22 | [![asciicast](https://asciinema.org/a/308745.svg)](https://asciinema.org/a/308745?autoplay=1) 23 | 24 | ## Usage 25 | 26 | ``` 27 | fastq-dump_to_featureCounts.sh [options] -a|--annotation -f|--fasta 28 | 29 | This script downloads FASTQ reads from NCBI's SRA, aligns them to an annotated 30 | genome using bowtie2, and generates gene count table(s) using featureCounts. 31 | It can take a single SRR ID as an input, or multiple SRR IDs separated by 32 | spaces. 33 | 34 | Required arguments: 35 | -a | --annotation input genome annotation file 36 | -f | --fasta input FASTA file for annotated genome 37 | SRR ID(s) Sequence Read Archive Run ID(s) (SRR...) 38 | 39 | Optional arguments: 40 | -h | --help show this help text and exit 41 | -p | --processors number (n) of processors to use (default: 1) 42 | --fastq-dump use 'fastq-dump' instead of the 'fasterq-dump' 43 | --verbose make output of script more verbose 44 | --removetemp remove read and alignment files once they are 45 | no longer needed (minimises disk space needed) 46 | --log redirect terminal output to log file 47 | ``` 48 | 49 | ## See also 50 | 51 | 1. 
[fastq-dump_to_featureCounts.sh on GitHub](https://github.com/rnnh/bioinfo-notebook/blob/master/scripts/fastq-dump_to_featureCounts.sh) 52 | -------------------------------------------------------------------------------- /docs/featureCounts.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: FeatureCounts 4 | parent: 2. Program guides 5 | --- 6 | 7 | 8 | # FeatureCounts 9 | 10 | `featureCounts` is a program that counts how many reads map to genomic features, such as genes, exons, promoters and genomic bins. 11 | 12 | ## Counting how many reads align to each gene in a genome annotation using `featureCounts` 13 | 14 | `featureCounts` can be used to count how many reads align to genes as follows: 15 | 16 | ``` 17 | $ featureCounts -p -O -T n -a example_genome_annotation.gtf -o example_featureCounts_output.txt sorted_example_alignment.bam 18 | ``` 19 | 20 | In this command... 21 | 22 | 1. **`-p`** specifies that fragments (or templates) will be counted instead of reads. This is only applicable for paired-end reads. 23 | 2. **`-O`** assigns reads to all their overlapping meta-features. 24 | 3. **`-T`** specifies the number (*`n`*) of threads to be used. 25 | 4. **`-a`** is the genome annotation file (`example_genome_annotation.gtf`). 26 | 5. **`-o`** specifies the name of the output file, which includes the read counts (`example_featureCounts_output.txt`). 27 | 6. **`sorted_example_alignment.bam`** is an alignment file: in this file, the reads we want to count are aligned to the same genome as the annotation file. 28 | 29 | ### Demonstration 30 | 31 | In this video, `featureCounts` is used to assign reads in an alignment file (`sorted_example_alignment.bam`) to genes in a genome annotation file (`example_genome_annotation.gtf`). 32 | 33 | [![asciicast](https://asciinema.org/a/306584.svg)](https://asciinema.org/a/306584?autoplay=1) 34 | 35 | ## More important options for `featureCounts` 36 | 37 | 1.
**`-s`** specifies strand-specific read counting. `0` for unstranded reads, `1` for stranded reads and `2` for reversely stranded reads. This depends on the library used in the sequencing protocol. 38 | 39 | ## Further reading 40 | 41 | 1. The `subread` user guide: 42 | 2. The `featureCounts` paper: 43 | -------------------------------------------------------------------------------- /docs/genome_annotation_SwissProt_CDS.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Genome annotation script 4 | parent: 3. Scripts 5 | --- 6 | 7 | # Genome annotation SwissProt CDS.sh 8 | 9 | [genome annotation SwissProt CDS.sh](../scripts/genome_annotation_SwissProt_CDS.sh) is a bash script that annotates the coding sequences (CDS) in a given genome assembly. 10 | It uses [BLAST](blast.md) and [MGKit](https://github.com/frubino/mgkit), which are included in the `bioinfo-notebook` [conda environment](conda.md). 11 | 12 | ## Usage 13 | 14 | ``` 15 | genome_annotation_SwissProt_CDS.sh [-h|--help] [-d|--demo] [-i|--input] 16 | [-l|--log -p|--processors n -e|--email] 17 | 18 | A script to annotate proteins in a genome assembly, using BLASTx with 19 | UniProtKB/Swiss-Prot. 20 | 21 | When run with the arugment '-d' or '--demo' this script... 22 | 23 | 1. Downloads a Saccharomyces cerevisiae S288C genome assembly, and 24 | the UniProtKB/Swiss-Prot amino acid sequences. 25 | 2. Creates a BLAST database from the downloaded Swiss-Prot sequences, 26 | and searches the S. cerevisiae genome against it using BLASTx with an 27 | E-value threshold of 1e-100. 28 | 3. Filters the BLASTx results, removing results with less than 90% 29 | identity. 30 | 4. Creates a genome annotation GFF file from these BLASTx results. 31 | 5. Adds information to the genome annotation from UniProt (protein 32 | names, KeGG ortholog information, EC numbers, etc.) 
33 | 34 | The end result ('S_cere.gff') is an annotation of the coding sequences (CDS) 35 | in the S. cerevisiae genome that are described in UniProtKB/Swiss-Prot. 36 | 37 | This script can also be run with the argument '-i' or '--input', which is used 38 | to specify a FASTA nucleotide file (.fasta or .fna) to annotate, instead of 39 | the demo sequence. The end result is also an annotation of the CDS in the input 40 | sequence based on UniProtKB/Swiss-Prot, called '.gff'. 41 | 42 | This script should be called from the 'bioinfo-notebook/' directory.The 43 | programs required for this script are in the 'bioinfo-notebook' conda 44 | environment (bioinfo-notebook/envs/bioinfo-notebook.yml or 45 | bioinfo-notebook/envs/bioinfo-notebook.txt). 46 | If the input file is not in the 'bioinfo-notebook/data/' directory, the full 47 | file path should be given. 48 | 49 | arguments: 50 | -h | --help show this help text and exit 51 | -i | --input name of input FASTA nucleotide file to annotate 52 | -d | --demo run the script with demonstration inputs 53 | 54 | optional arguments: 55 | -l | --log redirect terminal output to a log file 56 | -p | --processors set the number (n) of processors to use 57 | (default: 1) 58 | -e | --email contact email for UniProt queries 59 | ``` 60 | 61 | ## See also 62 | 63 | - [genome_annotation_SwissProt_CDS.sh on GitHub](https://github.com/rnnh/bioinfo-notebook/blob/master/scripts/genome_annotation_SwissProt_CDS.sh) 64 | - [BLAST](blast.md) 65 | - [MGKit](https://github.com/frubino/mgkit) 66 | - [Conda](conda.md) 67 | -------------------------------------------------------------------------------- /docs/htseq-count.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Htseq-count 4 | parent: 2. 
Program guides 5 | --- 6 | 7 | 8 | # Htseq-count 9 | 10 | Given a file with aligned sequencing reads and a list of genomic features, `htseq-count` can be used to count how many reads map to each feature. 11 | 12 | ## Aligning reads to a genome annotation using `htseq-count` 13 | 14 | `htseq-count` can be used to align reads to a genome annotation as follows: 15 | 16 | ``` 17 | $ htseq-count --format bam sorted_alignment_file.bam genome_annotation > output_file.txt 18 | ``` 19 | 20 | In this command... 21 | 22 | 1. **`--format`** or **`-f`** is the format of the input data. Possible values are `sam` (for text SAM files) and `bam` (for binary BAM files). Default is `sam`. A `bam` file is used in this example. 23 | 2. **`--order`** specifies whether the alignments have been sorted by name (`name`) or coordinates/position (`pos`). 24 | 3. **`sorted_alignment_file.bam`** is a `bam` format alignment file, sorted by name. 25 | 4. **`genome_annotation`** is the genome annotation file the reads in the `alignment_file` are aligned to (`.gtf` or `.gff`). 26 | 5. **`> output_file.txt`** redirects the output (`STDOUT`) to `output_file.txt`. 27 | 28 | ### Demonstration 29 | 30 | In this video, `htseq-count` is used to count how many reads in an alignment file (`sorted_example_alignment.bam`) match the genes in a genome annotation (`example_genome_annotation.gtf`). 31 | 32 | [![asciicast](https://asciinema.org/a/306597.svg)](https://asciinema.org/a/306597?autoplay=1) 33 | 34 | ## The `htseq-count` output file 35 | 36 | The program outputs a table with counts for each feature, followed by the special counters, which count reads that were not counted for any feature for various reasons. 37 | The names of the special counters all start with a double underscore, to facilitate filtering (**Note:** The double underscore was absent up to version 0.5.4). 38 | The special counters are: 39 | 40 | 1.
**`__no_feature`**: reads (or read pairs) which could not be assigned to any feature (set S as described above was empty). 41 | 2. **`__ambiguous`**: reads (or read pairs) which could have been assigned to more than one feature and hence were not counted for any of these, unless the --nonunique all option was used (set S had more than one element). 42 | 3. **`__too_low_aQual`**: reads (or read pairs) which were skipped due to the optional minimal alignment quality flag. 43 | 4. **`__not_aligned`**: reads (or read pairs) in the SAM/BAM file without an alignment. 44 | 5. **`__alignment_not_unique`**: reads (or read pairs) with more than one reported alignment. 45 | 46 | ## Further reading 47 | 48 | 1. The `htseq-count` manual: 49 | -------------------------------------------------------------------------------- /docs/linux_setup.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Linux setup script 4 | parent: 3. Scripts 5 | --- 6 | 7 | # Linux setup script 8 | 9 | [linux_setup.sh](../scripts/linux_setup.sh) is a `bash` shell script that... 10 | 11 | 1. Downloads and installs [Miniconda3](conda.md) 12 | 2. Installs the `bioinfo-notebook` [virtual environment using conda](conda.md#cloning-and-activating-a-conda-environment) 13 | 14 | This will use around 2.3 GB of hard disk space in total. 15 | 16 | If you are using a Linux system that does not have Anaconda/Miniconda installed, this script will set up everything you need to follow the guides on this website. 17 | If you are using a freshly installed [Ubuntu virtual machine](ubuntu_virtualbox.md) or [Ubuntu through Windows Subsystem for Linux](wsl.md), this script is the ideal way to set up your new system. 18 | 19 | ## Demonstration 20 | 21 | This is a video demonstration of [linux_setup.sh](../scripts/linux_setup.sh). 
22 | 23 | In this demonstration, the [bioinfo-notebook GitHub repository](https://github.com/rnnh/bioinfo-notebook) (or "repo") is cloned into the home directory of the Linux system (Ubuntu). 24 | This means that all the files for this project will be downloaded from GitHub into the `~/bioinfo-notebook/` directory. 25 | A GitHub repo can be cloned using the command `$ git clone` followed by the URL of the target repo (which can be found on GitHub using the "Clone or download" button). 26 | The Linux setup script is then run from this cloned GitHub repo. 27 | 28 | [![asciicast](https://asciinema.org/a/314853.svg)](https://asciinema.org/a/314853?autoplay=1) 29 | 30 | ## Usage 31 | 32 | ``` 33 | This script downloads and installs Miniconda3, and uses conda to install 34 | the 'bioinfo-notebook' virtual environment. 35 | 36 | Before running this script... 37 | 38 | 1. Please run the following command: 39 | $ sudo apt-get update 40 | This will ensure that the software installed will be up-to-date. 41 | 42 | 2. Please ensure that the 'bioinfo-notebook/' directory is in your 43 | home directory (~). The path to this directory should look like this: 44 | $HOME/bioinfo-notebook 45 | 46 | The 'bash' command is used to run this script: 47 | $ bash $0 48 | 49 | Optional arguments: 50 | -h | --help show this help text and exit 51 | ``` 52 | 53 | ## See also 54 | 55 | - [linux_setup.sh on GitHub](https://github.com/rnnh/bioinfo-notebook/blob/master/scripts/linux_setup.sh) 56 | - [Conda](conda.md) 57 | - [Cloning and activating a conda environment](conda.md#cloning-and-activating-a-conda-environment) 58 | - [Using Ubuntu through a Virtual Machine](ubuntu_virtualbox.md) 59 | - [Windows Subsystem for Linux](wsl.md) 60 | -------------------------------------------------------------------------------- /docs/orthofinder.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: OrthoFinder 4 | parent: 2. 
Program guides 5 | --- 6 | 7 | # OrthoFinder 8 | 9 | OrthoFinder is [a program for phylogenetic orthology inference](https://davidemms.github.io/). 10 | It can be installed using the [orthofinder.yml](../envs/orthofinder.yml) virtual environment using [conda](conda.md). 11 | 12 | ## Running `OrthoFinder` to find orthologs between sets of FASTA amino acid sequences 13 | 14 | `OrthoFinder` can be used to find orthologs between sets of FASTA amino acid files as follows: 15 | 16 | ```bash 17 | $ orthofinder -t n -S diamond -f path/to/fasta/files/ 18 | ``` 19 | 20 | In this command... 21 | 22 | 1. **`-t`** sets the number of threads/processors to use (*n*). 23 | 2. **`-S`** is used to select the search tool OrthoFinder uses. Setting it to [`diamond` is far faster than the default BLAST method](https://github.com/davidemms/OrthoFinder/releases/tag/v2.2.7). 24 | 3. **`-f`** is used to select the directory of [FASTA amino acid sequences](file_formats.md#fasta) files you want to compare. 25 | 26 | OrthoFinder will create a `Results` directory (ending with the current month and day, e.g. `Results_Sep16/`) in the target directory specified with **`-f`**. 27 | This directory will contain summary statistics of orthologs found between the FASTA files, as well as putative gene duplication events, and phylogenetic trees of the detected orthogroups. 28 | 29 | ## See also 30 | 31 | - [conda](conda.md) 32 | - [File formats used in bioinformatics](file_formats.md) 33 | 34 | ## Further reading 35 | 36 | - [OrthoFinder tutorials](https://davidemms.github.io/menu/tutorials.html) 37 | -------------------------------------------------------------------------------- /docs/part1.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: 1. General guides 4 | nav_order: 2 5 | description: "These are general guides for installing Ubuntu, using the command line, and the types of files used in bioinformatics." 
6 | has_children: true 7 | has_toc: True 8 | --- 9 | 10 | # 1. General guides 11 | 12 | These are general guides for installing Ubuntu, using the command line, and the types of files used in bioinformatics. 13 | -------------------------------------------------------------------------------- /docs/part2.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: 2. Program guides 4 | nav_order: 3 5 | description: "These are guides to individual programs." 6 | has_children: true 7 | has_toc: True 8 | --- 9 | 10 | # 2. Program guides 11 | 12 | These are brief guides to individual programs. 13 | They are not comprehensive, but instead aim to introduce the essential features of each program. 14 | -------------------------------------------------------------------------------- /docs/part3.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: 3. Scripts 4 | nav_order: 4 5 | description: "These are scripts that use the programs and file formats discussed in this project." 6 | has_children: true 7 | has_toc: True 8 | --- 9 | 10 | # 3. Scripts 11 | 12 | These are scripts that use the programs and file formats discussed in this project. 13 | -------------------------------------------------------------------------------- /docs/report_an_issue.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Report an Issue 4 | nav_order: 4 5 | description: "Report an Issue" 6 | --- 7 | 8 | # Report an Issue 9 | 10 | [If there are any errors or mistakes, please let me know.](https://github.com/rnnh/bioinfo-notebook/issues) 11 | -------------------------------------------------------------------------------- /docs/samtools.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: SAMtools 4 | parent: 2. 
Program guides 5 | --- 6 | 7 | # SAMtools 8 | 9 | SAMtools is a set of utilities that can manipulate alignment formats. 10 | It imports from and exports to the [SAM](file_formats.md#sam), [BAM](file_formats.md#bam) & [CRAM](file_formats.md#cram); does sorting, merging & indexing; and allows reads in any region to be retrieved swiftly. 11 | 12 | ## Converting a `sam` alignment file to a sorted, indexed `bam` file using `samtools` 13 | 14 | Sequence Alignment Map (SAM/`.sam`) is a text-based file format for sequence alignments. 15 | Its binary equivalent is Binary Alignment Map (BAM/`.bam`), which stores the same data as a compressed binary file. 16 | A binary file for a sequence alignment is preferable over a text file, as binary files are faster to work with. 17 | A SAM alignment file (`example_alignment.sam`) can be converted to a BAM alignment using `samtools view`. 18 | 19 | ``` 20 | $ samtools view -@ n -Sb -o example_alignment.bam example_alignment.sam 21 | ``` 22 | 23 | In this command... 24 | 25 | 1. **`-@`** sets the number (*`n`*) of threads/CPUs to be used. This flag is optional and can be used with other `samtools` commands. 26 | 2. **`-Sb`** specifies that the input is in SAM format (`S`) and the output will be in BAM format (`b`). 27 | 3. **`-o`** sets the name of the output file (`example_alignment.bam`). 28 | 4. **`example_alignment.sam`** is the name of the input file. 29 | 30 | Now that the example alignment is in BAM format, we can sort it using `samtools sort`. 31 | Sorting this alignment will allow us to create an index. 32 | 33 | ``` 34 | $ samtools sort -O bam -o sorted_example_alignment.bam example_alignment.bam 35 | ``` 36 | 37 | In this command... 38 | 39 | 1. **`-O`** specifies the output format (`bam`, `sam`, or `cram`). 40 | 2. **`-o`** sets the name of the output file (`sorted_example_alignment.bam`). 41 | 3. **`example_alignment.bam`** is the name of the input file.
42 | 43 | This sorted BAM alignment file can now be indexed using `samtools index`. 44 | Indexing allows fast random access to this alignment, enabling the information in the alignment file to be processed faster. 45 | 46 | ``` 47 | $ samtools index sorted_example_alignment.bam 48 | ``` 49 | 50 | In this command... 51 | 52 | 1. **`sorted_example_alignment.bam`** is the name of the input file. 53 | 54 | ### Demonstration 1 55 | 56 | In this video, `samtools` is used to convert `example_alignment.sam` into a BAM file, sort that BAM file, and index it. 57 | 58 | [![asciicast](https://asciinema.org/a/U1Flwg3EljOfI1Sx77h8PvuNf.svg)](https://asciinema.org/a/U1Flwg3EljOfI1Sx77h8PvuNf?autoplay=1) 59 | 60 | ## Simulating short reads using `wgsim` 61 | 62 | `wgsim` is a SAMtools program that can simulate short sequencing reads from a reference genome. 63 | This is useful for creating FASTQ files to practice with. 64 | 65 | ``` 66 | $ wgsim example_nucleotide_sequence.fasta example_reads_1.fastq example_reads_2.fastq 67 | ``` 68 | 69 | In this command... 70 | 71 | 1. **`example_nucleotide_sequence.fasta`** is the reference genome input. 72 | 2. **`example_reads_1.fastq`** and **`example_reads_2.fastq`** are the names of the simulated read output files. 73 | 74 | ### Demonstration 2 75 | 76 | In this video, `wgsim` is used to simulate reads from `example_nucleotide_sequence.fasta`. 77 | 78 | [![asciicast](https://asciinema.org/a/m89gXtx4cKRnKpI6amWj3BEAH.svg)](https://asciinema.org/a/m89gXtx4cKRnKpI6amWj3BEAH?autoplay=1) 79 | 80 | ## Indexing a FASTA file using `samtools faidx` 81 | 82 | SAMtools can be used to index a FASTA file as follows... 83 | 84 | ```bash 85 | $ samtools faidx file.fasta 86 | ``` 87 | 88 | After running this command, `file.fasta` can now be used by [bcftools](bcftools.md). 
89 | 90 | ## See also 91 | 92 | - [Alignment formats](file_formats.md#alignment-formats) 93 | - The `samtools` manual: 94 | -------------------------------------------------------------------------------- /docs/sgRNAcas9.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: sgRNAcas9 4 | parent: 2. Program guides 5 | --- 6 | 7 | # sgRNAcas9 8 | 9 | sgRNAcas9 is [a package for designing CRISPR sgRNA and evaluating potential off-target cleavage sites](https://doi.org/10.1371/journal.pone.0100448). 10 | 11 | ## Running sgRNAcas9 12 | 13 | 1. Install the [conda](conda.md) virtual environment for [sgRNAcas9](../envs/sgRNAcas9.yml). 14 | 2. Download [the GUI version of sgRNAcas9 from SourceForge](https://sourceforge.net/projects/sgrnacas9/). 15 | 3. Activate the sgRNAcas9 virtual environment. 16 | 4. In the directory for sgRNAcas9, run the following command to launch the sgRNAcas9 graphical user interface (GUI): 17 | 18 | ```bash 19 | (sgRNAcas9) ~/sgRNAcas9_V3.0_GUI$ java -jar sgRNAcas9.jar 20 | ``` 21 | 22 | ## Using sgRNAcas9 23 | 24 | In the sgRNAcas9 GUI... 25 | 26 | - Select the [FASTA nucleic acid](file_formats.md#fasta) file of the target sequences in the "Target sequences(FASTA):" dialog box. 27 | - Select the [FASTA nucleic acid](file_formats.md#fasta) file of the genome you want to design the guide RNAs for in the "Genome sequence(FASTA):" dialog box. 28 | - Click "RUN" to run the program. 29 | 30 | sgRNAcas9 will create a `report` directory in the current working directory. 31 | This directory contains its results. 32 | The most important file in this directory is `sgRNAcas9_report.xls`. 33 | This Excel file contains reported guide RNA sequences for CRISPR with quality scores, and counts of potential off-target sites. 
34 | 35 | ## References 36 | 37 | - [sgRNAcas9 paper](https://doi.org/10.1371/journal.pone.0100448) 38 | - [sgRNAcas9 website](http://biootools.com/software.html) 39 | - [sgRNAcas9 on SourceForge](https://sourceforge.net/projects/sgrnacas9/) 40 | -------------------------------------------------------------------------------- /docs/snp_calling.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: SNP calling script 4 | parent: 3. Scripts 5 | --- 6 | 7 | # SNP calling script 8 | 9 | [snp_calling.sh](../scripts/snp_calling.sh) is a `bash` shell script that downloads [FASTQ](file_formats.md) sequencing reads using [fastq-dump](fastq-dump.md), aligns them to a genome using [bowtie2](bowtie2.md), and writes variants (SNPs and indels) to a variant call format (VCF) file. 10 | 11 | ## Usage 12 | 13 | ``` 14 | snp_calling.sh [-h|--help] [-1|--one -2|--two -r|--reference] 15 | [-d|--demo] [-o|--output -l|--log -p|--processors n] 16 | 17 | This script aligns sequencing reads to a reference genome, and finds genetic 18 | variants (SNPs/indels) based on this alignment, which are written to a variant 19 | call format (VCF) file. 20 | 21 | Calling this script with the argument '-d' or '--demo' will run this script 22 | using Saccharomyces cerevisiae FASTQ sequencing reads and a Saccharomyces 23 | cerevisiae reference genome, which will be downloaded from NCBI. 24 | 25 | This script should be called from the 'bioinfo-notebook/' directory.The 26 | programs required for this script are in the 'bioinfo-notebook' conda 27 | environment (bioinfo-notebook/envs/bioinfo-notebook.yml or 28 | bioinfo-notebook/envs/bioinfo-notebook.txt). 29 | If the input files are not in the 'bioinfo-notebook/data/' directory, the full 
31 | 32 | 33 | arguments: 34 | -h | --help show this help text and exit 35 | -1 | --one forward reads to align with reference sequence 36 | (FASTQ: .fastq or .fastq.gz) 37 | -2 | --two reverse reads to align with reference sequence 38 | (FASTQ: .fastq or .fastq.gz) 39 | -r | --reference reference sequence to align reads against 40 | (FASTA nucleotide file: .fna) 41 | -d | --demo run the script with demonstration inputs 42 | 43 | optional arguments: 44 | -o | --output optional: name of final output file 45 | (default: 'reference_seq_vs_reads_var.vcf', or 46 | 'S_cere_DRR237290_var.vcf' if demo is used). 47 | -l | --log redirect terminal output to a log file in the 48 | directory bioinfo-notebook/results/ 49 | -p | --processors optional: set the number (n) of processors to 50 | use (default: 1) 51 | ``` 52 | 53 | ## See also 54 | 55 | - [snp_calling.sh on GitHub](https://github.com/rnnh/bioinfo-notebook/blob/master/scripts/snp_calling.sh) 56 | - [File formats used in bioinformatics](file_formats.md) 57 | - [samtools](samtools.md) 58 | - [fastq-dump](fastq-dump.md) 59 | - [bowtie2](bowtie2.md) 60 | -------------------------------------------------------------------------------- /docs/ubuntu_virtualbox.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Using Ubuntu through a Virtual Machine 4 | parent: 1. General guides 5 | nav_order: 3 6 | --- 7 | 8 | 9 | # Using Ubuntu through a Virtual Machine 10 | 11 | *Ubuntu* is a Linux operating system that is widely used for bioinformatics. 12 | If you have not used a Linux system before, an Ubuntu virtual machine is an ideal way to try the programs documented on this website. 13 | 14 | A *virtual machine* is a computer file, typically called an image, that behaves like an actual computer. 15 | It acts as a computer within a computer. 16 | Virtual machines run in a window, much like any other program running in a window on your computer. 
17 | The virtual machine is sequestered from the rest of the system, meaning that the software inside a virtual machine can not tamper with the computer itself. 18 | This produces an ideal environment for testing other operating systems, and running software or applications on operating systems they were not originally intended for. 19 | 20 | An Ubuntu virtual machine can be created using *VirtualBox*, and an Ubuntu *disk image*. 21 | VirtualBox is a program that can be used to create, manage, and access virtual machines. 22 | A disk image is a file that acts like a compact disc, or another storage device. 23 | VirtualBox and the Ubuntu disk image are freely available online. 24 | 25 | ## Contents 26 | 27 | - [Files required to set up an Ubuntu virtual machine](#files-required-to-set-up-an-ubuntu-virtual-machine) 28 | - [Direct links to download required files](#direct-links-to-download-required-files) 29 | - [How to create an Ubuntu virtual machine using VirtualBox](#how-to-create-an-ubuntu-virtual-machine-using-virtualbox) 30 | - [Increasing the screen resolution of the Ubuntu virtual machine](#increasing-the-screen-resolution-of-the-ubuntu-virtual-machine) 31 | - [See also](#see-also) 32 | - [References](#references) 33 | 34 | ## Files required to set up an Ubuntu virtual machine 35 | 36 | To set up an Ubuntu virtual machine, you will need an Ubuntu disk image, a file to install VirtualBox, and the VirtualBox Extension Package. 37 | This requires around 13 GB of free hard drive space on your computer in total. 38 | The Ubuntu disk image is around 2 GB in size, and may take a while to download depending on your internet connection. 39 | The file required to install VirtualBox is around 108 or 123 MB in size, depending on the platform of your computer (i.e. Windows or Mac). 40 | 41 | ### Direct links to download required files 42 | 43 | 1. 
[The Ubuntu disk image (filename: `ubuntu-18.04.4-desktop-amd64.iso`)](http://releases.ubuntu.com/18.04.4/ubuntu-18.04.4-desktop-amd64.iso) 44 | 2. [VirtualBox installer for Windows](https://download.virtualbox.org/virtualbox/6.1.4/VirtualBox-6.1.4-136177-Win.exe) 45 | 3. [VirtualBox installer for Mac](https://download.virtualbox.org/virtualbox/6.1.4/VirtualBox-6.1.4-136177-OSX.dmg) 46 | 4. [VirtualBox Extension Pack (all platforms)](https://download.virtualbox.org/virtualbox/6.1.4/Oracle_VM_VirtualBox_Extension_Pack-6.1.4.vbox-extpack) 47 | 48 | If the above links do not work, they may have expired. 49 | In this case, the above files can be found on the [VirtualBox website](https://www.virtualbox.org/wiki/Downloads) and the [Ubuntu website](https://ubuntu.com/download/desktop). 50 | 51 | ## How to create an Ubuntu virtual machine using VirtualBox 52 | 53 | 1. Download the [VirtualBox installer](#direct-links-to-download-required-files) for your computer (either Windows or Mac). 54 | 2. Once the VirtualBox installer is downloaded, open it and follow the on-screen instructions to install the VirtualBox program. 55 | 3. **Windows only:** If you get a "Windows Security" prompt asking *"Would you like to install this device software?"* for driver software from *"Publisher: Oracle Corporation"*, select "Install". 56 | 4. **Mac only:** If you get a *"This package will run a program to determine if the software can be installed"* prompt while installing VirtualBox, select "Continue". You may also be asked to enter your user password while installing VirtualBox on a Mac. 57 | 5. Once installed, open the VirtualBox program. 58 | 6. In VirtualBox, click on "New" (the blue badge). This will open a menu to create a new virtual machine. 59 | 7. In the "Name" field of the "Name and operating system" window, type "ubuntu". VirtualBox will automatically set the type and version for this virtual machine as "Linux" and "Ubuntu". 60 | 8. 
Select "Next" to proceed to the "Memory size" section. 61 | 9. In this section, you can set the amount of Random Access Memory (RAM) that the virtual machine can use. A suggested amount of RAM will automatically be selected when you get to this page, but you can increase the amount of RAM allocated using the slider on this page. 62 | 10. **Note:** If you use the slider to increase the amount of RAM allocated on the "Memory Size" page, keep the slider in the green zone. Setting the slider in the orange or red zone (>50% of your computer's available RAM) will negatively affect the performance of the virtual machine. 63 | 11. Select "Next" to proceed to the "Hard disk" page. 64 | 12. Select "Create a virtual hard disk now", and then select "Create". 65 | 13. On the "Hard disk file type" page, select "VDI (VirtualBox Disk Image)", and then select "Next". 66 | 14. On the "Store on physical hard disk", select "Dynamically allocated", and then select "Next" to proceed to the "File location and size" page. 67 | 15. On this page, you can change the location and size of the virtual hard disk. There is no need to adjust the size of the virtual hard disk, but take note of its location (the folder/directory it will be created in). Select "Create". 68 | 16. In the left side of the VirtualBox main menu, double-click the name of the virtual machine you just created ("ubuntu"). 69 | 17. This will bring up the "Select start-up disk" window. In this window, select the folder icon to open the "Optical Disk Selector" menu. 70 | 18. In this menu, select "Add", which will open a window titled "Please choose a virtual optical disk file". 71 | 19. In this window, go to the folder into which the Ubuntu disk image downloaded (e.g. "Downloads"), and click the [Ubuntu disk image (filename: `ubuntu-18.04.4-desktop-amd64.iso`)](#direct-links-to-download-required-files) to select it, and then select "Open". 72 | 20. This will bring you back to the "Optical Disk Selector" window. 
Select the Ubuntu disk image you selected in the previous window, and click on "Choose". 73 | 21. This will bring you back to the "Select start-up disk" window. The Ubuntu disk image should be selected in the drop down menu (this read "Empty" before the Ubuntu disk image was added). Select "Start" to start the virtual machine. 74 | 22. The Ubuntu virtual machine is now running in its own window. It may take a few minutes to start up the first time. 75 | 23. On the "Welcome" screen in Ubuntu, select "Install Ubuntu". 76 | 24. In the "Keyboard layout" section, select your keyboard layout, and then select "Continue". This will bring you to the "Updates and other software" window. 77 | 25. In this window, in the section "What apps would you like to install to start with?", select "Minimal installation". 78 | 26. In the "Other options" section, select "Download updates while installing Ubuntu", and leave "Install third-party software..." unselected. Select "Continue" to proceed to the "Installation type" window. 79 | 27. In this window, select "Erase disk and install Ubuntu". As this is a virtual machine, in this instance "disk" refers to the virtual disk image (`.vdi`) file created earlier (see steps 12 to 15). Select "Install now". 80 | 28. A window titled "Write the changes to disks?" will appear. In this window, select "Continue". 81 | 29. This will bring you to the "Where are you?" window. In this window, enter your location (which is needed to set the system clock) and select "Continue". 82 | 30. Fill in the requested details in the "Who are you?" window: your name, your computer's name, your username (both of which will be filled in automatically when you enter your name), and your password. Make sure you remember your password, you will need it to install programs in your Ubuntu virtual machine. Select "Continue" to proceed. 83 | 31. At this point, Ubuntu will begin installing on the virtual disk image created earlier (the `.vdi` file). 
This will take a few minutes. 84 | 32. Once the installation is complete, select "Restart Now" from the "Installation complete" dialog window. 85 | 33. When asked "Please remove the installation media and press ENTER", press Enter (a.k.a. Return). 86 | 34. The virtual machine will then restart, and the Ubuntu login page will load. On this page, select the user you created during the installation, and enter your password to log in. 87 | 35. Once you have logged in, you have finished setting up your Ubuntu virtual machine. Click through the "What's new in Ubuntu" window for a brief introduction to Ubuntu. 88 | 36. When you want to close your Ubuntu virtual machine, close the window it is running in to bring up the "Close Virtual Machine" window, select "Power off the machine" and click "OK". This is the equivalent of shutting down the machine. Alternatively, you can select "Power off" within the Ubuntu virtual machine. 89 | 90 | Once you have finished installing the Ubuntu virtual machine, you can delete the Ubuntu disk image (filename: `ubuntu-18.04.4-desktop-amd64.iso`), and the VirtualBox installer. 91 | 92 | ## Increasing the screen resolution of the Ubuntu virtual machine 93 | 94 | At this point, the Ubuntu virtual machine takes up only a small portion of the VirtualBox window it runs in. 95 | To increase the screen resolution of the Ubuntu virtual machine, you will need to download the [VirtualBox Extension Package](#direct-links-to-download-required-files) and follow the steps below. 96 | 97 | 1. Once downloaded, double click the VirtualBox Extension Pack (file extension `.vbox-extpack`). If you have installed the VirtualBox program, it will open this file. 98 | 2. VirtualBox will open with a window notifying that an extension pack is about to be installed. In this window, select "Install" to proceed with the extension pack installation. 99 | 3. 
Scroll to the bottom of the Terms and Conditions window that opens, and select "I Agree" to install the extension pack. 100 | 4. Open the Ubuntu virtual machine in VirtualBox. 101 | 5. In the menu bar of the VirtualBox window in which Ubuntu is running, select the "Devices" menu, and select "Insert Guest Additions CD image...". 102 | 6. A notification will appear in the Ubuntu virtual machine: '"VBox_GAs_6.1.4" contains software intended to be automatically started. Would you like to run it?". In this window, select "Run", and enter your Ubuntu password to install the VirtualBox Guest Additions on the Ubuntu virtual machine. 103 | 7. A terminal window will open showing the VirtualBox Guest Additions installation progress. Once the installation has finished, press Return (Enter) to close this window. 104 | 8. Close the Ubuntu virtual machine by closing the window it is running in, and selecting "Power off the machine" from the "Close Virtual Machine" window. 105 | 9. Open the Ubuntu virtual machine in VirtualBox. 106 | 10. In the menu bar of the window in which Ubuntu is running, select the "View" menu, and confirm that "Auto-resize Guest Display" is enabled. 107 | 108 | ## See also 109 | 110 | - [Introduction to the command line](cl_intro.md) 111 | - [Windows Subsystem for Linux](wsl.md) 112 | - [The Ubuntu Website](https://ubuntu.com/) 113 | - [The VirtualBox Website](https://www.virtualbox.org/) 114 | 115 | ## References 116 | 117 | - [What is a Virtual Machine?](https://azure.microsoft.com/en-us/overview/what-is-a-virtual-machine/) 118 | - [How to Install Ubuntu on VirtualBox](https://www.wikihow.com/Install-Ubuntu-on-VirtualBox) 119 | -------------------------------------------------------------------------------- /docs/wsl.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Windows Subsystem for Linux 4 | parent: 1. 
General guides 5 | nav_order: 2 6 | --- 7 | 8 | # Windows Subsystem for Linux 9 | 10 | Windows Subsystem for Linux (WSL) is a feature of Windows 10. 11 | When enabled, WSL allows Linux systems (e.g. Ubuntu) to be used as Windows applications. 12 | These Linux systems can be downloaded directly from the Microsoft Store. 13 | The bioinfo-notebook [conda](conda.md) environment can be installed in an Ubuntu system running using WSL. 14 | 15 | ## Installing Ubuntu on Windows 10 using WSL 16 | 17 | Before you begin, make sure you have around 1.20 GB of free disk space. 18 | 19 | ### Enable WSL 20 | 21 | *Note:* Enabling the WSL feature will take a few minutes, and you will need to restart your computer for it to take effect. 22 | 23 | 1. In the search box on the taskbar, type "control panel", and then select Control Panel. 24 | 2. In the Control Panel, select "Programs". 25 | 3. Under Programs and Features, select "Turn Windows features on or off". 26 | 4. If asked "Do you want this app to make changes to your device?", select "Yes". 27 | 5. From the list of Windows features, tick the box next to "Windows Subsystem for Linux" to enable WSL, and click OK. 28 | 29 | ### Download Ubuntu from the Microsoft Store 30 | 31 | 1. In the search box on the taskbar, type "microsoft store", and select Microsoft Store. 32 | 2. In the Microsoft Store, search for "Ubuntu". 33 | 3. Select the Ubuntu app. 34 | 4. On the app page, select "Get" to download Ubuntu. 35 | 5. If asked to sign in with a Microsoft account, select "No, thanks". 36 | 37 | After enabling WSL and downloading Ubuntu from the Microsoft Store, Ubuntu can be used like a regular Windows application. 38 | 39 | ### Running Ubuntu for the first time 40 | 41 | 1. In the search box on the taskbar, type "Ubuntu", and select the Ubuntu app to launch it. It will take a few minutes to install the first time it runs. 42 | 2. When prompted, enter a UNIX username- this does not need to be the same as your Windows account name. 
43 | 3. You will need to set a UNIX password. This is only used for the Ubuntu app, it does not need to be the same as your Windows password. Make sure you remember your UNIX password, as you will need it for installing new programs in Ubuntu. 44 | 45 | Once your UNIX password has been updated successfully, you will see the `bash` command prompt in the Ubuntu window: 46 | 47 | ``` 48 | (Your UNIX username)@(Your computer's alias):~$ _ 49 | ``` 50 | 51 | In this command prompt, the tilde character (`~`) indicates that you are currently in your home directory. 52 | The dollar sign (`$`) indicates that this command line uses the `bash` shell language. 53 | 54 | ## See also 55 | 56 | - [Introduction to the command line](cl_intro.md) 57 | - [Using Ubuntu through a Virtual Machine](ubuntu_virtualbox.md) 58 | - [conda](conda.md) 59 | -------------------------------------------------------------------------------- /envs/augustus.yml: -------------------------------------------------------------------------------- 1 | name: augustus 2 | channels: 3 | - bioconda 4 | - conda-forge 5 | - cf-staging 6 | - defaults 7 | dependencies: 8 | - _libgcc_mutex=0.1=conda_forge 9 | - _openmp_mutex=4.5=0_gnu 10 | - augustus=3.3.3=pl526hce533f5_0 11 | - biopython=1.77=py38h1e0a361_0 12 | - boost=1.70.0=py38h9de70de_1 13 | - boost-cpp=1.70.0=h7b93d67_3 14 | - bzip2=1.0.8=h516909a_2 15 | - ca-certificates=2020.6.20=hecda079_0 16 | - certifi=2020.6.20=py38h32f6830_0 17 | - curl=7.71.1=he644dc0_0 18 | - gsl=2.5=h294904e_1 19 | - htslib=1.9=h4da6232_3 20 | - icu=67.1=he1b5a44_0 21 | - krb5=1.17.1=hfafb76e_1 22 | - ld_impl_linux-64=2.34=h53a641e_5 23 | - libblas=3.8.0=11_openblas 24 | - libcblas=3.8.0=11_openblas 25 | - libcurl=7.71.1=hcdd3856_0 26 | - libdeflate=1.2=h516909a_1 27 | - libedit=3.1.20191231=h46ee950_1 28 | - libffi=3.2.1=he1b5a44_1007 29 | - libgcc-ng=9.2.0=h24d8f2e_2 30 | - libgfortran-ng=7.5.0=hdf63c60_9 31 | - libgomp=9.2.0=h24d8f2e_2 32 | - liblapack=3.8.0=11_openblas 33 | - 
libopenblas=0.3.6=h6e990d7_6 34 | - libssh2=1.9.0=hab1572f_3 35 | - libstdcxx-ng=9.2.0=hdf63c60_2 36 | - lp_solve=5.5.2.5=h14c3975_1001 37 | - lz4-c=1.9.2=he1b5a44_1 38 | - metis=5.1.0=he1b5a44_1005 39 | - ncurses=6.1=hf484d3e_1002 40 | - numpy=1.18.5=py38h8854b6b_0 41 | - openblas=0.3.6=h6e990d7_6 42 | - openssl=1.1.1g=h516909a_0 43 | - perl=5.26.2=h516909a_1006 44 | - perl-apache-test=1.40=pl526_1 45 | - perl-app-cpanminus=1.7044=pl526_1 46 | - perl-base=2.23=pl526_1 47 | - perl-carp=1.38=pl526_3 48 | - perl-class-load=0.25=pl526_0 49 | - perl-class-load-xs=0.10=pl526h6bb024c_2 50 | - perl-class-method-modifiers=2.12=pl526_0 51 | - perl-constant=1.33=pl526_1 52 | - perl-cpan-meta=2.150010=pl526_0 53 | - perl-cpan-meta-requirements=2.140=pl526_0 54 | - perl-cpan-meta-yaml=0.018=pl526_0 55 | - perl-data-dumper=2.173=pl526_0 56 | - perl-data-optlist=0.110=pl526_2 57 | - perl-dbi=1.642=pl526_0 58 | - perl-devel-globaldestruction=0.14=pl526_0 59 | - perl-devel-overloadinfo=0.005=pl526_0 60 | - perl-devel-stacktrace=2.04=pl526_0 61 | - perl-dist-checkconflicts=0.11=pl526_2 62 | - perl-encode=2.88=pl526_1 63 | - perl-eval-closure=0.14=pl526h6bb024c_4 64 | - perl-exporter=5.72=pl526_1 65 | - perl-extutils-cbuilder=0.280230=pl526_1 66 | - perl-extutils-makemaker=7.36=pl526_1 67 | - perl-extutils-manifest=1.72=pl526_0 68 | - perl-extutils-parsexs=3.35=pl526_0 69 | - perl-file-path=2.16=pl526_0 70 | - perl-file-temp=0.2304=pl526_2 71 | - perl-file-which=1.23=pl526_0 72 | - perl-getopt-long=2.50=pl526_1 73 | - perl-ipc-cmd=1.02=pl526_0 74 | - perl-json-pp=4.04=pl526_0 75 | - perl-locale-maketext-simple=0.21=pl526_2 76 | - perl-module-build=0.4224=pl526_3 77 | - perl-module-corelist=5.20190524=pl526_0 78 | - perl-module-implementation=0.09=pl526_2 79 | - perl-module-load=0.32=pl526_1 80 | - perl-module-load-conditional=0.68=pl526_2 81 | - perl-module-metadata=1.000036=pl526_0 82 | - perl-module-runtime=0.016=pl526_1 83 | - perl-module-runtime-conflicts=0.003=pl526_0 84 | - 
perl-moo=2.003004=pl526_0 85 | - perl-moose=2.2011=pl526hf484d3e_1 86 | - perl-mro-compat=0.13=pl526_0 87 | - perl-package-deprecationmanager=0.17=pl526_0 88 | - perl-package-stash=0.38=pl526hf484d3e_1 89 | - perl-package-stash-xs=0.28=pl526hf484d3e_1 90 | - perl-parallel-forkmanager=2.02=pl526_0 91 | - perl-params-check=0.38=pl526_1 92 | - perl-params-util=1.07=pl526h6bb024c_4 93 | - perl-parent=0.236=pl526_1 94 | - perl-pathtools=3.75=pl526h14c3975_1 95 | - perl-perl-ostype=1.010=pl526_1 96 | - perl-role-tiny=2.000008=pl526_0 97 | - perl-scalar-list-utils=1.52=pl526h516909a_0 98 | - perl-storable=3.15=pl526h14c3975_0 99 | - perl-sub-exporter=0.987=pl526_2 100 | - perl-sub-exporter-progressive=0.001013=pl526_0 101 | - perl-sub-identify=0.14=pl526h14c3975_0 102 | - perl-sub-install=0.928=pl526_2 103 | - perl-sub-name=0.21=pl526_1 104 | - perl-sub-quote=2.006003=pl526_1 105 | - perl-text-abbrev=1.02=pl526_0 106 | - perl-text-parsewords=3.30=pl526_0 107 | - perl-try-tiny=0.30=pl526_1 108 | - perl-version=0.9924=pl526_0 109 | - perl-xsloader=0.24=pl526_0 110 | - perl-yaml=1.29=pl526_0 111 | - pip=20.1.1=py_1 112 | - python=3.8.3=cpython_he5300dc_0 113 | - python_abi=3.8=1_cp38 114 | - readline=8.0=h46ee950_1 115 | - setuptools=49.1.0=py38h32f6830_0 116 | - sqlite=3.32.3=hcee41ef_1 117 | - suitesparse=4.5.6=h717dc36_1204 118 | - tbb=2020.1=hc9558a2_0 119 | - tk=8.6.10=hed695b0_0 120 | - wheel=0.34.2=py_1 121 | - xz=5.2.5=h516909a_1 122 | - zlib=1.2.11=h516909a_1006 123 | - zstd=1.4.4=h6597ccf_3 124 | -------------------------------------------------------------------------------- /envs/bioinfo-notebook.txt: -------------------------------------------------------------------------------- 1 | # This file may be used to create an environment using: 2 | # $ conda create --name --file 3 | # platform: linux-64 4 | @EXPLICIT 5 | https://repo.anaconda.com/pkgs/main/linux-64/_libgcc_mutex-0.1-main.conda 6 | https://repo.anaconda.com/pkgs/main/linux-64/blas-1.0-mkl.conda 7 | 
https://repo.anaconda.com/pkgs/main/linux-64/ca-certificates-2020.10.14-0.conda 8 | https://repo.anaconda.com/pkgs/main/linux-64/intel-openmp-2020.1-217.conda 9 | https://repo.anaconda.com/pkgs/main/linux-64/jpeg-9b-h024ee3a_2.conda 10 | https://repo.anaconda.com/pkgs/main/linux-64/ld_impl_linux-64-2.33.1-h53a641e_7.conda 11 | https://repo.anaconda.com/pkgs/main/linux-64/libgcc-ng-9.1.0-hdf63c60_0.conda 12 | https://repo.anaconda.com/pkgs/main/linux-64/libgfortran-ng-7.3.0-hdf63c60_0.conda 13 | https://repo.anaconda.com/pkgs/main/linux-64/libstdcxx-ng-9.1.0-hdf63c60_0.conda 14 | https://repo.anaconda.com/pkgs/main/linux-64/bzip2-1.0.8-h7b6447c_0.conda 15 | https://repo.anaconda.com/pkgs/main/linux-64/expat-2.2.9-he6710b0_2.conda 16 | https://repo.anaconda.com/pkgs/main/linux-64/icu-58.2-he6710b0_3.conda 17 | https://conda.anaconda.org/bioconda/linux-64/libdeflate-1.0-h14c3975_1.tar.bz2 18 | https://repo.anaconda.com/pkgs/main/linux-64/libffi-3.3-he6710b0_2.conda 19 | https://repo.anaconda.com/pkgs/main/linux-64/libiconv-1.15-h63c8f33_5.conda 20 | https://repo.anaconda.com/pkgs/main/linux-64/libuuid-1.0.3-h1bed415_2.conda 21 | https://repo.anaconda.com/pkgs/main/linux-64/libxcb-1.14-h7b6447c_0.conda 22 | https://repo.anaconda.com/pkgs/main/linux-64/lz4-c-1.9.2-he6710b0_1.conda 23 | https://repo.anaconda.com/pkgs/main/linux-64/lzo-2.10-h7b6447c_2.conda 24 | https://repo.anaconda.com/pkgs/main/linux-64/mkl-2020.1-217.conda 25 | https://repo.anaconda.com/pkgs/main/linux-64/ncurses-6.2-he6710b0_1.conda 26 | https://repo.anaconda.com/pkgs/main/linux-64/openssl-1.1.1h-h7b6447c_0.conda 27 | https://repo.anaconda.com/pkgs/main/linux-64/pcre-8.44-he6710b0_0.conda 28 | https://repo.anaconda.com/pkgs/main/linux-64/perl-5.26.2-h14c3975_0.conda 29 | https://repo.anaconda.com/pkgs/main/linux-64/snappy-1.1.8-he6710b0_0.conda 30 | https://repo.anaconda.com/pkgs/main/linux-64/tbb-2020.0-hfd86e86_0.conda 31 | https://repo.anaconda.com/pkgs/main/linux-64/xz-5.2.5-h7b6447c_0.conda 32 | 
https://repo.anaconda.com/pkgs/main/linux-64/zlib-1.2.11-h7b6447c_3.conda 33 | https://repo.anaconda.com/pkgs/main/linux-64/blosc-1.19.0-hd408876_0.conda 34 | https://repo.anaconda.com/pkgs/main/linux-64/glib-2.65.0-h3eb4bd4_0.conda 35 | https://repo.anaconda.com/pkgs/main/linux-64/gstreamer-1.14.0-hb31296c_0.conda 36 | https://repo.anaconda.com/pkgs/main/linux-64/hdf5-1.10.4-hb1b8bf9_0.conda 37 | https://repo.anaconda.com/pkgs/main/linux-64/libedit-3.1.20191231-h14c3975_1.conda 38 | https://repo.anaconda.com/pkgs/main/linux-64/libpng-1.6.37-hbc83047_0.conda 39 | https://repo.anaconda.com/pkgs/main/linux-64/libssh2-1.9.0-h1ba5d50_1.conda 40 | https://repo.anaconda.com/pkgs/main/linux-64/libxml2-2.9.10-he19cac6_1.conda 41 | https://conda.anaconda.org/bioconda/linux-64/perl-app-cpanminus-1.7044-pl526_1.tar.bz2 42 | https://conda.anaconda.org/bioconda/linux-64/perl-base-2.23-pl526_1.tar.bz2 43 | https://conda.anaconda.org/bioconda/linux-64/perl-common-sense-3.74-pl526_2.tar.bz2 44 | https://conda.anaconda.org/bioconda/linux-64/perl-compress-raw-bzip2-2.087-pl526he1b5a44_0.tar.bz2 45 | https://conda.anaconda.org/bioconda/linux-64/perl-compress-raw-zlib-2.087-pl526hc9558a2_0.tar.bz2 46 | https://conda.anaconda.org/bioconda/linux-64/perl-constant-1.33-pl526_1.tar.bz2 47 | https://conda.anaconda.org/bioconda/linux-64/perl-data-dumper-2.173-pl526_0.tar.bz2 48 | https://conda.anaconda.org/bioconda/linux-64/perl-digest-hmac-1.03-pl526_3.tar.bz2 49 | https://conda.anaconda.org/bioconda/linux-64/perl-digest-md5-2.55-pl526_0.tar.bz2 50 | https://conda.anaconda.org/bioconda/linux-64/perl-exporter-5.72-pl526_1.tar.bz2 51 | https://conda.anaconda.org/bioconda/linux-64/perl-exporter-tiny-1.002001-pl526_0.tar.bz2 52 | https://conda.anaconda.org/bioconda/linux-64/perl-extutils-makemaker-7.36-pl526_1.tar.bz2 53 | https://conda.anaconda.org/bioconda/linux-64/perl-html-tagset-3.20-pl526_3.tar.bz2 54 | https://conda.anaconda.org/bioconda/linux-64/perl-io-html-1.001-pl526_2.tar.bz2 55 | 
https://conda.anaconda.org/bioconda/linux-64/perl-io-zlib-1.10-pl526_2.tar.bz2 56 | https://conda.anaconda.org/bioconda/linux-64/perl-mozilla-ca-20180117-pl526_1.tar.bz2 57 | https://conda.anaconda.org/bioconda/linux-64/perl-parent-0.236-pl526_1.tar.bz2 58 | https://conda.anaconda.org/bioconda/linux-64/perl-scalar-list-utils-1.52-pl526h516909a_0.tar.bz2 59 | https://conda.anaconda.org/bioconda/linux-64/perl-socket-2.027-pl526_1.tar.bz2 60 | https://conda.anaconda.org/bioconda/linux-64/perl-try-tiny-0.30-pl526_1.tar.bz2 61 | https://conda.anaconda.org/bioconda/linux-64/perl-xml-parser-2.44-pl526h4e0c4b3_7.tar.bz2 62 | https://conda.anaconda.org/bioconda/linux-64/perl-xml-sax-base-1.09-pl526_0.tar.bz2 63 | https://conda.anaconda.org/bioconda/linux-64/perl-xsloader-0.24-pl526_0.tar.bz2 64 | https://repo.anaconda.com/pkgs/main/linux-64/readline-8.0-h7b6447c_0.conda 65 | https://conda.anaconda.org/bioconda/linux-64/subread-2.0.0-hed695b0_0.tar.bz2 66 | https://repo.anaconda.com/pkgs/main/linux-64/tk-8.6.10-hbc83047_0.conda 67 | https://repo.anaconda.com/pkgs/main/linux-64/zstd-1.4.5-h9ceee32_0.conda 68 | https://repo.anaconda.com/pkgs/main/linux-64/dbus-1.13.16-hb2f20db_0.conda 69 | https://repo.anaconda.com/pkgs/main/linux-64/freetype-2.10.2-h5ab3b9f_0.conda 70 | https://repo.anaconda.com/pkgs/main/linux-64/gst-plugins-base-1.14.0-hbbd80ab_1.conda 71 | https://repo.anaconda.com/pkgs/main/linux-64/krb5-1.18.2-h173b8e3_0.conda 72 | https://conda.anaconda.org/bioconda/linux-64/ncbi-ngs-sdk-2.10.0-hdf6179e_0.tar.bz2 73 | https://conda.anaconda.org/bioconda/linux-64/perl-carp-1.38-pl526_3.tar.bz2 74 | https://conda.anaconda.org/bioconda/linux-64/perl-encode-2.88-pl526_1.tar.bz2 75 | https://conda.anaconda.org/bioconda/linux-64/perl-file-path-2.16-pl526_0.tar.bz2 76 | https://conda.anaconda.org/bioconda/linux-64/perl-html-parser-3.72-pl526h6bb024c_5.tar.bz2 77 | https://conda.anaconda.org/bioconda/linux-64/perl-io-compress-2.087-pl526he1b5a44_0.tar.bz2 78 | 
https://conda.anaconda.org/bioconda/linux-64/perl-list-moreutils-xs-0.428-pl526_0.tar.bz2 79 | https://conda.anaconda.org/bioconda/linux-64/perl-mime-base64-3.15-pl526_1.tar.bz2 80 | https://conda.anaconda.org/bioconda/linux-64/perl-ntlm-1.09-pl526_4.tar.bz2 81 | https://conda.anaconda.org/bioconda/linux-64/perl-storable-3.15-pl526h14c3975_0.tar.bz2 82 | https://conda.anaconda.org/bioconda/linux-64/perl-test-requiresinternet-0.05-pl526_0.tar.bz2 83 | https://conda.anaconda.org/bioconda/linux-64/perl-types-serialiser-1.0-pl526_2.tar.bz2 84 | https://conda.anaconda.org/bioconda/linux-64/perl-xml-namespacesupport-1.12-pl526_0.tar.bz2 85 | https://repo.anaconda.com/pkgs/main/linux-64/sqlite-3.32.3-h62c20be_0.conda 86 | https://repo.anaconda.com/pkgs/main/linux-64/fontconfig-2.13.0-h9420a91_0.conda 87 | https://repo.anaconda.com/pkgs/main/linux-64/libcurl-7.71.1-h20c2e04_1.conda 88 | https://conda.anaconda.org/bioconda/linux-64/perl-business-isbn-data-20140910.003-pl526_0.tar.bz2 89 | https://conda.anaconda.org/bioconda/linux-64/perl-encode-locale-1.05-pl526_6.tar.bz2 90 | https://conda.anaconda.org/bioconda/linux-64/perl-file-temp-0.2304-pl526_2.tar.bz2 91 | https://conda.anaconda.org/bioconda/linux-64/perl-html-tree-5.07-pl526_1.tar.bz2 92 | https://conda.anaconda.org/bioconda/linux-64/perl-json-xs-2.34-pl526h6bb024c_3.tar.bz2 93 | https://conda.anaconda.org/bioconda/linux-64/perl-list-moreutils-0.428-pl526_1.tar.bz2 94 | https://conda.anaconda.org/bioconda/linux-64/perl-lwp-mediatypes-6.04-pl526_0.tar.bz2 95 | https://conda.anaconda.org/bioconda/linux-64/perl-net-ssleay-1.88-pl526h90d6eec_0.tar.bz2 96 | https://conda.anaconda.org/bioconda/linux-64/perl-pathtools-3.75-pl526h14c3975_1.tar.bz2 97 | https://conda.anaconda.org/bioconda/linux-64/perl-time-local-1.28-pl526_1.tar.bz2 98 | https://repo.anaconda.com/pkgs/main/linux-64/python-3.7.7-hcff3b4d_5.conda 99 | https://conda.anaconda.org/conda-forge/linux-64/asciinema-2.0.2-py37_1000.tar.bz2 100 | 
https://conda.anaconda.org/bioconda/linux-64/bowtie-1.2.3-py37hc9558a2_0.tar.bz2 101 | https://conda.anaconda.org/bioconda/linux-64/bowtie2-2.3.5.1-py37he513fc3_0.tar.bz2 102 | https://repo.anaconda.com/pkgs/main/noarch/certifi-2020.6.20-pyhd3eb1b0_3.conda 103 | https://repo.anaconda.com/pkgs/main/linux-64/chardet-3.0.4-py37_1003.conda 104 | https://repo.anaconda.com/pkgs/main/noarch/click-7.1.2-py_0.conda 105 | https://repo.anaconda.com/pkgs/main/linux-64/curl-7.71.1-hbc83047_1.conda 106 | https://repo.anaconda.com/pkgs/main/noarch/decorator-4.4.2-py_0.conda 107 | https://repo.anaconda.com/pkgs/main/linux-64/future-0.18.2-py37_1.conda 108 | https://repo.anaconda.com/pkgs/main/noarch/idna-2.10-py_0.conda 109 | https://repo.anaconda.com/pkgs/main/linux-64/kiwisolver-1.2.0-py37hfd86e86_0.conda 110 | https://repo.anaconda.com/pkgs/main/noarch/mock-4.0.2-py_0.conda 111 | https://repo.anaconda.com/pkgs/main/linux-64/msgpack-python-1.0.0-py37hfd86e86_1.conda 112 | https://conda.anaconda.org/bioconda/linux-64/perl-archive-tar-2.32-pl526_0.tar.bz2 113 | https://conda.anaconda.org/bioconda/linux-64/perl-business-isbn-3.004-pl526_0.tar.bz2 114 | https://conda.anaconda.org/bioconda/linux-64/perl-http-date-6.02-pl526_3.tar.bz2 115 | https://conda.anaconda.org/bioconda/linux-64/perl-io-socket-ssl-2.066-pl526_0.tar.bz2 116 | https://conda.anaconda.org/bioconda/linux-64/perl-json-4.02-pl526_0.tar.bz2 117 | https://conda.anaconda.org/bioconda/noarch/perl-xml-sax-1.02-pl526_0.tar.bz2 118 | https://repo.anaconda.com/pkgs/main/noarch/pycparser-2.20-py_2.conda 119 | https://repo.anaconda.com/pkgs/main/linux-64/pymongo-3.11.0-py37he6710b0_0.conda 120 | https://repo.anaconda.com/pkgs/main/noarch/pyparsing-2.4.7-py_0.conda 121 | https://repo.anaconda.com/pkgs/main/linux-64/pysocks-1.7.1-py37_1.conda 122 | https://repo.anaconda.com/pkgs/main/noarch/pytz-2020.1-py_0.conda 123 | https://repo.anaconda.com/pkgs/main/linux-64/qt-5.9.7-h5867ecd_1.conda 124 | 
https://conda.anaconda.org/bioconda/noarch/semidbm-0.5.1-pyh864c0ab_3.tar.bz2 125 | https://repo.anaconda.com/pkgs/main/linux-64/sip-4.19.8-py37hf484d3e_0.conda 126 | https://repo.anaconda.com/pkgs/main/noarch/six-1.15.0-py_0.conda 127 | https://conda.anaconda.org/bioconda/linux-64/spades-3.13.0-0.tar.bz2 128 | https://repo.anaconda.com/pkgs/main/linux-64/tornado-6.0.4-py37h7b6447c_1.conda 129 | https://repo.anaconda.com/pkgs/main/noarch/tqdm-4.48.2-py_0.conda 130 | https://conda.anaconda.org/bioconda/linux-64/bcftools-1.9-ha228f0b_4.tar.bz2 131 | https://repo.anaconda.com/pkgs/main/linux-64/cffi-1.14.1-py37he30daa8_0.conda 132 | https://repo.anaconda.com/pkgs/main/linux-64/cycler-0.10.0-py37_0.conda 133 | https://repo.anaconda.com/pkgs/main/linux-64/mkl-service-2.3.0-py37he904b0f_0.conda 134 | https://conda.anaconda.org/bioconda/linux-64/perl-file-listing-6.04-pl526_1.tar.bz2 135 | https://conda.anaconda.org/bioconda/linux-64/perl-uri-1.76-pl526_0.tar.bz2 136 | https://conda.anaconda.org/bioconda/linux-64/perl-xml-libxml-2.0132-pl526h7ec2d77_1.tar.bz2 137 | https://conda.anaconda.org/bioconda/linux-64/perl-xml-sax-expat-0.51-pl526_3.tar.bz2 138 | https://repo.anaconda.com/pkgs/main/linux-64/pyqt-5.9.2-py37h05f1152_2.conda 139 | https://conda.anaconda.org/bioconda/linux-64/pysam-0.15.3-py37hda2845c_1.tar.bz2 140 | https://repo.anaconda.com/pkgs/main/noarch/python-dateutil-2.8.1-py_0.tar.bz2 141 | https://conda.anaconda.org/bioconda/linux-64/samtools-1.6-h244ad75_5.tar.bz2 142 | https://repo.anaconda.com/pkgs/main/linux-64/setuptools-49.4.0-py37_0.conda 143 | https://repo.anaconda.com/pkgs/main/linux-64/brotlipy-0.7.0-py37h7b6447c_1000.conda 144 | https://repo.anaconda.com/pkgs/main/linux-64/cryptography-2.9.2-py37h1ba5d50_0.conda 145 | https://repo.anaconda.com/pkgs/main/noarch/networkx-2.4-py_1.conda 146 | https://repo.anaconda.com/pkgs/main/linux-64/numpy-base-1.19.2-py37hfa32c7d_0.conda 147 | 
https://conda.anaconda.org/bioconda/linux-64/perl-http-message-6.18-pl526_0.tar.bz2 148 | https://conda.anaconda.org/bioconda/noarch/perl-net-http-6.19-pl526_0.tar.bz2 149 | https://conda.anaconda.org/bioconda/linux-64/perl-www-robotrules-6.02-pl526_3.tar.bz2 150 | https://conda.anaconda.org/bioconda/linux-64/perl-xml-simple-2.25-pl526_1.tar.bz2 151 | https://conda.anaconda.org/bioconda/linux-64/sra-tools-2.10.0-pl526he1b5a44_0.tar.bz2 152 | https://repo.anaconda.com/pkgs/main/linux-64/wheel-0.34.2-py37_0.conda 153 | https://conda.anaconda.org/bioconda/linux-64/perl-http-cookies-6.04-pl526_0.tar.bz2 154 | https://conda.anaconda.org/bioconda/linux-64/perl-http-daemon-6.01-pl526_1.tar.bz2 155 | https://conda.anaconda.org/bioconda/linux-64/perl-http-negotiate-6.01-pl526_3.tar.bz2 156 | https://repo.anaconda.com/pkgs/main/linux-64/pip-20.2.2-py37_0.conda 157 | https://repo.anaconda.com/pkgs/main/noarch/pyopenssl-19.1.0-py_1.conda 158 | https://conda.anaconda.org/bioconda/noarch/perl-libwww-perl-6.39-pl526_0.tar.bz2 159 | https://repo.anaconda.com/pkgs/main/noarch/urllib3-1.25.10-py_0.conda 160 | https://conda.anaconda.org/bioconda/linux-64/perl-lwp-protocol-https-6.07-pl526_4.tar.bz2 161 | https://repo.anaconda.com/pkgs/main/noarch/requests-2.24.0-py_0.conda 162 | https://conda.anaconda.org/bioconda/linux-64/entrez-direct-13.3-pl526h375a9b1_0.tar.bz2 163 | https://conda.anaconda.org/bioconda/linux-64/blast-2.9.0-pl526h3066fca_4.tar.bz2 164 | https://repo.anaconda.com/pkgs/main/linux-64/matplotlib-3.2.2-0.conda 165 | https://conda.anaconda.org/bioconda/linux-64/htseq-0.11.2-py37h637b7d7_1.tar.bz2 166 | https://repo.anaconda.com/pkgs/main/linux-64/matplotlib-base-3.2.2-py37hef1b27d_0.conda 167 | https://repo.anaconda.com/pkgs/main/linux-64/mkl_fft-1.1.0-py37h23d657b_0.conda 168 | https://repo.anaconda.com/pkgs/main/linux-64/mkl_random-1.1.1-py37h0573a6f_0.conda 169 | https://repo.anaconda.com/pkgs/main/linux-64/numpy-1.19.2-py37h54aff64_0.conda 170 | 
https://repo.anaconda.com/pkgs/main/linux-64/numexpr-2.7.1-py37h423224d_0.conda 171 | https://repo.anaconda.com/pkgs/main/linux-64/pandas-1.1.0-py37he6710b0_0.conda 172 | https://repo.anaconda.com/pkgs/main/linux-64/scipy-1.5.0-py37h0b6359f_0.conda 173 | https://repo.anaconda.com/pkgs/main/linux-64/patsy-0.5.1-py37_0.conda 174 | https://repo.anaconda.com/pkgs/main/linux-64/pytables-3.6.1-py37h71ec239_0.conda 175 | https://repo.anaconda.com/pkgs/main/linux-64/statsmodels-0.11.1-py37h7b6447c_0.conda 176 | https://conda.anaconda.org/bioconda/linux-64/mgkit-0.4.2-py37h516909a_0.tar.bz2 177 | -------------------------------------------------------------------------------- /envs/bioinfo-notebook.yml: -------------------------------------------------------------------------------- 1 | name: bioinfo-notebook 2 | channels: 3 | - bioconda 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - _libgcc_mutex=0.1=main 8 | - asciinema=2.0.2=py37_1000 9 | - bcftools=1.9=ha228f0b_4 10 | - blas=1.0=mkl 11 | - blast=2.9.0=pl526h3066fca_4 12 | - blosc=1.19.0=hd408876_0 13 | - bowtie=1.2.3=py37hc9558a2_0 14 | - bowtie2=2.3.5.1=py37he513fc3_0 15 | - brotlipy=0.7.0=py37h7b6447c_1000 16 | - bzip2=1.0.8=h7b6447c_0 17 | - ca-certificates=2020.10.14=0 18 | - certifi=2020.6.20=pyhd3eb1b0_3 19 | - cffi=1.14.1=py37he30daa8_0 20 | - chardet=3.0.4=py37_1003 21 | - click=7.1.2=py_0 22 | - cryptography=2.9.2=py37h1ba5d50_0 23 | - curl=7.71.1=hbc83047_1 24 | - cycler=0.10.0=py37_0 25 | - dbus=1.13.16=hb2f20db_0 26 | - decorator=4.4.2=py_0 27 | - entrez-direct=13.3=pl526h375a9b1_0 28 | - expat=2.2.9=he6710b0_2 29 | - fontconfig=2.13.0=h9420a91_0 30 | - freetype=2.10.2=h5ab3b9f_0 31 | - future=0.18.2=py37_1 32 | - glib=2.65.0=h3eb4bd4_0 33 | - gst-plugins-base=1.14.0=hbbd80ab_1 34 | - gstreamer=1.14.0=hb31296c_0 35 | - hdf5=1.10.4=hb1b8bf9_0 36 | - htseq=0.11.2=py37h637b7d7_1 37 | - icu=58.2=he6710b0_3 38 | - idna=2.10=py_0 39 | - intel-openmp=2020.1=217 40 | - jpeg=9b=h024ee3a_2 41 | - 
kiwisolver=1.2.0=py37hfd86e86_0 42 | - krb5=1.18.2=h173b8e3_0 43 | - ld_impl_linux-64=2.33.1=h53a641e_7 44 | - libcurl=7.71.1=h20c2e04_1 45 | - libdeflate=1.0=h14c3975_1 46 | - libedit=3.1.20191231=h14c3975_1 47 | - libffi=3.3=he6710b0_2 48 | - libgcc-ng=9.1.0=hdf63c60_0 49 | - libgfortran-ng=7.3.0=hdf63c60_0 50 | - libiconv=1.15=h63c8f33_5 51 | - libpng=1.6.37=hbc83047_0 52 | - libssh2=1.9.0=h1ba5d50_1 53 | - libstdcxx-ng=9.1.0=hdf63c60_0 54 | - libuuid=1.0.3=h1bed415_2 55 | - libxcb=1.14=h7b6447c_0 56 | - libxml2=2.9.10=he19cac6_1 57 | - lz4-c=1.9.2=he6710b0_1 58 | - lzo=2.10=h7b6447c_2 59 | - matplotlib=3.2.2=0 60 | - matplotlib-base=3.2.2=py37hef1b27d_0 61 | - mgkit=0.4.2=py37h516909a_0 62 | - mkl=2020.1=217 63 | - mkl-service=2.3.0=py37he904b0f_0 64 | - mkl_fft=1.1.0=py37h23d657b_0 65 | - mkl_random=1.1.1=py37h0573a6f_0 66 | - mock=4.0.2=py_0 67 | - msgpack-python=1.0.0=py37hfd86e86_1 68 | - ncbi-ngs-sdk=2.10.0=hdf6179e_0 69 | - ncurses=6.2=he6710b0_1 70 | - networkx=2.4=py_1 71 | - numexpr=2.7.1=py37h423224d_0 72 | - numpy=1.19.2=py37h54aff64_0 73 | - numpy-base=1.19.2=py37hfa32c7d_0 74 | - openssl=1.1.1h=h7b6447c_0 75 | - pandas=1.1.0=py37he6710b0_0 76 | - patsy=0.5.1=py37_0 77 | - pcre=8.44=he6710b0_0 78 | - perl=5.26.2=h14c3975_0 79 | - perl-app-cpanminus=1.7044=pl526_1 80 | - perl-archive-tar=2.32=pl526_0 81 | - perl-base=2.23=pl526_1 82 | - perl-business-isbn=3.004=pl526_0 83 | - perl-business-isbn-data=20140910.003=pl526_0 84 | - perl-carp=1.38=pl526_3 85 | - perl-common-sense=3.74=pl526_2 86 | - perl-compress-raw-bzip2=2.087=pl526he1b5a44_0 87 | - perl-compress-raw-zlib=2.087=pl526hc9558a2_0 88 | - perl-constant=1.33=pl526_1 89 | - perl-data-dumper=2.173=pl526_0 90 | - perl-digest-hmac=1.03=pl526_3 91 | - perl-digest-md5=2.55=pl526_0 92 | - perl-encode=2.88=pl526_1 93 | - perl-encode-locale=1.05=pl526_6 94 | - perl-exporter=5.72=pl526_1 95 | - perl-exporter-tiny=1.002001=pl526_0 96 | - perl-extutils-makemaker=7.36=pl526_1 97 | - 
perl-file-listing=6.04=pl526_1 98 | - perl-file-path=2.16=pl526_0 99 | - perl-file-temp=0.2304=pl526_2 100 | - perl-html-parser=3.72=pl526h6bb024c_5 101 | - perl-html-tagset=3.20=pl526_3 102 | - perl-html-tree=5.07=pl526_1 103 | - perl-http-cookies=6.04=pl526_0 104 | - perl-http-daemon=6.01=pl526_1 105 | - perl-http-date=6.02=pl526_3 106 | - perl-http-message=6.18=pl526_0 107 | - perl-http-negotiate=6.01=pl526_3 108 | - perl-io-compress=2.087=pl526he1b5a44_0 109 | - perl-io-html=1.001=pl526_2 110 | - perl-io-socket-ssl=2.066=pl526_0 111 | - perl-io-zlib=1.10=pl526_2 112 | - perl-json=4.02=pl526_0 113 | - perl-json-xs=2.34=pl526h6bb024c_3 114 | - perl-libwww-perl=6.39=pl526_0 115 | - perl-list-moreutils=0.428=pl526_1 116 | - perl-list-moreutils-xs=0.428=pl526_0 117 | - perl-lwp-mediatypes=6.04=pl526_0 118 | - perl-lwp-protocol-https=6.07=pl526_4 119 | - perl-mime-base64=3.15=pl526_1 120 | - perl-mozilla-ca=20180117=pl526_1 121 | - perl-net-http=6.19=pl526_0 122 | - perl-net-ssleay=1.88=pl526h90d6eec_0 123 | - perl-ntlm=1.09=pl526_4 124 | - perl-parent=0.236=pl526_1 125 | - perl-pathtools=3.75=pl526h14c3975_1 126 | - perl-scalar-list-utils=1.52=pl526h516909a_0 127 | - perl-socket=2.027=pl526_1 128 | - perl-storable=3.15=pl526h14c3975_0 129 | - perl-test-requiresinternet=0.05=pl526_0 130 | - perl-time-local=1.28=pl526_1 131 | - perl-try-tiny=0.30=pl526_1 132 | - perl-types-serialiser=1.0=pl526_2 133 | - perl-uri=1.76=pl526_0 134 | - perl-www-robotrules=6.02=pl526_3 135 | - perl-xml-libxml=2.0132=pl526h7ec2d77_1 136 | - perl-xml-namespacesupport=1.12=pl526_0 137 | - perl-xml-parser=2.44=pl526h4e0c4b3_7 138 | - perl-xml-sax=1.02=pl526_0 139 | - perl-xml-sax-base=1.09=pl526_0 140 | - perl-xml-sax-expat=0.51=pl526_3 141 | - perl-xml-simple=2.25=pl526_1 142 | - perl-xsloader=0.24=pl526_0 143 | - pip=20.2.2=py37_0 144 | - pycparser=2.20=py_2 145 | - pymongo=3.11.0=py37he6710b0_0 146 | - pyopenssl=19.1.0=py_1 147 | - pyparsing=2.4.7=py_0 148 | - pyqt=5.9.2=py37h05f1152_2 149 
| - pysam=0.15.3=py37hda2845c_1 150 | - pysocks=1.7.1=py37_1 151 | - pytables=3.6.1=py37h71ec239_0 152 | - python=3.7.7=hcff3b4d_5 153 | - python-dateutil=2.8.1=py_0 154 | - pytz=2020.1=py_0 155 | - qt=5.9.7=h5867ecd_1 156 | - readline=8.0=h7b6447c_0 157 | - requests=2.24.0=py_0 158 | - samtools=1.6=h244ad75_5 159 | - scipy=1.5.0=py37h0b6359f_0 160 | - semidbm=0.5.1=pyh864c0ab_3 161 | - setuptools=49.4.0=py37_0 162 | - sip=4.19.8=py37hf484d3e_0 163 | - six=1.15.0=py_0 164 | - snappy=1.1.8=he6710b0_0 165 | - spades=3.13.0=0 166 | - sqlite=3.32.3=h62c20be_0 167 | - sra-tools=2.10.0=pl526he1b5a44_0 168 | - statsmodels=0.11.1=py37h7b6447c_0 169 | - subread=2.0.0=hed695b0_0 170 | - tbb=2020.0=hfd86e86_0 171 | - tk=8.6.10=hbc83047_0 172 | - tornado=6.0.4=py37h7b6447c_1 173 | - tqdm=4.48.2=py_0 174 | - urllib3=1.25.10=py_0 175 | - wheel=0.34.2=py37_0 176 | - xz=5.2.5=h7b6447c_0 177 | - zlib=1.2.11=h7b6447c_3 178 | - zstd=1.4.5=h9ceee32_0 179 | -------------------------------------------------------------------------------- /envs/orthofinder.yml: -------------------------------------------------------------------------------- 1 | name: orthofinder 2 | channels: 3 | - bioconda 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - _libgcc_mutex=0.1=conda_forge 8 | - _openmp_mutex=4.5=1_gnu 9 | - blast=2.10.1=pl526he19e7b1_1 10 | - boost-cpp=1.70.0=h7b93d67_3 11 | - bzip2=1.0.8=h516909a_3 12 | - c-ares=1.16.1=h516909a_3 13 | - ca-certificates=2020.6.20=hecda079_0 14 | - certifi=2019.11.28=py27h8c360ce_1 15 | - curl=7.71.1=he644dc0_5 16 | - diamond=2.0.4=h56fc30b_0 17 | - dlcpar=1.0=py_2 18 | - entrez-direct=13.8=pl526h375a9b1_0 19 | - expat=2.2.9=he1b5a44_2 20 | - fastme=2.1.5=0 21 | - fasttree=2.1.10=h516909a_4 22 | - gawk=5.1.0=h516909a_0 23 | - gettext=0.19.8.1=hc5be6a0_1002 24 | - icu=67.1=he1b5a44_0 25 | - iqtree=2.0.3=h176a8bc_0 26 | - krb5=1.17.1=hfafb76e_3 27 | - ld_impl_linux-64=2.35=h769bd43_9 28 | - libblas=3.8.0=17_openblas 29 | - libcblas=3.8.0=17_openblas 30 
| - libcurl=7.71.1=hcdd3856_5 31 | - libedit=3.1.20191231=he28a2e2_2 32 | - libev=4.33=h516909a_1 33 | - libffi=3.2.1=he1b5a44_1007 34 | - libgcc=7.2.0=h69d50b8_2 35 | - libgcc-ng=9.3.0=h24d8f2e_16 36 | - libgfortran-ng=7.5.0=hdf63c60_16 37 | - libgomp=9.3.0=h24d8f2e_16 38 | - libidn2=2.3.0=h516909a_0 39 | - liblapack=3.8.0=17_openblas 40 | - libnghttp2=1.41.0=h8cfc5f6_2 41 | - libopenblas=0.3.10=pthreads_hb3c22a3_4 42 | - libssh2=1.9.0=hab1572f_5 43 | - libstdcxx-ng=9.3.0=hdf63c60_16 44 | - libunistring=0.9.10=h14c3975_0 45 | - llvm-meta=7.0.0=0 46 | - lz4-c=1.9.2=he1b5a44_3 47 | - mafft=7.471=h516909a_0 48 | - mcl=14.137=pl526h516909a_5 49 | - mmseqs2=12.113e3=h2d02072_0 50 | - muscle=3.8.1551=hc9558a2_5 51 | - ncurses=6.2=he1b5a44_1 52 | - numpy=1.16.5=py27h95a1406_0 53 | - openmp=7.0.0=h2d50403_0 54 | - openssl=1.1.1g=h516909a_1 55 | - orthofinder=2.2.7=0 56 | - pcre=8.44=he1b5a44_0 57 | - perl=5.26.2=h516909a_1006 58 | - perl-app-cpanminus=1.7044=pl526_1 59 | - perl-archive-tar=2.32=pl526_0 60 | - perl-base=2.23=pl526_1 61 | - perl-business-isbn=3.004=pl526_0 62 | - perl-business-isbn-data=20140910.003=pl526_0 63 | - perl-carp=1.38=pl526_3 64 | - perl-common-sense=3.74=pl526_2 65 | - perl-compress-raw-bzip2=2.087=pl526he1b5a44_0 66 | - perl-compress-raw-zlib=2.087=pl526hc9558a2_0 67 | - perl-constant=1.33=pl526_1 68 | - perl-data-dumper=2.173=pl526_0 69 | - perl-digest-hmac=1.03=pl526_3 70 | - perl-digest-md5=2.55=pl526_0 71 | - perl-encode=2.88=pl526_1 72 | - perl-encode-locale=1.05=pl526_6 73 | - perl-exporter=5.72=pl526_1 74 | - perl-exporter-tiny=1.002001=pl526_0 75 | - perl-extutils-makemaker=7.36=pl526_1 76 | - perl-file-listing=6.04=pl526_1 77 | - perl-file-path=2.16=pl526_0 78 | - perl-file-temp=0.2304=pl526_2 79 | - perl-html-parser=3.72=pl526h6bb024c_5 80 | - perl-html-tagset=3.20=pl526_3 81 | - perl-html-tree=5.07=pl526_1 82 | - perl-http-cookies=6.04=pl526_0 83 | - perl-http-daemon=6.01=pl526_1 84 | - perl-http-date=6.02=pl526_3 85 | - 
perl-http-message=6.18=pl526_0 86 | - perl-http-negotiate=6.01=pl526_3 87 | - perl-io-compress=2.087=pl526he1b5a44_0 88 | - perl-io-html=1.001=pl526_2 89 | - perl-io-socket-ssl=2.066=pl526_0 90 | - perl-io-zlib=1.10=pl526_2 91 | - perl-json=4.02=pl526_0 92 | - perl-json-xs=2.34=pl526h6bb024c_3 93 | - perl-libwww-perl=6.39=pl526_0 94 | - perl-list-moreutils=0.428=pl526_1 95 | - perl-list-moreutils-xs=0.428=pl526_0 96 | - perl-lwp-mediatypes=6.04=pl526_0 97 | - perl-lwp-protocol-https=6.07=pl526_4 98 | - perl-mime-base64=3.15=pl526_1 99 | - perl-mozilla-ca=20180117=pl526_1 100 | - perl-net-http=6.19=pl526_0 101 | - perl-net-ssleay=1.88=pl526h90d6eec_0 102 | - perl-ntlm=1.09=pl526_4 103 | - perl-parent=0.236=pl526_1 104 | - perl-pathtools=3.75=pl526h14c3975_1 105 | - perl-scalar-list-utils=1.52=pl526h516909a_0 106 | - perl-socket=2.027=pl526_1 107 | - perl-storable=3.15=pl526h14c3975_0 108 | - perl-test-requiresinternet=0.05=pl526_0 109 | - perl-time-local=1.28=pl526_1 110 | - perl-try-tiny=0.30=pl526_1 111 | - perl-types-serialiser=1.0=pl526_2 112 | - perl-uri=1.76=pl526_0 113 | - perl-www-robotrules=6.02=pl526_3 114 | - perl-xml-namespacesupport=1.12=pl526_0 115 | - perl-xml-parser=2.44_01=pl526ha1d75be_1002 116 | - perl-xml-sax=1.02=pl526_0 117 | - perl-xml-sax-base=1.09=pl526_0 118 | - perl-xml-sax-expat=0.51=pl526_3 119 | - perl-xml-simple=2.25=pl526_1 120 | - perl-xsloader=0.24=pl526_0 121 | - pip=20.1.1=pyh9f0ad1d_0 122 | - python=2.7.15=h5a48372_1011_cpython 123 | - python_abi=2.7=1_cp27mu 124 | - raxml=8.2.12=h516909a_2 125 | - readline=8.0=he28a2e2_2 126 | - scipy=1.2.1=py27h921218d_2 127 | - setuptools=44.0.0=py27_0 128 | - sqlite=3.33.0=h4cf870e_0 129 | - tk=8.6.10=hed695b0_0 130 | - wget=1.20.1=h22169c7_0 131 | - wheel=0.35.1=pyh9f0ad1d_0 132 | - xz=5.2.5=h516909a_1 133 | - zlib=1.2.11=h516909a_1009 134 | - zstd=1.4.5=h6597ccf_2 135 | -------------------------------------------------------------------------------- /envs/sgRNAcas9.yml: 
-------------------------------------------------------------------------------- 1 | name: sgRNAcas9 2 | channels: 3 | - bioconda 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - _libgcc_mutex=0.1=conda_forge 8 | - _openmp_mutex=4.5=1_gnu 9 | - alsa-lib=1.2.3=h516909a_0 10 | - ca-certificates=2020.6.20=hecda079_0 11 | - cairo=1.16.0=h3fc0475_1005 12 | - certifi=2020.6.20=py38h32f6830_0 13 | - fontconfig=2.13.1=h1056068_1002 14 | - freetype=2.10.2=he06d7ca_0 15 | - gettext=0.19.8.1=hc5be6a0_1002 16 | - giflib=5.2.1=h516909a_2 17 | - glib=2.66.1=h680cd38_0 18 | - graphite2=1.3.13=he1b5a44_1001 19 | - harfbuzz=2.7.2=hee91db6_0 20 | - icu=67.1=he1b5a44_0 21 | - jpeg=9d=h516909a_0 22 | - lcms2=2.11=hbd6801e_0 23 | - ld_impl_linux-64=2.35=h769bd43_9 24 | - libffi=3.2.1=he1b5a44_1007 25 | - libgcc-ng=9.3.0=h5dbcf3e_17 26 | - libgomp=9.3.0=h5dbcf3e_17 27 | - libiconv=1.16=h516909a_0 28 | - libpng=1.6.37=hed695b0_2 29 | - libstdcxx-ng=9.3.0=h2ae2ef3_17 30 | - libtiff=4.1.0=hc7e4089_6 31 | - libuuid=2.32.1=h14c3975_1000 32 | - libwebp-base=1.1.0=h516909a_3 33 | - libxcb=1.13=h14c3975_1002 34 | - libxml2=2.9.10=h68273f3_2 35 | - lz4-c=1.9.2=he1b5a44_3 36 | - ncurses=6.2=he1b5a44_1 37 | - openjdk=11.0.8=hacce0ff_0 38 | - openssl=1.1.1h=h516909a_0 39 | - pcre=8.44=he1b5a44_0 40 | - perl=5.30.3=h516909a_1 41 | - pip=20.2.3=py_0 42 | - pixman=0.38.0=h516909a_1003 43 | - pthread-stubs=0.4=h14c3975_1001 44 | - python=3.8.5=h1103e12_9_cpython 45 | - python_abi=3.8=1_cp38 46 | - readline=8.0=he28a2e2_2 47 | - seqmap=1.0.13=hc9558a2_1 48 | - setuptools=49.6.0=py38h32f6830_1 49 | - sqlite=3.33.0=h4cf870e_0 50 | - tk=8.6.10=hed695b0_0 51 | - wheel=0.35.1=pyh9f0ad1d_0 52 | - xorg-fixesproto=5.0=h14c3975_1002 53 | - xorg-inputproto=2.3.2=h14c3975_1002 54 | - xorg-kbproto=1.0.7=h14c3975_1002 55 | - xorg-libice=1.0.10=h516909a_0 56 | - xorg-libsm=1.2.3=h84519dc_1000 57 | - xorg-libx11=1.6.12=h516909a_0 58 | - xorg-libxau=1.0.9=h14c3975_0 59 | - xorg-libxdmcp=1.1.3=h516909a_0 60 | - 
xorg-libxext=1.3.4=h516909a_0 61 | - xorg-libxfixes=5.0.3=h516909a_1004 62 | - xorg-libxi=1.7.10=h516909a_0 63 | - xorg-libxrender=0.9.10=h516909a_1002 64 | - xorg-libxtst=1.2.3=h516909a_1002 65 | - xorg-recordproto=1.14.2=h516909a_1002 66 | - xorg-renderproto=0.11.1=h14c3975_1002 67 | - xorg-xextproto=7.3.0=h14c3975_1002 68 | - xorg-xproto=7.0.31=h14c3975_1007 69 | - xz=5.2.5=h516909a_1 70 | - zlib=1.2.11=h516909a_1009 71 | - zstd=1.4.5=h6597ccf_2 72 | -------------------------------------------------------------------------------- /scripts/DE_analysis_edgeR_script.R: -------------------------------------------------------------------------------- 1 | # https://github.com/rnnh/bioinfo-notebook.git 2 | 3 | # Loading required libraries 4 | library(limma) 5 | library(edgeR) 6 | 7 | # Changing working directory 8 | # Setting the working directory to the directory which contains this script 9 | if (exists("RStudio.Version")){ 10 | setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) 11 | } else { 12 | setwd(getSrcDirectory()[1]) 13 | } 14 | 15 | # Reading in the feature count file as "counts.df" 16 | counts.df <- read.csv("../data/featCounts_S_cere_20200331.csv") 17 | 18 | # Printing the start of the counts.df object in R... 19 | head(counts.df) 20 | 21 | # Using the "Geneid" column to set the rownames 22 | rownames(counts.df) <- counts.df$Geneid 23 | 24 | # Removing the "Geneid" column 25 | counts.df$Geneid <- NULL 26 | 27 | # Printing the start of the counts.df object in R... 28 | head(counts.df) 29 | 30 | # Reading in the design table as "design.df" 31 | design.df <- read.csv("../data/design_table.csv", fileEncoding="UTF-8-BOM") 32 | 33 | # Printing the start of the design.df object in R... 
print(design.df)

# Subsetting gene counts according to experimental condition; the SRR run
# accessions for each condition come from data/design_table.csv
counts_standard.df <- counts.df[,c("SRR8933535", "SRR8933536", "SRR8933537")]
counts_anaerobic.df <- counts.df[,c("SRR8933506", "SRR8933511", "SRR8933512")]
counts_high_temp.df <- counts.df[,c("SRR8933532", "SRR8933533", "SRR8933534")]
counts_low_pH.df <- counts.df[,c("SRR8933530", "SRR8933531", "SRR8933539")]
counts_pressure.df <- counts.df[,c("SRR8933509", "SRR8933510", "SRR8933538")]

# Printing the structure of the gene counts set and subsets
str(counts.df)
str(counts_standard.df)
str(counts_anaerobic.df)
str(counts_high_temp.df)
str(counts_low_pH.df)
str(counts_pressure.df)

# Defining function "RSD.test()"
RSD.test <- function(dataframe){
  # Tests whether the relative standard deviation (RSD = sd/mean) is less
  # than or equal to one for each row in a data frame of numeric counts.
  # The result is added as a new factor column called "RSD.test" with
  # levels c(FALSE, TRUE): TRUE means the row has RSD <= 1 (or an
  # undefined RSD, e.g. an all-zero row, which is counted as passing);
  # FALSE means the row has RSD > 1.
  #
  # Argument:
  #   dataframe: a data frame with one numeric sample column per replicate.
  # Returns:
  #   The input data frame with the added "RSD.test" factor column.
  RSD_tests <- logical(nrow(dataframe))  # one TRUE/FALSE per row
  for (row_index in 1:nrow(dataframe)){
    row <- as.numeric(dataframe[row_index,])
    RSD <- sd(row) / mean(row)
    # NA RSDs (zero mean) are treated as passing, matching the original logic
    RSD_tests[row_index] <- RSD <= 1 || is.na(RSD)
  }
  # Build the factor with explicit levels rather than relabelling via
  # levels()<-: the previous positional relabelling mislabelled the single
  # remaining level whenever every row passed (or every row failed). A
  # logical accumulator also avoids seeding the results with the numeric
  # values of column 1, as dataframe[,1] did.
  dataframe$RSD.test <- factor(RSD_tests, levels = c(FALSE, TRUE))
  return(dataframe)
}

# Applying RSD.test() to gene count subsets
counts_standard.df <- RSD.test(counts_standard.df)
counts_anaerobic.df <- RSD.test(counts_anaerobic.df)
counts_high_temp.df <- RSD.test(counts_high_temp.df)
counts_low_pH.df <- RSD.test(counts_low_pH.df)
counts_pressure.df <- RSD.test(counts_pressure.df)

# Printing the structure of the gene counts subsets
str(counts_standard.df)
str(counts_anaerobic.df)
str(counts_high_temp.df)
str(counts_low_pH.df)
str(counts_pressure.df)

# Creating list of genes which failed the RSD test in any condition
RSD_failed_genes <- rownames(counts_standard.df[
  which(counts_standard.df$RSD.test == FALSE),])
RSD_failed_genes <- append(RSD_failed_genes, rownames(counts_anaerobic.df[
  which(counts_anaerobic.df$RSD.test == FALSE),]))
RSD_failed_genes <- append(RSD_failed_genes, rownames(counts_high_temp.df[
  which(counts_high_temp.df$RSD.test == FALSE),]))
RSD_failed_genes <- append(RSD_failed_genes, rownames(counts_low_pH.df[
  which(counts_low_pH.df$RSD.test == FALSE),]))
RSD_failed_genes <- append(RSD_failed_genes, rownames(counts_pressure.df[
  which(counts_pressure.df$RSD.test == FALSE),]))
RSD_failed_genes <- unique(RSD_failed_genes)
length(RSD_failed_genes)

# Filtering gene counts: keep only genes that did not fail in any condition
filtered_counts.df <- counts.df[
  which(!rownames(counts.df) %in% RSD_failed_genes),]

# Printing the structure of the filtered gene counts
str(filtered_counts.df)

# Checking that gene counts were correctly filtered
nrow(counts.df) - length(RSD_failed_genes) ==
nrow(filtered_counts.df)

# Dropping objects that are no longer needed from the environment
rm(counts_anaerobic.df, counts_high_temp.df, counts_low_pH.df,
   counts_pressure.df, counts_standard.df, counts.df, RSD_failed_genes)

# Building an edgeR DGEList from the RSD-filtered gene counts, keeping the
# gene names alongside the count matrix
counts.DGEList <- DGEList(counts = filtered_counts.df,
                          genes = rownames(filtered_counts.df))

# Displaying the design table for reference
print(design.df)

# Sanity check: sample columns and design-table runs should line up 1:1
summary(colnames(filtered_counts.df) == design.df$run)

# Attaching the experimental condition of each sample as its group
counts.DGEList$samples$group <- as.factor(design.df$condition)

# Displaying the DGEList object
counts.DGEList

# Its dimensions: number of genes x number of samples
dim(counts.DGEList)

# Logical vector marking genes with enough reads to be kept
counts.keep <- filterByExpr(counts.DGEList)
summary(counts.keep)

# Discarding lowly expressed genes and recomputing library sizes
counts.DGEList <- counts.DGEList[counts.keep, , keep.lib.sizes = FALSE]
dim(counts.DGEList)

# The retained gene count should equal the number of TRUEs in counts.keep
length(which(counts.keep)) == dim(counts.DGEList)[1]

# counts.keep has served its purpose
rm(counts.keep)

# Library normalisation factors before normalisation (all 1 by default)...
counts.DGEList$samples$norm.factors

# ...and after calculating and applying them
counts.DGEList <- calcNormFactors(counts.DGEList)
counts.DGEList$samples$norm.factors

# Estimating common and tagwise dispersions against the condition design
condition_ <- design.df$condition
counts.DGEList <- estimateDisp(counts.DGEList,
                               design = model.matrix(~condition_))

counts.DGEList

condition_

# Pairwise exact tests: each stress condition versus the standard condition
std_anaerobic.DGEExact <- exactTest(counts.DGEList,
                                    pair = c("standard", "anaerobic"))
std_salt.DGEExact <- exactTest(counts.DGEList,
                               pair = c("standard", "osmotic_pressure"))
std_temp.DGEExact <- exactTest(counts.DGEList,
                               pair = c("standard", "high_temp"))
std_pH.DGEExact <- exactTest(counts.DGEList,
                             pair = c("standard", "low_pH"))

# Pulling the most differentially expressed genes out of each exact test
std_anaerobic.topTags <- topTags(std_anaerobic.DGEExact)
std_salt.topTags <- topTags(std_salt.DGEExact)
std_temp.topTags <- topTags(std_temp.DGEExact)
std_pH.topTags <- topTags(std_pH.DGEExact)

# Displaying the most differentially expressed genes
std_anaerobic.topTags
std_salt.topTags
std_temp.topTags
std_pH.topTags

# Recording session information for reproducibility
sessionInfo()
--------------------------------------------------------------------------------
/scripts/UniProt_downloader.sh:
--------------------------------------------------------------------------------
#! /bin/bash
# https://github.com/rnnh/bioinfo-notebook.git

# Help/usage text
usage="$(basename "$0") [-h|--help] [-p|--processors n -o|--output] -i|--input \n
\n
This script takes a list of UniProt primary accession numbers (*.list), and \n
downloads the corresponding protein sequences from UniProt as a FASTA amino \n
acid (.faa) file.\n
\n
This list can be generated by searching UniProtKB for a desired term (e.g. 
\n
'taxonomy:147537' for the Saccharomycotina subphylum), selecting 'Download' \n
and 'Format: List' to download the accession numbers of the corresponding \n
results.\n
\n
arguments: \n
\t -h | --help\t\t show this help text and exit \n
\t -i | --input\t\t the list of UniProt proteins to download \n
\t -p | --processors\t optional: set the number (n) of processors to \n
\t\t\t\t use (default: 1) \n
\t -o | --output\t\t optional: name of the output .faa file \n
\t\t\t\t (default: uniprot_{date}.faa) \n
"

# Defaults: one download process, date-stamped output file name
PROCESSORS=1
OUTPUT=uniprot_$(date +%Y%m%d).faa

# Iterating through the input arguments with a while loop
while (( "$#" )); do
	case "$1" in
		-h|--help)
			# "$usage" is double-quoted so its embedded whitespace and
			# backslash-escapes reach echo intact (ShellCheck SC2086)
			echo -e "$usage"
			exit
			;;
		-i|--input)
			INPUT="$2"
			shift 2
			;;
		-p|--processors)
			PROCESSORS="$2"
			shift 2
			;;
		-o|--output)
			OUTPUT="$2"
			shift 2
			;;
		--) # end argument parsing
			shift
			break
			;;
		-*|--*) # unsupported flags
			echo -e "ERROR: $1 is an invalid option. \n" >&2
			echo -e "$usage"
			exit 1
			;;
	esac
done

# An input list is mandatory
if test -z "$INPUT";
then
	echo -e "ERROR: No input file given. \n" >&2
	echo -e "$usage"
	exit 1
fi

echo "$(date +%Y/%m/%d\ %H:%M) Downloading UniProt sequences..."

# One curl per accession, up to $PROCESSORS in parallel; expansions are
# quoted so file names with spaces or glob characters are handled safely
cat "$INPUT" | \
xargs -n 1 -P "$PROCESSORS" -I % curl -s https://www.uniprot.org/uniprot/%.fasta \
>> "$OUTPUT"

echo "$(date +%Y/%m/%d\ %H:%M) Script finished."
73 | -------------------------------------------------------------------------------- /scripts/annotated_snps_filter.R: -------------------------------------------------------------------------------- 1 | # https://github.com/rnnh/bioinfo-notebook.git 2 | 3 | # Aim ========================================================================== 4 | 5 | # This script cross-references annotated SNP files created using 6 | # annotating_snps.R. It takes two files created using this script, and returns 7 | # unique SNPs for each file. If a SNP in File 1 is not found at the same 8 | # position on the same sequence as File 2, it is returned as a unique SNP, and 9 | # vice versa. These unique SNPs are then written to new .tsv files. 10 | 11 | # Selecting files ============================================================== 12 | 13 | # - Assign the name of the first annotated SNP file to be filtered to 14 | # 'annotated_SNP_file_1' 15 | # - Assign the name of the second annotated SNP file to be filtered to 16 | # 'annotated_SNP_file_2' 17 | # - These files should be in the `~/bioinfo-notebook/data/` directory. 18 | # - Optional: the name of the output files can be assigned on lines 109 and 19 | # 115 respectively. 20 | 21 | annotated_SNP_file_1 <- "<.tsv File name here>" 22 | annotated_SNP_file_2 <- "<.tsv File name here>" 23 | 24 | # Setup ======================================================================== 25 | 26 | # Setting the working directory 27 | setwd("~/bioinfo-notebook/data") 28 | 29 | annotated_SNP_file_1 <- read.table( 30 | annotated_SNP_file_1, 31 | stringsAsFactors = FALSE, header = TRUE) 32 | 33 | annotated_SNP_file_2 <- read.table( 34 | annotated_SNP_file_2, 35 | stringsAsFactors = FALSE, header = TRUE) 36 | 37 | # Finding rows in common between annotated SNP data frames ===================== 38 | 39 | # This needs to be carried out multiple times because the number of rows in 40 | # each annotate SNP file differ. 
Two files may have a SNP in common, but it may 41 | # not occur at the same row number 42 | 43 | # Loops in this section are structured as follows: 44 | # For every row index in a given data frame... 45 | # Get the row using the row index 46 | # If the SNP position for a given row index is in the other data frame... 47 | # Get the indices of the matching rows 48 | # For each index in the indices of matching row... 49 | # If the sequence names are the same for the matching rows... 50 | # Add that index to the matching row values 51 | # Keep only the unique indices 52 | 53 | # Creating empty integer values for the matching SNPs 54 | file_1_SNPs_common_with_file_2 <- integer() 55 | file_2_SNPs_common_with_file_1 <- integer() 56 | 57 | # Rows in common between file 1 and file 2 58 | for (index in 1:nrow(annotated_SNP_file_2)){ 59 | row = annotated_SNP_file_2[index, ] 60 | if (row$POS %in% annotated_SNP_file_1$POS){ 61 | matching_row_indices = which(annotated_SNP_file_1$POS == row$POS) 62 | for (mr_index in matching_row_indices){ 63 | if (annotated_SNP_file_1$sequence[mr_index] == row$sequence){ 64 | file_1_SNPs_common_with_file_2 <- c(file_1_SNPs_common_with_file_2, 65 | mr_index) 66 | file_1_SNPs_common_with_file_2 <- unique(file_1_SNPs_common_with_file_2) 67 | } 68 | } 69 | } 70 | } 71 | 72 | # Rows in common between file 2 and file 1 73 | for (index in 1:nrow(annotated_SNP_file_1)){ 74 | row = annotated_SNP_file_1[index, ] 75 | if (row$POS %in% annotated_SNP_file_2$POS){ 76 | matching_row_indices = which(annotated_SNP_file_2$POS == row$POS) 77 | for (mr_index in matching_row_indices){ 78 | if (annotated_SNP_file_2$sequence[mr_index] == row$sequence){ 79 | file_2_SNPs_common_with_file_1 <- c(file_2_SNPs_common_with_file_1, 80 | mr_index) 81 | file_2_SNPs_common_with_file_1 <- unique(file_2_SNPs_common_with_file_1) 82 | } 83 | } 84 | } 85 | } 86 | 87 | # Filtering SNPs in common between annotated SNP data frames =================== 88 | 89 | # The matching row values 
produced by the loops in the previous section are
90 | # used to subset each data frame: this is done by selecting non-matching rows
91 | 
92 | # Logical non-membership subsetting is used instead of negative indexing:
92 | # 'df[-integer(0), ]' would wrongly return ZERO rows when no SNPs are in
92 | # common between the two files, whereas this form correctly keeps all rows
92 | annotated_SNP_file_1_unique.df <- annotated_SNP_file_1[!seq_len(nrow(annotated_SNP_file_1)) %in% file_1_SNPs_common_with_file_2, ]
93 | annotated_SNP_file_2_unique.df <- annotated_SNP_file_2[!seq_len(nrow(annotated_SNP_file_2)) %in% file_2_SNPs_common_with_file_1, ]
94 | 
95 | # Checking that the correct number of rows were filtered =======================
96 | 
97 | # If the correct number of rows were filtered, the following statements should
98 | # all return TRUE
99 | 
100 | nrow(annotated_SNP_file_2) == nrow(annotated_SNP_file_2_unique.df) +
101 | length(file_2_SNPs_common_with_file_1)
102 | 
103 | nrow(annotated_SNP_file_1) == nrow(annotated_SNP_file_1_unique.df) +
104 | length(file_1_SNPs_common_with_file_2)
105 | 
106 | # Writing data frames to tab-separated values (.tsv) files =====================
107 | 
108 | # NOTE(review): the original passed 'file = c(annotated_SNP_file_1, "_filtered.tsv",'
108 | # which had unbalanced parentheses (a syntax error) and used c() on a variable
108 | # that the 'Setup' section overwrote with a data frame; fixed output file
108 | # names are used instead
108 | write.table(annotated_SNP_file_1_unique.df,
109 |             file = "annotated_SNP_file_1_filtered.tsv",
110 |             fileEncoding = "UTF-8",
111 |             sep = "\t",
112 |             row.names = FALSE)
113 | 
114 | write.table(annotated_SNP_file_2_unique.df,
115 |             file = "annotated_SNP_file_2_filtered.tsv",
116 |             fileEncoding = "UTF-8",
117 |             sep = "\t",
118 |             row.names = FALSE)
119 | 
120 | # Exiting ======================================================================
121 | quit(save = "no")
122 | 
--------------------------------------------------------------------------------
/scripts/annotating_snps.R:
--------------------------------------------------------------------------------
1 | # https://github.com/rnnh/bioinfo-notebook.git
2 | 
3 | # Aim ==========================================================================
4 | 
5 | # The aim of this script is to cross-reference annotations of genome assemblies
6 | # with VCF files containing SNPs of sequencing reads aligned against those
7 | # genome assemblies.
If a SNP falls within- or upstream of- an annotated 8 | # genome feature (start codon, stop codon, CDS, etc.), the script will return 9 | # that feature along with the SNP. 10 | 11 | # Selecting files and parameters =============================================== 12 | 13 | # - The VCF and GFF files to be cross-referenced are specified in this 14 | # section. For this script to work, these files need to use the same 15 | # sequence names: e.g. if the first sequence in the VCF is called "chrI", 16 | # there should be a corresponding sequence called "chrI" in the GFF file. 17 | # - The VCF and GFF files should be in the directory 18 | # '~/bioinfo-notebook/data/'. 19 | # - The number of lines in the VCF file header should be specified in the 20 | # 'VCF_header.int' variable. This is the number of lines that begin with '#' 21 | # in the VCF file. 22 | # - The variable 'upstream.int' is used to determine how far upstream from an 23 | # annotated feature a SNP can be. This can be set to 0 if you do not want 24 | # upstream SNPs to be considered. Setting it to 1000 will mean that SNPs 25 | # up to 1,000 bases/1kb upstream from a feature will be annotated. 26 | # - The variable 'output_name' is used to specify the name of the output file, 27 | # which should end in '.tsv' as it will be a tab-separated values text file. 
28 | 29 | GFF_file <- "<.gff File name here>" 30 | VCF_file <- "<.vcf File name here>" 31 | VCF_header.int <- as.integer("") 32 | upstream.int <- as.integer("% 80 | select(-ID, -FORMAT, -FILTER) %>% 81 | filter(POS >= (start - upstream) & 82 | POS <= end) 83 | 84 | # Removing redundant data frames 85 | rm(genome_annotation.df, SNPs.df) 86 | 87 | # Ordering filtered data frame of SNPs with annotations ======================== 88 | attach(SNPs_with_annotations.df) 89 | SNPs_with_annotations.df <- SNPs_with_annotations.df[order(sequence, start, end), ] 90 | detach(SNPs_with_annotations.df) 91 | 92 | # Exporting SNPs with annotations to tab-separated value (.tsv) file =========== 93 | write.table(SNPs_with_annotations.df, 94 | file = output_name, 95 | fileEncoding = "UTF-8", 96 | sep = "\t", 97 | row.names = FALSE) 98 | 99 | # Exiting ====================================================================== 100 | quit(save = "no") 101 | -------------------------------------------------------------------------------- /scripts/combining_featCount_tables.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # https://github.com/rnnh/bioinfo-notebook.git 3 | # -*- coding: utf-8 -*- 4 | """ 5 | Created on Wed Mar 18 12:08:41 2020 6 | 7 | @author: ronan 8 | 9 | This script creates a single CSV feature count table from the featureCounts 10 | output tables in the target directory. 11 | 12 | This combined feature count table can be used for differential expression 13 | analysis (e.g. using DESeq2 or edgeR in R). 
14 | """ 15 | 16 | # Loading required libraries 17 | from time import gmtime, strftime 18 | import pandas as pd 19 | import argparse 20 | import sys 21 | import os 22 | 23 | # Parsing command line arguments 24 | parser = argparse.ArgumentParser( 25 | description = "Combines the featureCounts output tables in the target \ 26 | directory.") 27 | 28 | # -d PATH -o CUSTOM_FILENAME 29 | parser.add_argument("-d", "--directory", dest = "path", 30 | help = "path to target directory. \ 31 | Default: current directory") 32 | parser.add_argument("-o", "--output", dest ="custom_filename", 33 | help = "output filename.\ 34 | Default: featCounts_{species}_{date}.csv") 35 | 36 | args = parser.parse_args() 37 | 38 | # Changing to the target directory 39 | if args.path is not None: 40 | path = args.path 41 | else: 42 | path = os.getcwd() 43 | os.chdir(path) 44 | 45 | # Creating variables 46 | fixed_headers = ["Geneid", "Chromosome", "Start", "End", "Strand", "Length"] 47 | target_file_prefix = "feature_counts_" 48 | date = strftime("%Y%m%d", gmtime()) 49 | counts_table = pd.DataFrame() 50 | output_filename = str() 51 | target_file_count = 0 52 | species_name = str() 53 | srr = str() 54 | 55 | # Iterating through files in target directory, combining feature counts 56 | # into one DataFrame object ("counts_table") 57 | for filename in os.listdir(): 58 | if filename.startswith(target_file_prefix): 59 | target_file_count = target_file_count + 1 60 | filename_list = filename.split("_") 61 | srr = filename_list[2] 62 | species_name = filename_list[3] + "_" + filename_list[4] 63 | featCounts_df = pd.read_csv(filename, sep = "\t", 64 | lineterminator = '\n', skiprows = 1, 65 | header = 0) 66 | featCounts_headers = fixed_headers.copy() 67 | featCounts_headers += [srr] 68 | featCounts_df.columns = featCounts_headers 69 | gene_ids = featCounts_df["Geneid"] 70 | counts = featCounts_df[srr] 71 | # Add the gene IDs and counts to the counts_table DataFrame as columns 72 | # if it's empty; 
otherwise add the counts only 73 | if counts_table.empty: 74 | counts_table = pd.concat([gene_ids, counts], axis = 1, 75 | sort = False) 76 | else: 77 | counts_table = pd.concat([counts_table, counts], axis = 1, 78 | sort = False) 79 | del featCounts_headers 80 | 81 | if target_file_count == 0: 82 | # Exiting script if there are no target files in the target directory 83 | print("ERROR: There are no featureCount files in the target directory. \n") 84 | parser.print_help(sys.stderr) 85 | exit 86 | else: 87 | # Exporting counts_table DataFrame as a CSV file 88 | if args.custom_filename is not None: 89 | output_filename = args.custom_filename 90 | else: 91 | output_filename = "featCounts_" + species_name + "_" + date + ".csv" 92 | counts_table.to_csv(output_filename, index = False) 93 | -------------------------------------------------------------------------------- /scripts/fastq-dump_to_featureCounts.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | # https://github.com/rnnh/bioinfo-notebook.git 3 | 4 | # Help/usage text 5 | usage="$(basename "$0") [options] -a|--annotation \ 6 | -f|--fasta \n 7 | \n 8 | This script downloads FASTQ reads from NCBI's SRA, aligns them to an annotated \n 9 | genome using bowtie2, and generates gene count table(s) using featureCounts.\n 10 | It can take a single SRR ID as an input, or multiple SRR IDs separated by\n 11 | spaces.\n 12 | \n 13 | Required arguments: \n 14 | \t -a | --annotation\t input genome annotation file \n 15 | \t -f | --fasta\t\t input FASTA file for annotated genome \n 16 | \t SRR ID(s)\t\t Sequence Read Archive Run ID(s) (SRR...) 
\n 17 | \n 18 | Optional arguments: \n 19 | \t -h | --help\t\t show this help text and exit \n 20 | \t -p | --processors\t number (n) of processors to use (default: 1) \n 21 | \t --fastq-dump\t\t use 'fastq-dump' instead of the 'fasterq-dump'\n 22 | \t --verbose\t\t make output of script more verbose\n 23 | \t --removetemp\t\t remove read and alignment files once they are\n 24 | \t \t\t\t no longer needed (minimises disk space needed) \n 25 | \t --log\t\t\t redirect terminal output to log file 26 | " 27 | 28 | # Setting FASTQDUMP to 0 29 | # This will be changed to "1" if --fastq-dump is given as an argument, 30 | # resulting in fastq-dump being used instead of the default fasterq-dump 31 | FASTQDUMP=0 32 | 33 | # Setting VERBOSE to 0 34 | # This will be changed to "1" if --verbose is given as an argument, 35 | # resulting in more verbose script output 36 | VERBOSE=0 37 | 38 | # Setting REMOVETEMP to 0 39 | # This will be changed to "1" if --removetemp is given as an argument, 40 | # resulting in *.fastq, *.fastq.gz, *.sam, *.bam and *.tsv.summary, being 41 | # removed once they are no longer needed to create a featureCounts table 42 | REMOVETEMP=0 43 | 44 | # Setting LOG to 0 45 | # This will be changed to "1" if --log is given as an argument, 46 | # resulting in the terminal output from this script being redirected to a log 47 | # file 48 | LOG=0 49 | 50 | # Setting default number of PROCESSORS to use 51 | PROCESSORS=1 52 | 53 | # Creating an empty variable for SRRs to be downloaded and aligned to genome 54 | SRRs="" 55 | 56 | # Print usage instructions if script is called without any arguments 57 | if [ "$1" = "" ] ; then 58 | echo -e "ERROR: please provide input files. 
\n" 59 | echo -e $usage 60 | exit 1 61 | fi 62 | 63 | # Iterating through the input arguments with a while loop 64 | while (( "$#" )); do 65 | case "$1" in 66 | -h|--help) 67 | echo -e $usage 68 | exit 69 | ;; 70 | -a|--annotation) 71 | ANNOTATION=$2 72 | shift 2 73 | ;; 74 | -f|--fasta) 75 | FASTA=$2 76 | shift 2 77 | ;; 78 | -p|--processors) 79 | PROCESSORS=$2 80 | shift 2 81 | ;; 82 | --fastq-dump) 83 | FASTQDUMP=1 84 | shift 85 | ;; 86 | --verbose) 87 | VERBOSE=1 88 | shift 89 | ;; 90 | --removetemp) 91 | REMOVETEMP=1 92 | shift 93 | ;; 94 | --log) 95 | LOG=1 96 | shift 97 | ;; 98 | --) # end argument parsing 99 | shift 100 | break 101 | ;; 102 | -*|--*) # unsupported flags 103 | echo -e "ERROR: $1 is an invalid option. \n" >&2 104 | echo -e $usage 105 | exit 1 106 | ;; 107 | *) # preserve SRR ID(s) as positional arguments 108 | SRRs="$SRRs $1" 109 | shift 110 | ;; 111 | esac 112 | done 113 | 114 | if [ $LOG -eq "1" ] 115 | then 116 | # Redirecting terminal output to log file 117 | exec 3>&1 4>&2 118 | trap 'exec 2>&4 1>&3' 0 1 2 3 119 | exec 1>fd_to_fC_$(date +%Y%m%d_%H%M%S).log 2>&1 120 | fi 121 | 122 | # Beginning the main body of the script 123 | # The sleep commands ("sleep 1s", "sleep 2s") slow down the script to make 124 | # the output more readable in real-time 125 | 126 | echo -e ~~~~~~~~~~~~~ F A S T Q - D U M P t o F E A T U R E C O U N T S ~~~~~~~~~~~~~ 127 | echo Script started: $(date) 128 | 129 | # Loop through the input SRR IDs 130 | for SRR in $SRRs 131 | do 132 | printf "\n" 133 | echo ================================================================================ 134 | echo SRR ID: $SRR 135 | sleep 1s 136 | echo Genome annotation: $ANNOTATION 137 | sleep 1s 138 | echo Genome multi-FASTA file: $FASTA 139 | echo ================================================================================ 140 | sleep 1s 141 | 142 | if [ $VERBOSE -eq "1" ] 143 | then 144 | printf "\n" 145 | echo Listing files in directory ... 
146 | sleep 1s 147 | ls 148 | sleep 2s 149 | fi 150 | 151 | 152 | if [ $FASTQDUMP -eq "1" ] 153 | then 154 | if [ $VERBOSE -eq "1" ] 155 | then 156 | echo Downloading compressed FASTQ reads using fastq-dump... 157 | fi 158 | until fastq-dump --gzip --skip-technical --readids --read-filter pass \ 159 | --dumpbase --split-3 --clip $SRR; do 160 | echo fastq-dump failed, retrying in 10 seconds... 161 | sleep 10s 162 | done 163 | else 164 | if [ $VERBOSE -eq "1" ] 165 | then 166 | echo Downloading FASTQ reads using fasterq-dump... 167 | fi 168 | if [ $LOG -eq "0" ] 169 | then 170 | until fasterq-dump --progress --threads $PROCESSORS $SRR; do 171 | echo fasterq-dump failed, retrying in 10 seconds... 172 | rm -r fasterq.tmp.* 173 | sleep 10s 174 | done 175 | else 176 | until fasterq-dump --threads $PROCESSORS $SRR; do 177 | echo fasterq-dump failed, retrying in 10 seconds... 178 | rm -r fasterq.tmp.* 179 | sleep 10s 180 | done 181 | fi 182 | fi 183 | 184 | if [ $VERBOSE -eq "1" ] 185 | then 186 | sleep 1s 187 | echo Listing files in directory after downloading reads... 188 | sleep 1s 189 | ls 190 | sleep 2s 191 | fi 192 | 193 | # Checking if bowtie2 index of FASTA file exists before creating bowtie2 index 194 | # If bowtie2_$FASTA.1.bt2 (one of the bowtie2 index files) does not exist... 195 | if [ ! -f bowtie2_$FASTA.1.bt2 ] 196 | # ...then create the bowtie2_$FASTA index 197 | then 198 | if [ $VERBOSE -eq "1" ] 199 | then 200 | echo Indexing genome FASTA file using bowtie2-build... 201 | sleep 2s 202 | fi 203 | bowtie2-build $FASTA bowtie2_$FASTA 204 | if [ $VERBOSE -eq "1" ] 205 | then 206 | sleep 1s 207 | echo Listing files in directory after running bowtie2-build... 
208 | sleep 1s
209 | ls
210 | sleep 2s
211 | fi
212 | # Otherwise, print a message confirming that it exists
213 | else
214 | if [ $VERBOSE -eq "1" ]
215 | then
216 | echo The bowtie2 index bowtie2_$FASTA exists
217 | sleep 1s
218 | fi
219 | fi
220 | 
221 | if [ $VERBOSE -eq "1" ]
222 | then
223 | echo Aligning reads to reference genome using bowtie2...
224 | sleep 2s
225 | fi
226 | 
227 | # Checking if fastq-dump or fasterq-dump was used, as this will result
228 | # in different filenames
229 | if [ $FASTQDUMP -eq "1" ]
230 | then
231 | bowtie2 -p $PROCESSORS --no-unal -x bowtie2_$FASTA \
232 | -1 $SRR\_pass_1.fastq.gz -2 $SRR\_pass_2.fastq.gz \
233 | -S $SRR\_$FASTA.sam
234 | else
235 | bowtie2 -p $PROCESSORS --no-unal -x bowtie2_$FASTA \
236 | -1 $SRR\_1.fastq -2 $SRR\_2.fastq \
237 | -S $SRR\_$FASTA.sam
238 | fi
239 | 
240 | # NOTE(review): the original tested the never-defined variable $REMOVEREADS
240 | # and was missing the space before ']' (a test syntax error at runtime);
240 | # $REMOVETEMP is the flag actually set by the --removetemp option, and
240 | # removing the reads here once alignment is done matches its documented
240 | # purpose of minimising disk space
240 | if [ $REMOVETEMP -eq "1" ]
241 | then
242 | echo Removing .fastq reads...
243 | rm *.fastq *.fastq.gz
244 | fi
245 | 
246 | if [ $VERBOSE -eq "1" ]
247 | then
248 | sleep 1s
249 | echo Listing files in directory after running bowtie2...
250 | sleep 1s
251 | ls
252 | sleep 2s
253 | 
254 | echo Converting alignment from SAM to BAM format using samtools view...
255 | sleep 2s
256 | fi
257 | samtools view -@ $PROCESSORS -Sb $SRR\_$FASTA.sam \
258 | > $SRR\_$FASTA.bam
259 | 
260 | if [ $VERBOSE -eq "1" ]
261 | then
262 | sleep 1s
263 | echo Listing files in directory after running samtools view...
264 | sleep 1s
265 | ls
266 | sleep 2s
267 | 
268 | echo Sorting the BAM file using samtools sort...
269 | sleep 2s
270 | fi
271 | samtools sort -@ $PROCESSORS $SRR\_$FASTA.bam \
272 | -o sorted_$SRR\_$FASTA.bam
273 | 
274 | if [ $VERBOSE -eq "1" ]
275 | then
276 | sleep 1s
277 | echo Listing files in directory after running samtools sort...
278 | sleep 1s
279 | ls
280 | sleep 2s
281 | 
282 | echo Generating count table using featureCounts...
283 | sleep 2s 284 | fi 285 | featureCounts -p -s 2 -T $PROCESSORS -a $ANNOTATION \ 286 | -o feature_counts_$SRR\_$FASTA.tsv \ 287 | sorted_$SRR\_$FASTA.bam 288 | 289 | if [ $VERBOSE -eq "1" ] 290 | then 291 | sleep 1s 292 | echo Listing files in directory after running featureCounts... 293 | sleep 1s 294 | ls 295 | sleep 2s 296 | 297 | echo Results written to feature_counts_$SRR\_$FASTA.tsv 298 | sleep 2s 299 | 300 | echo Head of feature_counts_$SRR\_$FASTA.tsv 301 | sleep 2s 302 | head feature_counts_$SRR\_$FASTA.tsv 303 | sleep 2s 304 | 305 | echo Tail of feature_counts_$SRR\_$FASTA.tsv 306 | sleep 2s 307 | tail feature_counts_$SRR\_$FASTA.tsv 308 | sleep 2s 309 | fi 310 | 311 | 312 | if [ $REMOVETEMP -eq "1" ] 313 | then 314 | echo Removing temporary files... 315 | if [ $FASTQDUMP -eq "1" ] 316 | then 317 | rm *.fastq.gz *.sam *.bam *.tsv.summary 318 | else 319 | rm *.fastq *.sam *.bam *.tsv.summary 320 | fi 321 | fi 322 | 323 | done 324 | 325 | echo Script finished: $(date) 326 | -------------------------------------------------------------------------------- /scripts/genome_annotation_SwissProt_CDS.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | # https://github.com/rnnh/bioinfo-notebook.git 3 | 4 | # Help/usage text 5 | usage="$(basename "$0") [-h|--help] [-d|--demo] [-i|--input] \n 6 | [-l|--log -p|--processors n -e|--email] \n 7 | \n 8 | A script to annotate proteins in a genome assembly, using BLASTx with\n 9 | UniProtKB/Swiss-Prot.\n 10 | \n 11 | When run with the arugment '-d' or '--demo' this script...\n 12 | \n 13 | \t 1. Downloads a Saccharomyces cerevisiae S288C genome assembly, and \n 14 | \t the UniProtKB/Swiss-Prot amino acid sequences. \n 15 | \t 2. Creates a BLAST database from the downloaded Swiss-Prot sequences,\n 16 | \t and searches the S. cerevisiae genome against it using BLASTx with an\n 17 | \t E-value threshold of 1e-100. \n 18 | \t 3. 
Filters the BLASTx results, removing results with less than 90%\n 19 | \t identity.\n 20 | \t 4. Creates a genome annotation GFF file from these BLASTx results.\n 21 | \t 5. Adds information to the genome annotation from UniProt (protein\n 22 | \t names, KeGG ortholog information, EC numbers, etc.) \n 23 | \n 24 | The end result ('S_cere.gff') is an annotation of the coding sequences (CDS) \n 25 | in the S. cerevisiae genome that are described in UniProtKB/Swiss-Prot. \n 26 | \n 27 | This script can also be run with the argument '-i' or '--input', which is used\n 28 | to specify a FASTA nucleotide file (.fasta or .fna) to annotate, instead of\n 29 | the demo sequence. The end result is also an annotation of the CDS in the input\n 30 | sequence based on UniProtKB/Swiss-Prot, called '.gff'.\n 31 | \n 32 | This script should be called from the 'bioinfo-notebook/' directory.The \n 33 | programs required for this script are in the 'bioinfo-notebook' conda \n 34 | environment (bioinfo-notebook/envs/bioinfo-notebook.yml or \n 35 | bioinfo-notebook/envs/bioinfo-notebook.txt). 
\n 36 | If the input file is not in the 'bioinfo-notebook/data/' directory, the full \n 37 | file path should be given.\n 38 | \n 39 | arguments: \n 40 | \t -h | --help\t\t show this help text and exit \n 41 | \t -i | --input\t\t name of input FASTA nucleotide file to annotate \n 42 | \t -d | --demo\t\t run the script with demonstration inputs\n 43 | \n 44 | optional arguments:\n 45 | \t -l | --log\t\t redirect terminal output to a log file \n 46 | \t -p | --processors\t set the number (n) of processors to use\n 47 | \t\t\t\t (default: 1) \n 48 | \t -e | --email\t\t contact email for UniProt queries 49 | " 50 | 51 | MAKELOG=false 52 | PROCESSORS=1 53 | EMAIL="none" 54 | DEMO=false 55 | INPUT="" 56 | 57 | # Iterating through the input arguments with a while loop 58 | while (( "$#" )); do 59 | case "$1" in 60 | -h|--help) 61 | echo -e $usage 62 | exit 63 | ;; 64 | -i|--input) 65 | INPUT=$2 66 | shift 2 67 | ;; 68 | -d|--demo) 69 | DEMO=true 70 | shift 1 71 | ;; 72 | -l|--log) 73 | MAKELOG=true 74 | shift 1 75 | ;; 76 | -p|--processors) 77 | PROCESSORS=$2 78 | shift 2 79 | ;; 80 | -e|--email) 81 | EMAIL=$2 82 | shift 2 83 | ;; 84 | --) # end argument parsing 85 | shift 86 | break 87 | ;; 88 | -*|--*) # unsupported flags 89 | echo -e "ERROR: $1 is an invalid option. \n" >&2 90 | echo -e $usage 91 | exit 1 92 | ;; 93 | esac 94 | done 95 | 96 | cd data 97 | 98 | if $MAKELOG ; then 99 | # Creating results directory, if it does not already exist 100 | if [ ! -d ../results ]; then 101 | mkdir ../results 102 | fi 103 | # CREATING LOG FILE 104 | # Terminal output directed to the file 'genome_annotation_[date]_[time].log' 105 | exec 3>&1 4>&2 106 | trap 'exec 2>&4 1>&3' 0 1 2 3 107 | exec 1>../results/genome_annotation_$(date +%Y%m%d_%H%M).log 2>&1 108 | fi 109 | 110 | echo "$(date +%Y/%m/%d\ %H:%M) Beginning genome annotation script." 111 | 112 | if $DEMO ; then 113 | echo Downloading genome FASTA file... 
114 | curl -s -o S_cere.fna.gz ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/146\
115 | /045/GCF_000146045.2_R64/GCF_000146045.2_R64_genomic.fna.gz
116 | 
117 | echo Decompressing genome FASTA file...
118 | gunzip S_cere.fna.gz
119 | 
120 | fi
121 | 
122 | echo Downloading Swiss-Prot sequences...
123 | # NOTE(review): 'curl -o' writes straight to the output file and produces no
123 | # stdout; the original piped this into 'xargs -n 1 -P $PROCESSORS' with no
123 | # command, which did nothing useful and could mask curl's exit status, so
123 | # the stray pipe has been removed
123 | curl -s -o uniprot_sprot.fasta.gz ftp://ftp.uniprot.org/pub/databases/uniprot/\
124 | current_release/knowledgebase/complete/uniprot_sprot.fasta.gz
125 | 
126 | 
127 | echo Decompressing Swiss-Prot sequences...
128 | gunzip uniprot_sprot.fasta.gz
129 | 
130 | echo Creating BLAST database...
131 | makeblastdb -dbtype prot -in uniprot_sprot.fasta -out SwissProt
132 | 
133 | echo Removing Swiss-Prot sequences...
134 | rm -v uniprot_sprot.fasta
135 | 
136 | if $DEMO ; then
137 | echo Searching genome FASTA file against Swiss-Prot with BLASTx...
138 | blastx -num_threads $PROCESSORS -evalue 1e-100 -query S_cere.fna \
139 | -db SwissProt -outfmt 6 -out blastx_SwissProt_S_cere_unfiltered.tsv
140 | 
141 | echo Removing Swiss-Prot database...
142 | rm -v SwissProt*
143 | 
144 | echo Filtering BLASTx results with percentage identity less than 90% with awk...
145 | awk '{ if ($3 >= 90) { print } }' blastx_SwissProt_S_cere_unfiltered.tsv \
146 | > blastx_SwissProt_S_cere.tsv
147 | 
148 | echo Removing unfiltered BLASTx results...
149 | rm -v blastx_SwissProt_S_cere_unfiltered.tsv
150 | 
151 | echo Creating genome annotation GFF file from BLASTx results...
152 | blast2gff uniprot --fasta-file S_cere.fna blastx_SwissProt_S_cere.tsv \
153 | S_cere_without_UniProt_info.gff
154 | 
155 | echo Adding information to genome annotation from UniProt...
156 | until add-gff-info uniprot --email $EMAIL --protein-names --enzymes \
157 | --kegg_orthologs --eggnog --taxon-id S_cere_without_UniProt_info.gff \
158 | S_cere.gff; do
159 | echo add-gff-info failed, retrying in 10 seconds...
160 | rm -v S_cere.gff 161 | sleep 10s 162 | done 163 | 164 | echo Removing copy of genome annotation without added UniProt info... 165 | rm -v S_cere_without_UniProt_info.gff 166 | 167 | echo First line of finished genome annotation... 168 | head -n 1 S_cere.gff 169 | fi 170 | 171 | if [ ! -z $INPUT ]; then 172 | echo Searching genome FASTA file against Swiss-Prot with BLASTx... 173 | blastx -num_threads $PROCESSORS -evalue 1e-100 -query $INPUT \ 174 | -db SwissProt -outfmt 6 -out blastx_SwissProt_$INPUT\_unfiltered.tsv 175 | 176 | echo Removing Swiss-Prot database... 177 | rm -v SwissProt* 178 | 179 | echo Filtering BLASTx results with percentage identity less than 90% with awk... 180 | awk '{ if ($3 >= 90) { print } }' blastx_SwissProt_$INPUT\_unfiltered.tsv \ 181 | > blastx_SwissProt_$INPUT\.tsv 182 | 183 | echo Removing unfiltered BLASTx results... 184 | rm -v blastx_SwissProt_$INPUT\_unfiltered.tsv 185 | 186 | echo Creating genome annotation GFF file from BLASTx results... 187 | blast2gff uniprot --fasta-file $INPUT blastx_SwissProt_$INPUT\.tsv \ 188 | $INPUT\_without_UniProt_info.gff 189 | 190 | echo Adding information to genome annotation from UniProt... 191 | until add-gff-info uniprot --email $EMAIL --protein-names --enzymes \ 192 | --kegg_orthologs --eggnog --taxon-id $INPUT\_without_UniProt_info.gff \ 193 | $INPUT.gff; do 194 | echo add-gff-info failed, retrying in 10 seconds... 195 | rm -v $INPUT.gff 196 | sleep 10s 197 | done 198 | 199 | echo Removing copy of genome annotation without added UniProt info... 200 | rm -v $INPUT\_without_UniProt_info.gff 201 | 202 | echo First line of finished genome annotation... 203 | head -n 1 $INPUT.gff 204 | fi 205 | 206 | echo "$(date +%Y/%m/%d\ %H:%M) Script finished." 207 | -------------------------------------------------------------------------------- /scripts/linux_setup.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash/ 2 | # https://github.com/rnnh/bioinfo-notebook.git 3 | 4 | # Help/usage text 5 | usage="$(basename "$0") \n 6 | \n 7 | This script downloads and installs Miniconda3, and uses conda to install \n 8 | the 'bioinfo-notebook' virtual environment. \n 9 | \n 10 | Before running this script... \n 11 | \n 12 | \t 1. Please run the following command: \n 13 | \t \t \$ sudo apt-get update \n 14 | \t This will ensure that the software installed will be up-to-date. \n 15 | \n 16 | \t 2. Please ensure that the 'bioinfo-notebook/' directory is in your \n 17 | \t home directory (~). The path to this directory should look like this: \n 18 | \t \t $HOME/bioinfo-notebook \n 19 | \n 20 | The 'bash' command is used to run this script: \n 21 | \t \$ bash $0 \n 22 | \n 23 | Optional arguments: \n 24 | \t -h | --help\t show this help text and exit \n 25 | " 26 | 27 | # Iterating through the input arguments with a while loop 28 | while (( "$#" )); do 29 | case "$1" in 30 | -h|--help) 31 | echo -e $usage 32 | exit 0 33 | ;; 34 | esac 35 | done 36 | 37 | # Changing directory to the home directory ("~" or "$HOME") 38 | cd ~ 39 | 40 | echo Checking if the bioinfo-notebook environment is already installed... 41 | sleep 2s # Slows down script to make terminal output more readable 42 | if [ -d ~/miniconda/envs/bioinfo-notebook ]; then 43 | echo The bioinfo-notebook environment already exists, exiting script. 44 | exit 0 45 | fi 46 | 47 | echo Checking if bioinfo-notebook/ is in the home directory... 48 | sleep 2s # Slows down script to make terminal output more readable 49 | # If bioinfo-notebook/ is not in the home directory... 50 | if [ ! 
-d ~/bioinfo-notebook/ ]; 51 | then 52 | echo ERROR: bioinfo-notebook/ is not in the home directory 53 | echo The home directory is $HOME 54 | echo Please move the bioinfo-notebook/ directory to the home directory, 55 | echo or create a copy of bioinfo-notebook/ in $HOME 56 | exit 1 57 | fi 58 | 59 | echo Downloading Miniconda3 installation script... 60 | sleep 2s # Slows down script to make terminal output more readable 61 | # If the Linux system is 64-bit... 62 | if [ "$(uname -m)" == "x86_64" ]; 63 | then 64 | # Download the script to install the 64-bit version of miniconda 65 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh \ 66 | -O miniconda.sh 67 | # If the Linux system is not 64-bit... 68 | else 69 | # Download the script to install the 32-bit version of miniconda 70 | wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86.sh \ 71 | -O miniconda.sh 72 | fi 73 | 74 | echo Installing Miniconda3... 75 | sleep 2s # Slows down script to make terminal output more readable 76 | bash miniconda.sh -b -p $HOME/miniconda 77 | 78 | echo Miniconda3 installed, removing installation script... 79 | rm -f miniconda.sh 80 | 81 | echo Setting up Miniconda3... 82 | sleep 2s # Slows down script to make terminal output more readable 83 | source "$HOME/miniconda/etc/profile.d/conda.sh" 84 | hash -r 85 | conda config --set always_yes yes --set changeps1 yes \ 86 | --set auto_activate_base false 87 | conda update -q conda 88 | conda init 89 | 90 | echo Displaying information about current conda installation... 91 | sleep 2s # Slows down script to make terminal output more readable 92 | conda info -a 93 | 94 | echo Creating the bioinfo-notebook virtual environment using conda... 95 | sleep 2s # Slows down script to make terminal output more readable 96 | # If the Linux system is 64-bit... 
# If the Linux system is 64-bit...
if [ "$(uname -m)" == "x86_64" ];
then
	# Create the virtual environment using the explicit spec list
	conda create --name bioinfo-notebook \
		--file ~/bioinfo-notebook/envs/bioinfo-notebook.txt
# If the Linux system is not 64-bit...
else
	# Create the virtual environment using an "environment".yml file
	conda env create -f ~/bioinfo-notebook/envs/bioinfo-notebook.yml
fi

echo Removing unused packages and caches using conda...
sleep 2s # Slows down script to make terminal output more readable
conda clean --all --yes

# The strings are quoted so that "\n" reaches 'echo -e' as an escape sequence
# instead of being stripped to a literal "n" by the shell
echo -e "Script finished!\n"

echo -e "Please restart your Linux system for these changes to take effect.\n"

echo The bioinfo-notebook environment can be activated using the command...
echo -e "\t \$ conda activate bioinfo-notebook"
echo A conda virtual environment can be deactivated using the command...
echo -e "\t \$ conda deactivate"
--------------------------------------------------------------------------------
/scripts/snp_calling.sh:
--------------------------------------------------------------------------------
#! /bin/bash
# https://github.com/rnnh/bioinfo-notebook.git
#
# Aligns paired-end FASTQ reads to a reference genome with bowtie2, converts
# and sorts the alignment with SAMtools, and calls/filters variants with
# BCFtools, producing a VCF file. With -d/--demo, example S. cerevisiae data
# is downloaded from NCBI and run through the same pipeline.

# Help/usage text
usage="$(basename "$0") [-h|--help] [-1|--one -2|--two -r|--reference] \n
[-d|--demo] [-o|--output -l|--log -p|--processors n] \n
\n
This script aligns sequencing reads to a reference genome, and finds genetic \n
variants (SNPs/indels) based on this alignment, which are written to a variant\n
call format (VCF) file.\n
\n
Calling this script with the argument '-d' or '--demo' will run this script \n
using Saccharomyces cerevisiae FASTQ sequencing reads and a Saccharomyces \n
cerevisiae reference genome, which will be downloaded from NCBI. \n
\n
This script should be called from the 'bioinfo-notebook/' directory. The \n
programs required for this script are in the 'bioinfo-notebook' conda \n
environment (bioinfo-notebook/envs/bioinfo-notebook.yml or \n
bioinfo-notebook/envs/bioinfo-notebook.txt). \n
If the input files are not in the 'bioinfo-notebook/data/' directory, the full \n
file paths should be given.\n\n
\n
arguments: \n
\t -h | --help\t\t show this help text and exit \n
\t -1 | --one\t\t forward reads to align with reference sequence \n
\t\t\t\t (FASTQ: .fastq or .fastq.gz) \n
\t -2 | --two\t\t reverse reads to align with reference sequence \n
\t\t\t\t (FASTQ: .fastq or .fastq.gz) \n
\t -r | --reference\t reference sequence to align reads against \n
\t\t\t\t (FASTA nucleotide file: .fna) \n
\t -d | --demo\t\t run the script with demonstration inputs\n
\n
optional arguments: \n
\t -o | --output\t\t optional: name of final output file \n
\t\t\t\t (default: 'reference_seq_vs_reads_var.vcf', or \n
\t\t\t\t 'S_cere_DRR237290_var.vcf' if demo is used). \n
\t -l | --log\t\t redirect terminal output to a log file in the \n
\t\t\t\t directory bioinfo-notebook/results/ \n
\t -p | --processors\t optional: set the number (n) of processors to \n
\t\t\t\t use (default: 1) \n
"

# Default parameter values
MAKELOG=false
PROCESSORS=1
DEMO=false
ONE=""
TWO=""
REFERENCE=""
OUTPUT=""

# Iterating through the input arguments with a while loop
while (( "$#" )); do
	case "$1" in
		-h|--help)
			# "$usage" is quoted so the embedded \n and \t escape
			# sequences reach 'echo -e' intact, without word
			# splitting or glob expansion
			echo -e "$usage"
			exit
			;;
		-1|--one)
			ONE=$2
			shift 2
			;;
		-2|--two)
			TWO=$2
			shift 2
			;;
		-r|--reference)
			REFERENCE=$2
			shift 2
			;;
		-o|--output)
			OUTPUT=$2
			shift 2
			;;
		-d|--demo)
			DEMO=true
			shift 1
			;;
		-l|--log)
			MAKELOG=true
			shift 1
			;;
		-p|--processors)
			PROCESSORS=$2
			shift 2
			;;
		--) # end argument parsing
			shift
			break
			;;
		-*|--*) # unsupported flags
			echo -e "ERROR: $1 is an invalid option. \n" >&2
			echo -e "$usage"
			exit 1
			;;
		*) # positional arguments are not used by this script;
		   # without this default case an unrecognised argument
		   # would never be shifted and the loop would spin forever
			echo -e "ERROR: $1 is an invalid argument. \n" >&2
			echo -e "$usage"
			exit 1
			;;
	esac
done

# Unless the demo is being run, reads and a reference sequence are required
if ! $DEMO ; then
	if [ -z "$ONE" ] || [ -z "$TWO" ] || [ -z "$REFERENCE" ]; then
		echo -e "ERROR: -1/--one, -2/--two and -r/--reference are required unless -d/--demo is used. \n" >&2
		echo -e "$usage"
		exit 1
	fi
fi

cd ~/bioinfo-notebook/data/

if $MAKELOG ; then
	# Creating results directory, if it does not already exist
	if [ ! -d ../results ]; then
		mkdir ../results
	fi
	# CREATING LOG FILE
	# Terminal output directed to the file 'snp_calling_[date]_[time].log'
	# The original stdout/stderr are saved on FDs 3 and 4 and restored by
	# the trap when the script exits or is interrupted
	exec 3>&1 4>&2
	trap 'exec 2>&4 1>&3' 0 1 2 3
	exec 1>../results/snp_calling_$(date +%Y%m%d_%H%M).log 2>&1
fi

echo "$(date +%Y/%m/%d\ %H:%M) Beginning SNP calling script."

if $DEMO ; then
	echo Downloading reads...
	# SRA downloads can fail intermittently, so retry until success
	until fastq-dump --gzip --skip-technical --readids --read-filter pass \
		--dumpbase --split-files --clip DRR237290; do
		echo fastq-dump failed, retrying in 10 seconds...
		sleep 10s
	done

	echo Downloading reference sequence...
	curl -s --remote-name --remote-time ftp://ftp.ncbi.nlm.nih.gov/genomes/all/\
GCF/000/146/045/GCF_000146045.2_R64/GCF_000146045.2_R64_genomic.fna.gz

	echo Decompressing reference sequence...
	gunzip GCF_000146045.2_R64_genomic.fna.gz

	echo Indexing reference sequence for bowtie2...
	bowtie2-build GCF_000146045.2_R64_genomic.fna S_cere_ref_seq

	echo Aligning reads to the reference genome...
	bowtie2 --no-unal -p $PROCESSORS -x S_cere_ref_seq \
		-1 DRR237290_pass_1.fastq.gz -2 DRR237290_pass_2.fastq.gz \
		-S S_cere_DRR237290_alignment.sam

	echo Converting SAM alignment to sorted BAM alignment...
	samtools view -@ $PROCESSORS -Sb \
		-o S_cere_DRR237290_alignment_unsorted.bam S_cere_DRR237290_alignment.sam

	samtools sort -@ $PROCESSORS -O bam -l 9 -o S_cere_DRR237290_alignment.bam \
		S_cere_DRR237290_alignment_unsorted.bam

	echo Removing redundant alignment files...
	rm -v S_cere_DRR237290_alignment.sam S_cere_DRR237290_alignment_unsorted.bam

	echo Indexing reference sequence for SAMtools...
	samtools faidx GCF_000146045.2_R64_genomic.fna

	echo Generating genotype variant likelihoods with BCFtools...
	bcftools mpileup --max-depth 10000 --threads $PROCESSORS \
		-f GCF_000146045.2_R64_genomic.fna \
		-o S_cere_DRR237290_full.bcf S_cere_DRR237290_alignment.bam

	echo Variant calling with BCFtools...
	bcftools call -O b --threads $PROCESSORS -vc --ploidy 1 -p 0.05 \
		-o S_cere_DRR237290_var_unfiltered.bcf S_cere_DRR237290_full.bcf

	echo Removing redundant BCF file...
	rm -v S_cere_DRR237290_full.bcf

	# "$OUTPUT" is quoted so '[ -z ]' cannot break on spaces/empty expansion
	if [ -z "$OUTPUT" ]; then
		echo Variant filtering with BCFtools filter...
		bcftools filter --threads $PROCESSORS -i '%QUAL>=20' -O v \
			-o S_cere_DRR237290_var.vcf S_cere_DRR237290_var_unfiltered.bcf

		echo Head of VCF file...
		head S_cere_DRR237290_var.vcf

		echo Tail of VCF file...
		tail S_cere_DRR237290_var.vcf
	else
		echo Variant filtering with BCFtools filter...
		bcftools filter --threads $PROCESSORS -i '%QUAL>=20' -O v \
			-o $OUTPUT.vcf S_cere_DRR237290_var_unfiltered.bcf

		echo Head of VCF file...
		head $OUTPUT.vcf

		echo Tail of VCF file...
		tail $OUTPUT.vcf
	fi

	echo "$(date +%Y/%m/%d\ %H:%M) Script finished."
	# Exit here so the demo run does not fall through to the generic
	# alignment steps below, which would fail with an empty $REFERENCE
	exit 0
fi

echo Indexing reference sequence for bowtie2...
bowtie2-build $REFERENCE reference_seq

echo Aligning reads to the reference genome...
bowtie2 --no-unal -p $PROCESSORS -x reference_seq \
	-1 $ONE -2 $TWO -S reference_seq_vs_reads_alignment.sam

echo Converting SAM alignment to sorted BAM alignment...
samtools view -@ $PROCESSORS -Sb \
	-o reference_seq_vs_reads_alignment_unsorted.bam \
	reference_seq_vs_reads_alignment.sam

samtools sort -@ $PROCESSORS -O bam -l 9 \
	-o reference_seq_vs_reads_alignment.bam \
	reference_seq_vs_reads_alignment_unsorted.bam

echo Removing redundant alignment files...
rm -v reference_seq_vs_reads_alignment.sam \
	reference_seq_vs_reads_alignment_unsorted.bam

echo Indexing reference sequence for SAMtools...
samtools faidx $REFERENCE

echo Generating genotype variant likelihoods with BCFtools...
bcftools mpileup --max-depth 10000 --threads $PROCESSORS \
	-f $REFERENCE -o reference_seq_vs_reads_full.bcf \
	reference_seq_vs_reads_alignment.bam

echo Variant calling with BCFtools...
bcftools call -O b --threads $PROCESSORS -vc --ploidy 1 -p 0.05 \
	-o reference_seq_vs_reads_var_unfiltered.bcf reference_seq_vs_reads_full.bcf

echo Removing redundant BCF file...
rm -v reference_seq_vs_reads_full.bcf

# "$OUTPUT" is quoted so '[ -z ]' cannot break on spaces/empty expansion
if [ -z "$OUTPUT" ]; then
	echo Variant filtering with BCFtools filter...
	bcftools filter --threads $PROCESSORS -i '%QUAL>=20' -O v \
		-o reference_seq_vs_reads_var.vcf reference_seq_vs_reads_var_unfiltered.bcf

	echo Head of VCF file...
	head reference_seq_vs_reads_var.vcf

	echo Tail of VCF file...
	tail reference_seq_vs_reads_var.vcf
else
	echo Variant filtering with BCFtools filter...
	bcftools filter --threads $PROCESSORS -i '%QUAL>=20' -O v \
		-o $OUTPUT.vcf reference_seq_vs_reads_var_unfiltered.bcf

	echo Head of VCF file...
	head $OUTPUT.vcf

	echo Tail of VCF file...
	tail $OUTPUT.vcf
fi

echo "$(date +%Y/%m/%d\ %H:%M) Script finished."
--------------------------------------------------------------------------------