├── doc
│   ├── .gitignore
│   ├── _static
│   │   ├── git_PR.png
│   │   ├── 2014-zhang.png
│   │   ├── git_branch.png
│   │   ├── git_issues.png
│   │   ├── git_remote.png
│   │   ├── git_new_repo.png
│   │   ├── git_octocat.png
│   │   ├── git_overview.png
│   │   ├── git_working.png
│   │   ├── github_repo.png
│   │   ├── bad_quality1.8.png
│   │   ├── git_create_repo.png
│   │   ├── github_commit.png
│   │   ├── good_quality1.8.png
│   │   ├── kmer-trimming.png
│   │   ├── adapter_content_R1.png
│   │   ├── per_base_seq_qual_R1.png
│   │   ├── IBD.kmertrim.compare.mds.png
│   │   ├── IBD.kmertrimmed.compare.mds.pdf
│   │   └── IBD.kmertrim.compare.np.matrix.png
│   ├── 00.getting-started.md
│   ├── index.md
│   ├── 03.lab-notebook.md
│   ├── 05.starting-a-work-session.md
│   ├── 04.sourmash-tutorial.md
│   ├── 11.experiment-challenge.md
│   ├── 09.comparing-samples-with-sourmash.md
│   ├── 01.using-farm.md
│   ├── 02.conda.md
│   ├── 10.workflows-and-repeatability.md
│   ├── 12.angus-github.md
│   ├── 08.taxonomic-discovery-with-sourmash.md
│   ├── 07.quality-control.md
│   └── 06.download-assess-ibd-data.md
├── .gitignore
├── .github
│   └── workflows
│       └── gh-pages.yml
├── scripts
│   └── mds_plot.R
├── mkdocs.yml
├── README.md
└── Snakefile
/doc/.gitignore:
--------------------------------------------------------------------------------
1 | raw_data/*
2 | .DS_Store
3 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | raw_data/*
3 | *png
4 |
--------------------------------------------------------------------------------
/doc/_static/git_PR.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dib-lab/2020-NSURP/master/doc/_static/git_PR.png
--------------------------------------------------------------------------------
/doc/_static/2014-zhang.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dib-lab/2020-NSURP/master/doc/_static/2014-zhang.png
--------------------------------------------------------------------------------
/doc/_static/git_branch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dib-lab/2020-NSURP/master/doc/_static/git_branch.png
--------------------------------------------------------------------------------
/doc/_static/git_issues.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dib-lab/2020-NSURP/master/doc/_static/git_issues.png
--------------------------------------------------------------------------------
/doc/_static/git_remote.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dib-lab/2020-NSURP/master/doc/_static/git_remote.png
--------------------------------------------------------------------------------
/doc/_static/git_new_repo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dib-lab/2020-NSURP/master/doc/_static/git_new_repo.png
--------------------------------------------------------------------------------
/doc/_static/git_octocat.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dib-lab/2020-NSURP/master/doc/_static/git_octocat.png
--------------------------------------------------------------------------------
/doc/_static/git_overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dib-lab/2020-NSURP/master/doc/_static/git_overview.png
--------------------------------------------------------------------------------
/doc/_static/git_working.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dib-lab/2020-NSURP/master/doc/_static/git_working.png
--------------------------------------------------------------------------------
/doc/_static/github_repo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dib-lab/2020-NSURP/master/doc/_static/github_repo.png
--------------------------------------------------------------------------------
/doc/_static/bad_quality1.8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dib-lab/2020-NSURP/master/doc/_static/bad_quality1.8.png
--------------------------------------------------------------------------------
/doc/_static/git_create_repo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dib-lab/2020-NSURP/master/doc/_static/git_create_repo.png
--------------------------------------------------------------------------------
/doc/_static/github_commit.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dib-lab/2020-NSURP/master/doc/_static/github_commit.png
--------------------------------------------------------------------------------
/doc/_static/good_quality1.8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dib-lab/2020-NSURP/master/doc/_static/good_quality1.8.png
--------------------------------------------------------------------------------
/doc/_static/kmer-trimming.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dib-lab/2020-NSURP/master/doc/_static/kmer-trimming.png
--------------------------------------------------------------------------------
/doc/_static/adapter_content_R1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dib-lab/2020-NSURP/master/doc/_static/adapter_content_R1.png
--------------------------------------------------------------------------------
/doc/_static/per_base_seq_qual_R1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dib-lab/2020-NSURP/master/doc/_static/per_base_seq_qual_R1.png
--------------------------------------------------------------------------------
/doc/_static/IBD.kmertrim.compare.mds.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dib-lab/2020-NSURP/master/doc/_static/IBD.kmertrim.compare.mds.png
--------------------------------------------------------------------------------
/doc/_static/IBD.kmertrimmed.compare.mds.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dib-lab/2020-NSURP/master/doc/_static/IBD.kmertrimmed.compare.mds.pdf
--------------------------------------------------------------------------------
/doc/_static/IBD.kmertrim.compare.np.matrix.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dib-lab/2020-NSURP/master/doc/_static/IBD.kmertrim.compare.np.matrix.png
--------------------------------------------------------------------------------
/.github/workflows/gh-pages.yml:
--------------------------------------------------------------------------------
1 | name: build and deploy mkdocs to github pages
2 |
3 | on:
4 |   push:
5 |     branches:
6 |       - master
7 | jobs:
8 |   deploy:
9 |     runs-on: ubuntu-latest
10 |     steps:
11 |       - uses: actions/checkout@v2
12 |       - uses: actions/setup-python@v2
13 |         with:
14 |           python-version: 3.x
15 |       - run: pip install mkdocs-material
16 |       - run: mkdocs gh-deploy --force
17 |
--------------------------------------------------------------------------------
/scripts/mds_plot.R:
--------------------------------------------------------------------------------
1 | # slightly modified from https://raw.githubusercontent.com/ngs-docs/2018-cicese-metatranscriptomics/master/scripts/mds_plot.R
2 | args = commandArgs(trailingOnly=TRUE)
3 |
4 | library(ggplot2)
5 | library(ggrepel)
6 |
7 | comp <- read.csv(args[1])
8 |
9 | # Label the rows
10 | rownames(comp) <- colnames(comp)
11 |
12 | # Transform for plotting
13 | comp <- as.matrix(comp)
14 |
15 | fit <- dist(comp)
16 | fit <- cmdscale(fit)
17 | fit <- as.data.frame(fit)
18 |
19 | fit$lab <- rownames(fit)
20 |
21 | plt <- ggplot(fit, aes(x = V1, y = V2)) +
22 | geom_point() +
23 | geom_label_repel(label = fit$lab) +
24 | theme_minimal() +
25 | ggtitle("MDS plot of sourmash compare on reads")
26 |
27 | pdf(file = args[2], width = 6, height = 5)
28 | plt
29 | dev.off()
30 |
--------------------------------------------------------------------------------
/doc/00.getting-started.md:
--------------------------------------------------------------------------------
1 | # NSURP Research Project 2020
2 |
3 | This repository contains step-by-step resources for running a metagenomics workflow.
4 | If you see a mistake or something is not clear, please submit an [issue](https://github.com/bluegenes/2020-NSURP/issues).
5 |
6 | During this project, you will learn how to:
7 | + interact with an HPC (we'll use [Farm](https://www.hpc.ucdavis.edu/posts/about_farm/))
8 | + install and manage software environments using [conda](https://docs.conda.io/en/latest/)
9 | + download sequencing data and other files from the internet and public databases
10 | + interpret and use different file formats in bioinformatics and computing
11 | + conduct quality analysis and control for sequencing data
12 | + determine the taxonomic composition of sequencing reads
13 | + assemble and annotate metagenomic reads
14 | + quickly compare large sequencing datasets
15 | + document workflows using git and GitHub.
16 |
17 | The files in this repository are ordered by execution, meaning file `00*` should be completed before `01*`.
18 |
19 | Most of the work done in this rotation will be completed on Farm.
20 | However, you will need to access Farm from your own computer.
21 | We will use an SSH client to be able to interact with Farm.
22 | If you are using a Mac or running Linux, your computer comes with a program called `Terminal` that we will use as an SSH client.
23 | If you are on a Windows computer running Windows 10, you can install the [Ubuntu Subsystem](https://docs.microsoft.com/en-us/windows/wsl/install-win10).
24 | Otherwise, please follow the instructions for Windows found at this [link](https://hpc-carpentry.github.io/hpc-intro/setup/).
25 |
--------------------------------------------------------------------------------
/doc/index.md:
--------------------------------------------------------------------------------
1 | # NSURP Research Project 2020
2 |
3 | The materials in this repository are designed to facilitate learning bioinformatic techniques while working through a metagenomics project using publicly-available data.
4 | If you see a mistake or something is not clear, please submit an [issue](https://github.com/bluegenes/2020-NSURP/issues).
5 |
6 | During this project, you will learn how to:
7 |
8 | + keep a detailed lab notebook
9 | + interact with an HPC (we'll use [Farm](https://www.hpc.ucdavis.edu/posts/about_farm/))
10 | + install and manage software environments using [conda](https://docs.conda.io/en/latest/)
11 | + download sequencing data and other files from the internet and public databases
12 | + interpret and use different file formats in bioinformatics and computing
13 | + conduct quality analysis and control for sequencing data
14 | + determine the taxonomic composition of sequencing reads
15 | + quickly compare large sequencing datasets
16 | + build reproducible workflows using snakemake
17 | + document workflows using git and GitHub
18 | + troubleshoot errors during your analysis
19 |
20 | Most of the work done in this rotation will be completed on Farm.
21 | However, you will need to access Farm from your own computer.
22 | We will use an SSH client to be able to interact with Farm.
23 | If you are using a Mac or running Linux, your computer comes with a program (e.g. `Terminal` on Mac) that we can use as an SSH client.
24 | If you are on a Windows computer running Windows 10, you can install the [Ubuntu Subsystem](https://docs.microsoft.com/en-us/windows/wsl/install-win10).
25 | Otherwise, please follow the instructions for Windows found at this [link](https://hpc-carpentry.github.io/hpc-intro/setup/).
26 |
--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
1 | site_name: dib-lab NSURP project
2 | site_url: http://dib-lab.github.io/2020-NSURP/
3 | repo_name: 2020-NSURP
4 | repo_url: https://github.com/dib-lab/2020-NSURP
5 | edit_uri: ""
6 |
7 | copyright: 'Copyright © 2020 Lab for Data Intensive Biology at UC Davis'
8 |
9 | # change directory names here
10 | docs_dir: doc
11 | site_dir: site
12 |
13 | theme:
14 |   name: material
15 | 
16 |   # pretty colors! see https://squidfunk.github.io/mkdocs-material/getting-started/#primary-colors
17 |   palette:
18 |     primary: 'deep purple'
19 |     accent: 'indigo'
20 | 
21 |   # fun logos! see https://material.io/icons/
22 |   icon:
23 |     logo: material/school
24 | 
25 |   font:
26 |     text: 'Roboto'
27 |     code: 'Roboto Mono'
28 | 
29 | # optionally give a title for each page or section
30 | nav:
31 |   - 'Home': 'index.md'
32 |   - "Setup":
33 |     - "Setting up Farm": "01.using-farm.md"
34 |     - "Install Conda": "02.conda.md"
35 |     - "Keeping a Lab Notebook": "03.lab-notebook.md"
36 |     - "Try Sourmash!": "04.sourmash-tutorial.md"
37 |     - "Starting a Work Session": "05.starting-a-work-session.md"
38 |   - "IBD Research Project":
39 |     - "Download and Visually Assess": "06.download-assess-ibd-data.md"
40 |     - "Quality Control": "07.quality-control.md"
41 |     - "Taxonomic Discovery with Sourmash": "08.taxonomic-discovery-with-sourmash.md"
42 |     - "Comparing Samples with Sourmash": "09.comparing-samples-with-sourmash.md"
43 |     - "Experiment Challenge": "11.experiment-challenge.md"
44 |   - "Advanced Modules":
45 |     - "Automation, Workflows, and Repeatability": "10.workflows-and-repeatability.md"
46 |     - "Version Control with Git and GitHub": "12.angus-github.md"
47 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # DIB Lab NSURP Research Project 2020
2 |
3 | [](https://zenodo.org/badge/latestdoi/284118125)
4 |
5 | Welcome to the project home for our NSURP research materials! The [training website](https://dib-lab.github.io/2020-NSURP/) contains a much prettier (rendered) version of these materials - check it out!
6 |
7 | This project is designed to facilitate learning bioinformatic techniques while working through a metagenomics project on publicly-available data.
8 |
9 | During this project, you will learn how to:
10 |
11 | + keep a detailed lab notebook
12 | + interact with an HPC (we'll use [Farm](https://www.hpc.ucdavis.edu/posts/about_farm/))
13 | + install and manage software environments using [conda](https://docs.conda.io/en/latest/)
14 | + download sequencing data and other files from the internet and public databases
15 | + interpret and use different file formats in bioinformatics and computing
16 | + conduct quality analysis and control for sequencing data
17 | + determine the taxonomic composition of sequencing reads
18 | + quickly compare large sequencing datasets
19 | + build reproducible workflows using snakemake
20 | + document workflows using git and GitHub
21 | + troubleshoot errors during your analysis
22 |
23 | The material in this repository was primarily written or aggregated by @bluegenes, @taylorreiter, and @hehouts.
24 | It adapts and builds on tutorials from the following sources:
25 |
26 | + DIB-Lab Metagenomics Rotation Project: https://github.com/dib-lab/dib_rotation
27 | + ANGUS: https://angus.readthedocs.io/en/2019/index.html
28 | + HPC Carpentry: https://hpc-carpentry.github.io/
29 | + Data Carpentry Genomics: https://datacarpentry.org/genomics-workshop/
30 | + CICESE Metatranscriptomics: https://github.com/ngs-docs/2018-cicese-metatranscriptomics
31 |
--------------------------------------------------------------------------------
/doc/03.lab-notebook.md:
--------------------------------------------------------------------------------
1 | # Keeping a Lab Notebook
2 |
3 | Just like with wetlab work, it's important to document everything you do during a computational workflow.
4 |
5 | ## Where to take notes
6 |
7 | For this project, we recommend using [HackMD](https://hackmd.io/?nav=overview).
8 | HackMD works a little like google docs, but enables better formatting for adding code.
9 | You can start with just regular text, but as you start adding screenshots, code blocks, and header sections, you can use Markdown syntax to improve formatting of your rendered notes.
10 | HackMD shows you a few examples of Markdown when you first open a new document, but you can also check out this [markdown tutorial](https://www.markdowntutorial.com/) if you want to learn more.
11 |
12 | ## How to take notes
13 |
14 | It's ok to provide the minimum amount of information necessary to execute a set of commands (i.e., you don't necessarily have to record every failure, every `ls`, etc), but it is important to document each step.
15 | + Copying and pasting the commands that worked is a great way to record them (the `history` command can be helpful to see what you've run in the past).
16 |
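For example, to see the last few commands you ran and save them to a file you can paste from, you could do something like this (the file name is just an example):
```
history | tail -n 20
history > 2020-07-31-farm-commands.txt
```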
17 |
18 | ## Documentation is for you! (And also for others)
19 |
20 | Your lab notebook and documentation are *most* useful for future you.
21 | Keep in mind that things that seem super obvious right now *will likely be forgotten* within a few weeks/months.
22 | Try to be detailed enough so that if you tried to pick up this project again in 3 months (or 3 years), you would be able to understand exactly what to do and how to do it.
23 | With a good lab notebook, you can save yourself from troubleshooting the same errors over and over again, as well as greatly simplify the process of writing up your `Materials and Methods` section for any reports or papers.
24 | Finally, good lab notebooks help keep everyone working on your project (both now and in the future) on the same page.
25 |
26 | ## Other systems for taking notes:
27 |
28 | After this project, if you like HackMD, great! Stick with it. If not:
29 |
30 | + Using google docs or Microsoft Word for documenting computer commands can be hard because of autocorrection. We generally recommend against using these programs.
31 | + Using a plain text editor (Notepad, Notepad++, Atom, BBEdit, TextEdit, nano, vim) avoids autocorrect problems but still has a nice user interface.
32 | + Jupyter Lab is very useful for interactive research explorations and notetaking. We'll try this out in a later section.
33 |
34 |
35 | Eventually, we'll work through using git and GitHub to record and version control our workflows, but for now it's enough to write down everything you do.
36 |
--------------------------------------------------------------------------------
/doc/05.starting-a-work-session.md:
--------------------------------------------------------------------------------
1 | Starting a Work Session on FARM
2 | ===
3 |
4 | Any time you log onto FARM to work on this project, follow these steps to get access to computing resources.
5 |
6 | ## 1. Enter a `tmux` session
7 |
8 | This command creates a new `tmux` session:
9 | ```
10 | tmux new -s nsurp
11 | ```
12 | Note: *If you already created this session, and want to re-join it, use `tmux attach` instead.*
13 |
14 | ## 2. Get access to a compute node
15 |
16 | When you log on to our `FARM` computing system, you'll be on a `login` node, which is basically a computer with very few resources. These login nodes are shared among all users on farm.
17 |
18 | If we run any computing on these login nodes, logging into and navigating farm will slow down for everyone else! Instead, the moment that we want to do anything substantial, we want to ask farm for a more capable computer. Farm uses a "job scheduler" to make sure everyone gets access to the computational resources that they need.
19 |
20 | We can use the following command to get access to a computer that will fit our needs:
21 | ```
22 | srun -p bmm -J nsurp-analysis -t 5:00:00 --mem=10G --pty bash
23 | ```
24 |
25 | > - `srun` uses the computer's job scheduler `SLURM` to allocate you a computer
26 | > - `-p` specifies the job queue we want to use, and is specific to our `farm` accounts.
27 | > - `-J nsurp-analysis` is the "job name" assigned to this session. It can be modified to give your session a more descriptive name, e.g. `-J download-data`
28 | > - `-t` denotes that we want the computer for that amount of time (in this case, 5 hours).
29 | > - `--mem` specifies the amount of memory we'd like the computer to have. Here we've asked for 10 Gigabytes (10G).
30 | > - `--pty bash` specifies that we want the linux shell to be the `bash` shell, which is the standard shell we've been working with so far
31 |
32 |
33 | Note that your home directory (the files you see) will be the same for both the login node and the computer you get access to. This is because both read and write from the same hard drives. So you can create files while in an `srun` session, and they'll still be there for you when you logout.
34 |
35 | ## 3. Activate your Conda Environment
36 |
37 | Once you're in an `srun` session, activate your project environment to get access to the software you've installed
38 |
39 | ```
40 | conda activate nsurp-env
41 | ```
42 |
43 | ## Leaving your tmux session
44 |
45 | Detach from your tmux session by pressing `Ctrl-b`, then `d`
46 |
47 | ## Reattaching to your tmux session
48 |
49 |
50 | ```
51 | tmux attach
52 | ```
53 |
54 | _Note: if you make more than one tmux session, you can see all session names by typing `tmux ls`, and then attaching to the right one with `tmux attach -t <session-name>`_
55 |
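For example, to list your sessions and re-attach to the `nsurp` session we created above:
```
tmux ls
tmux attach -t nsurp
```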
--------------------------------------------------------------------------------
/doc/04.sourmash-tutorial.md:
--------------------------------------------------------------------------------
1 | ---
2 | tags: NSURP, tutorials
3 | ---
4 |
5 | # Getting started with Sourmash: a tutorial
6 |
7 | Let's go through a [sourmash tutorial](https://sourmash.readthedocs.io/en/latest/tutorial-basic.html).
8 |
9 | *Commands (as of 07/31/2020) reproduced here. If doing this at a later date and these commands don't work, run the tutorial using the link above instead!!*
10 |
11 | ## Making signatures, comparing, and searching
12 |
14 | You'll need about 5 GB of free disk space,
15 | and about 5 GB of RAM to search GenBank.
16 |
17 |
18 | First activate your conda environment:
19 | ```
20 | conda activate tutorial
21 | ```
22 |
23 | Because we installed sourmash into our `tutorial` environment, you should now be able to use the `sourmash` command:
24 | ```
25 | sourmash info
26 | ```
27 |
28 | ## Download the data and put it in a folder
29 |
30 | Let's make some folders to keep our project organized:
31 | ```
32 | mkdir smtut
33 | cd smtut
34 | ```
35 |
36 | ```
37 | mkdir data
38 | cd data
39 | ```
40 | Now we can download some data into our `data` folder
41 | ```
42 | wget https://s3.amazonaws.com/public.ged.msu.edu/ecoli_ref-5m.fastq.gz
43 | wget https://s3.amazonaws.com/public.ged.msu.edu/ecoliMG1655.fa.gz
44 | ```
45 | OR
46 | ```
47 | wget https://bit.ly/2CXj13R -O ecoli_ref-5m.fastq.gz
48 | wget https://bit.ly/2PdRbCJ -O ecoliMG1655.fa.gz
49 |
50 | ```
51 |
52 | The file that ends in `.fastq.gz` is a compressed (gzipped) file with DNA sequences in fastq format.
53 |
54 | Let's take a look at our data:
55 |
56 | First, we can unzip the fastq file:
57 | ```
58 | gunzip --keep ecoli_ref-5m.fastq.gz
59 |
60 | ```
61 |
62 | Then use `ls -lh` to compare the sizes;
63 | the zipped file should be smaller than the unzipped file.
64 |
65 | We can look at the unzipped file using
66 | ```
67 | less ecoli_ref-5m.fastq
68 | ```
69 | It will have this fastq format:
70 | 
71 |
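As a quick sanity check, you can also count how many reads are in the unzipped file. Each fastq record takes four lines, so we divide the total line count by four:
```
echo $(( $(wc -l < ecoli_ref-5m.fastq) / 4 ))
```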
72 |
73 |
74 | ### Computing a sourmash signature
75 | Compute a scaled signature from our reads:
76 |
77 | First, let's make a folder to keep our signatures in:
78 | ```
79 | mkdir ~/smtut/sigs
80 | cd ~/smtut/sigs
81 | ```
82 |
83 |
84 |
85 | ## Compare reads to assemblies
86 |
87 | How much of the read content is contained in the reference genome?
88 |
89 | Build a signature for the E. coli reads with `sourmash compute`,
90 |
91 | ```
92 | sourmash compute \
93 | --scaled 10000 \
94 | ~/smtut/data/ecoli_ref*.fastq.gz \
95 | -o ~/smtut/sigs/ecoli-reads.sig \
96 | -k 31
97 | ```
98 |
99 |
100 | Next build a signature for the E. coli genome with `sourmash compute`,
101 |
102 | ```
103 | sourmash compute \
104 | --scaled 10000 \
105 | ~/smtut/data/ecoliMG1655.fa.gz \
106 | -o ~/smtut/sigs/ecoli-genome.sig \
107 | -k 31
108 | ```
109 | This command tells sourmash to make a k-mer signature,
110 | keep only 1 in 10,000 k-mers from the sequence, store the signature where we ask it to, and use a k-mer size of 31.
111 |
112 |
113 | Now evaluate *containment*, that is, what fraction of the read content is
114 | contained in the genome:
115 |
116 | ```
117 | sourmash search -k 31 ecoli-reads.sig ecoli-genome.sig --containment
118 | ```
119 |
120 | and you should see:
121 |
122 | ```
123 | # running sourmash subcommand: search
124 | loaded query: /home/ubuntu/data/ecoli_ref-5m... (k=31, DNA)
125 | loaded 1 signatures from ecoli-genome.sig
126 | 1 matches:
127 | similarity match
128 | ---------- -----
129 | 10.6% /home/ubuntu/data/ecoliMG1655.fa.gz
130 | ```
131 |
132 |
133 | Try the reverse - why is it bigger?
134 |
135 | ```
136 | sourmash search -k 31 ecoli-genome.sig ecoli-reads.sig --containment
137 | ```
138 |
139 |
--------------------------------------------------------------------------------
/doc/11.experiment-challenge.md:
--------------------------------------------------------------------------------
1 | Experiment Challenge
2 | ===
3 |
4 | Thus far, we've run through a set of commands with six metagenome samples.
5 | These have been from two patients, one with Crohn's disease, one without.
6 | But there's not much we can say with just two patients (other than "they look different!").
7 |
8 | Now, we'll add samples from more patients and try to understand the differences between samples.
9 |
10 | ## Workspace Setup
11 |
12 | If you're starting a new work session on FARM, be sure to follow the instructions [here](05.starting-a-work-session.md).
13 |
14 |
15 | ## Download Additional Files
16 |
17 | Move into the raw data folder
18 | ```
19 | cd ~/2020-NSURP/raw_data
20 | ```
21 |
22 | Download the files
23 | ```
24 | wget https://ibdmdb.org/downloads/raw/HMP2/MGX/2018-05-04/CSM7KOJO.tar
25 | wget https://ibdmdb.org/downloads/raw/HMP2/MGX/2018-05-04/HSMA33R1.tar
26 | wget https://ibdmdb.org/downloads/raw/HMP2/MGX/2018-05-04/HSMA33R5.tar
27 |
28 | wget https://ibdmdb.org/downloads/raw/HMP2/MGX/2018-05-04/MSM6J2QP.tar
29 | wget https://ibdmdb.org/downloads/raw/HMP2/MGX/2018-05-04/MSM6J2QF.tar
30 | wget https://ibdmdb.org/downloads/raw/HMP2/MGX/2018-05-04/MSM6J2QH.tar
31 | ```
32 |
33 | Untar each read set
34 | ```
35 | tar xf CSM7KOJO.tar
36 | ```
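If you'd like to untar all of the downloaded files in one go, a small bash loop like this should work (assuming the `.tar` files in this folder are the read sets you just downloaded):
```
for tarfile in *.tar
do
  tar xf ${tarfile}
done
```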
37 |
38 | ## Trim and compute sourmash signatures for these files
39 |
40 | Using your HackMD notes, run the commands for trimming (both adapter and k-mer trimming) on these samples.
41 |
42 | For reference, the [Quality Control](07.quality-control.md) module contains code for running `fastp` and `khmer` trimming; the [Comparing Samples with Sourmash](09.comparing-samples-with-sourmash.md) module contains code for computing sourmash signatures.
43 |
44 |
45 | ## Run Sourmash Compare
46 |
47 | Run `sourmash compare` and `sourmash plot` (as in [Comparing Samples with Sourmash](09.comparing-samples-with-sourmash.md)).
48 |
49 | What do you notice about the sourmash comparison heatmap?
50 |
51 | Which samples are more similar to each other?
52 | Can you guess which patients have Crohn's disease or no IBD by comparing them to your prior samples?
53 | How do samples from the same patient compare to samples from different patients?
54 |
55 |
56 | ## Assess Taxonomic Diversity
57 |
58 | Run `sourmash gather` with the `genbank-k31` database on these new samples.
59 |
60 | Count the total number of species found in each sample. Does it differ between Crohn's disease and non-IBD patients?
61 |
62 |
63 | ## Look at the sample metadata
64 |
65 | What additional information can you glean from looking at the metadata (the data _about_ the data)?
66 |
67 | As usual, let's start by creating a directory for this
68 |
69 | ```
70 | mkdir -p ~/2020-NSURP/metadata
71 | cd ~/2020-NSURP/metadata
72 | ```
73 |
74 | All information about this project can be found [here](https://ibdmdb.org/).
75 |
76 | Download the metadata file [here](https://ibdmdb.org/downloads/metadata/hmp2_metadata_2018-08-20.csv).
77 |
78 | This file contains information for the metagenomics sequencing (which we looked at), but also a number of other assessments.
79 |
80 | This file is a spreadsheet that can be opened in Google docs or viewed with `less`.
81 |
82 | For example, view this file with less like so:
83 | ```
84 | less -S hmp2_metadata_2018-08-20.csv
85 | ```
86 |
87 | This is a very large file. You can get information about a specific sample by searching for the specific sample IDs we used.
88 | For example:
89 | ```
90 | grep HSMA33S4 hmp2_metadata_2018-08-20.csv
91 | ```
92 |
93 | That's still a lot of info - let's get only the info for metagenomics samples:
94 | ```
95 | grep metagenomics hmp2_metadata_2018-08-20.csv | grep HSMA33S4
96 | ```
97 |
98 | The formatting is still a little ugly.
99 | Let's direct the output to a file, and then open it with `less -S`:
100 | ```
101 | grep metagenomics hmp2_metadata_2018-08-20.csv | grep HSMA33S4 > HSMA33S4.csv
102 | less -S HSMA33S4.csv
103 | ```
104 |
105 |
--------------------------------------------------------------------------------
/doc/09.comparing-samples-with-sourmash.md:
--------------------------------------------------------------------------------
1 | Comparing Samples with Sourmash
2 | ===
3 |
4 | Many metagenomics projects are designed to assess the differences between microorganism composition between samples.
5 | There are many ways to get at this question, but we can start by using k-mer profiles of the reads to quickly compare samples using `sourmash compare`.
6 |
7 | ## Workspace Setup
8 |
9 | If you're starting a new work session on FARM, be sure to follow the instructions [here](05.starting-a-work-session.md).
10 |
11 | First, let's make a directory that we will be working in:
12 | ```
13 | cd ~/2020-NSURP
14 | mkdir -p sourmash-compare
15 | cd sourmash-compare
16 | ```
17 |
18 | > Note: we made a directory called `sourmash` for the [taxonomic discovery](08.taxonomic-discovery-with-sourmash.md) module.
19 | > It helps to name files and folders with details that will help you remember what results are contained therein.
20 | > How could the prior module's folder name be changed to be more informative?
21 |
22 |
23 | ## Calculate sourmash signatures
24 |
25 | Now we can calculate signatures for each of the files. This will take 5 or 10 minutes to run.
26 |
27 | ```
28 | for infile in ~/2020-NSURP/kmer-trim/*.kmertrim.fq.gz
29 | do
30 | name=$(basename ${infile} .kmertrim.fq.gz)
31 | echo $name
32 | sourmash compute -k 21,31,51 --scaled 2000 --track-abundance --merge ${name} -o ${name}.kmertrim.sig ${infile}
33 | done
34 | ```
35 |
36 | > Note: Here we used a bash `for` loop to compute signatures on each file with a single set of commands.
37 | > Go through this [tutorial](https://datacarpentry.org/shell-genomics/04-redirection/index.html) to learn about loops!
38 |
39 |
40 | ## Compare sample signatures
41 |
42 | Using these signatures, we can compare our samples.
43 |
44 | ```
45 | sourmash compare -k 31 -o IBD.kmertrim.compare.np --csv IBD.kmertrim.compare.csv --ignore-abundance *sig
46 | ```
47 |
48 | Now let's plot! Sourmash has a built in plot utility that we can take advantage of.
49 | The output is a heatmap.
50 |
51 |
52 | ## Visualize the comparison using sourmash plot
53 |
54 | ```
55 | sourmash plot --labels IBD.kmertrim.compare.np
56 | ```
57 |
58 | This command produces three `png` files:
59 | ```
60 | IBD.kmertrim.compare.np.hist.png
61 | IBD.kmertrim.compare.np.dendro.png
62 | IBD.kmertrim.compare.np.matrix.png
63 | ```
64 |
65 | As usual, these files can be downloaded to your local computer with `scp`
66 | ```
67 | scp -P 2022 -i /path/to/key/file username@farm.cse.ucdavis.edu:~/2020-NSURP/sourmash-compare/*.png ./
68 | ```
69 | If you're on a mac using `zsh`, you may need to replace the `scp` with `noglob scp` in the command above.
70 |
71 | If you're on Windows, you may need to move the files from the download location on your Linux shell over to the Windows side of your computer before opening.
72 |
73 | Once the files are on your local computer, double click to open each file.
74 | The `.matrix.png` is the heatmap file, which will show the pattern of similarity between samples
75 |
76 | It should look like this:
77 |
78 | .
79 |
80 | What does this heatmap tell you about your samples?
81 | For example, does it provide any information about which samples are from IBD patients, and which are from non-IBD patients?
82 |
83 | ## Visualize the comparison in an MDS plot
84 |
85 | We can use this output to make a Multidimensional Scaling plot. MDS plots are
86 | commonly used to visualize similarities and differences between samples.
87 | Here, the strength is that we used the k-mer content of all of our reads to calculate similarity.
88 |
89 | ### Install the R packages ggplot2 and ggrepel
90 |
91 | Since we're installing with conda, it will recognize that it needs to install R alongside these packages, and take care of that for you!
92 | Usually you'll want to be careful of _which_ version of R you're installing, but since we're just doing this one R command, we'll be a little lax about it.
93 |
94 | ```
95 | conda install r-ggplot2 r-ggrepel
96 | ```
97 |
98 | ### Download an R script to make the MDS plot
99 |
100 | The script source is [here](https://raw.githubusercontent.com/dib-lab/2020-NSURP/master/scripts/mds_plot.R) if you are interested!
101 | ```
102 | wget https://raw.githubusercontent.com/dib-lab/2020-NSURP/master/scripts/mds_plot.R
103 | ```
104 |
105 | ### Run the R script
106 | ```
107 | Rscript mds_plot.R IBD.kmertrim.compare.csv IBD.kmertrim.compare.mds.pdf
108 | ```
109 |
110 | This outputs a file `IBD.kmertrim.compare.mds.pdf`.
111 | You can see that file by downloading to your computer.
112 | It should look something like this:
113 |
114 | .
115 |
116 | How do the samples cluster? How does this compare to our heatmap, generated by `sourmash plot`, above?
117 |
118 |
--------------------------------------------------------------------------------
/doc/01.using-farm.md:
--------------------------------------------------------------------------------
1 | # Getting Started on the Farm HPC
2 |
3 | High Performance Computing (HPC) refers to computers that have more capability than a typical personal computer
4 | (i.e. most desktops and laptops).
5 | Many research problems we encounter when analyzing sequencing data require more resources than we have available on our laptops.
6 | For this, we use large, remote compute systems that have more resources available.
7 |
8 | Most universities have access to an HPC (or cluster) that has a large amount of hard drive space to store files, RAM for computing tasks, and CPUs for processing.
9 | Other options for accessing large computers include NSF XSEDE services like Jetstream and paid services like Amazon Web Services or Google Cloud.
10 | We will use the UC Davis [Farm](https://wiki.cse.ucdavis.edu/support/systems/farm) Cluster during this rotation.
11 |
12 | ## Getting an account on Farm
13 |
14 | To be able to use Farm, you need to sign up for an account.
15 | Farm requires key file authentication.
16 | Key files come in pairs like the locks and keys on doors.
17 | The private key file is the first file, and it is like the key to a door.
18 | This file is private and should never be shared with anyone (do not post this file on GitHub, slack, etc.).
19 | The public key file is the second file, and it is like the lock on a door.
20 | It is publicly viewable, but cannot be "unlocked" without the private key file.
21 |
22 | We need to generate a key file pair in order to create a farm account.
23 |
24 | Open the `Terminal` application or the Terminal emulator you installed in the [first lesson](00.getting-started.md).
25 |
26 | Change directories into the `.ssh` folder.
27 | This folder is where key file pairs are typically stored.
28 |
29 | ```
30 | cd ~/.ssh
31 | ```
32 |
33 | If this command does not work, create your own `.ssh` folder and `cd` into it:
34 | ```
35 | mkdir -p ~/.ssh
36 | cd ~/.ssh
37 | ```
38 |
39 | Then, generate the keyfile pair by running:
40 | ```
41 | ssh-keygen
42 | ```
43 |
44 | Follow the prompts on the screen. If prompted for a password, you can hit `Enter` on your keyboard to avoid setting one.
45 |
46 | Two files will be created by this command.
47 | These files should have the same prefix.
48 | The file that ends in `.pub` is the public key.
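For example, if you accepted the default file name, you can list your new key files and print the public key like so (your file names may differ if you chose a custom name):
```
ls ~/.ssh
cat ~/.ssh/id_rsa.pub
```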
49 |
50 | ### [The account request form](https://wiki.cse.ucdavis.edu/cgi-bin/index2.pl)
51 |
52 | Next, navigate to [this page](https://wiki.cse.ucdavis.edu/cgi-bin/index2.pl).
53 | From the first drop down menu (Which cluster are you applying for an account on?), select `FARM/CAES`.
54 | From the second drop down menu (Who is sponsoring your account?), select `Brown, C. Titus`.
55 | Then, upload your public key file to the page.
56 | Submit the form.
57 | If the cluster admins and Titus approve your account, you will now have farm access!
58 | Don't lose the key file pair you just made.
59 | You will need the private key file each time you log into farm.
60 |
61 |
62 | ## Connecting to a remote computer
63 |
64 | Once you have a farm account, we will use the command `ssh` to connect to farm.
65 | `ssh` stands for "secure shell".
66 |
67 | To connect to your account on farm, type:
68 |
69 | ```
70 | ssh -i ~/.ssh/your_keyfile_name username@farm.cse.ucdavis.edu
71 | ```
72 |
73 | If you are successful, you will see a message that looks something like this:
74 |
75 | ```
76 | Welcome to Ubuntu 18.04.3 LTS (GNU/Linux 4.15.0-70-generic x86_64)
77 |
78 | 1 updates could not be installed automatically. For more details,
79 | see /var/log/unattended-upgrades/unattended-upgrades.log
80 |
81 | *** System restart required ***
82 | A transfer node, c11-42, is available for rsync, scp, gzip
83 | From outside the Farm cluster use port 2022 to access the transfer node.
84 | ssh -p 2022 username@farm.cse.ucdavis.edu
85 | scp -P 2022 src username@farm.cse.ucdavis.edu:/destination
86 |
87 | REMINDER: Farm does not back up user data. Please ensure your data is backed up offsite.
88 |
89 | *** Dec 04 2019:
90 | * 2:10pm - Service restored. Please report any issues to help@cse.ucdavis.edu.
91 |
92 |
93 | Email help@cse.ucdavis.edu for help with Farm.
94 |
95 | Downtime scheduled for the first Wednesday of Oct and April. The next downtime is Wednesday April 1st at 11:59pm.
96 |
97 | If interested in contributing to farm, the rates for 5 years are:
98 | $ 1,000 per 10TB, served from redundant servers with compression
99 | $ 8,800 per parallel node (256GB ram, 32 cores/64 threads, 2TB /scratch)
100 | $17,500 per GPU node (Nvidia Telsa V100, dual Xeon 4114, 2TB /scratch)
101 | $22,700 per bigmem node (1TB ram, 48 cores/96 threads, 2TB /scratch)
102 |
103 | Last login: Thu Jan 2 17:01:36 2020 from 76.105.143.194
104 | Module slurm/19.05.3 loaded
105 | Module openmpi/4.0.1 loaded
106 | username@farm:~$
107 | ```
108 |
109 | When you first login to farm, you will be in your home directory.
110 | This is where you will write your files and run the majority of your commands.
111 |
112 | When you are done using farm, you can exit your ssh connection with the `exit` command.
113 |
114 | ```
115 | exit
116 | ```
117 |
--------------------------------------------------------------------------------
/doc/02.conda.md:
--------------------------------------------------------------------------------
1 | # Using Conda for Software Installation
2 |
3 | This section covers using conda to install scientific software.
4 |
5 | ## What is Conda?
6 |
7 | Installing scientific software (including all required dependencies of said software!) is often challenging.
8 | Conda is a software manager that helps you find and install software packages.
9 |
10 |
11 | ## Set up Miniconda
12 |
13 | To get started, we'll install miniconda, which contains everything we need to get started with conda.
14 |
15 | Log in to farm and run the following commands to install Miniconda.
16 | Follow the prompts on the screen and accept all default options.
17 |
18 | ### Install conda
19 |
20 | ```
21 | wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
22 | bash Miniconda3-latest-Linux-x86_64.sh
23 | ```
24 |
25 | Again, be sure to answer `yes` to any yes/no questions, especially the last question about `conda init`!
26 | This will ensure conda is fully installed and you'll be able to use it for the commands below.
27 |
28 | ### Copy .bashrc code to .bash_profile
29 |
30 | FARM runs `.bash_profile` on startup (not `.bashrc`).
31 | Here, we explicitly run the `.bashrc` from the `.bash_profile` file
32 | ```
33 | echo source ~/.bashrc >> ~/.bash_profile
34 | ```
35 |
36 | ### Activate conda
37 |
38 | Miniconda is now installed, but we need to activate it to be able to use it.
39 | ```
40 | source ~/.bashrc
41 | ```
42 | This command executes our `~/.bashrc` file, which should now include the conda initialization code added by the installer.
43 |
44 | You should now see `(base)` in front of your prompt, indicating that you are in the base environment.
45 |
46 |
47 | ## Optional: Add colors to your terminal output
48 |
49 | If you have preferred settings for bash, go ahead and set them up.
50 |
51 | Open the `.bash_profile` file using `nano`
52 |
53 | ```
54 | cd
55 | nano .bash_profile
56 | ```
57 |
58 | Now, add the following to the document:
59 | ```
60 | export PS1="\[\033[36m\]\u\[\033[m\]@\[\033[32m\]\h:\[\033[33;1m\]\w\[\033[m\]\$"
61 | export CLICOLOR=1
62 | export LSCOLORS=ExFxBxDxCxegedabagacad
63 | alias ls='ls --color=auto'
64 | ```
65 |
66 | And close and save the document (`Ctrl-X`, `Y`, `Enter`)
67 |
68 | Now source the file:
69 | ```
70 | source .bash_profile
71 | ```
72 |
73 | Try an `ls`! Is your `miniconda3` folder a different color than the `Miniconda3-latest-Linux-x86_64.sh` installer?
74 |
75 | ## Configuring channels
76 |
77 | Conda works by searching for software packages in online repositories (**Channels**).
78 | By default, conda searches for software only in Continuum’s (Conda’s developer) channels.
79 |
80 | Most of the scientific software we'll be using is not available within the default channel, so we will add additional channels to our conda settings.
81 |
82 | Channels in Conda are ordered.
83 | The channel with the highest priority is the first one that Conda checks, looking for the package you asked for.
84 | You can change this order, and also add channels to it (and set their priority as well).
85 |
86 | If multiple channels contain a package, and one channel contains a newer version than the other one, the order of the channels determines which of the two versions is going to be installed, even if the higher priority channel contains the older version.
87 | ```
88 | conda config --add channels defaults
89 | conda config --add channels bioconda
90 | conda config --add channels conda-forge
91 | ```
92 | Note that these commands stack. In this case, the highest priority channel will be `conda-forge`, followed by `bioconda` and then the `defaults` channel.
93 |
94 | So when installing software, conda will start by looking for our desired software package in the `conda-forge` channel, then search in the `bioconda` channel, and finally search in the `defaults` channel.
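You can check the resulting channel order at any time with:
```
conda config --show channels
```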
95 |
96 |
97 | ## Using Conda environments
98 |
99 | Different software packages often have different "dependencies": other software packages that are required for installation.
100 | In many cases, you'll need software with dependencies that conflict -- e.g. one program requires python version 3, while the other requires python 2.
101 | To avoid conflicts, we install software into "environments" that are isolated from one another - so that the software installed in one environment does not impact the software installed in another environment.
102 |
103 | ### Create an environment
104 | Let's start by creating an environment for this project
105 |
106 | ```
107 | conda create -y --name nsurp-env
108 | ```
109 |
110 | This creates an empty environment named `nsurp-env`.
111 | To activate this environment, run:
112 |
113 | ```
114 | conda activate nsurp-env
115 | ```
116 |
117 | Your prompt should now start with `(nsurp-env)`.
118 |
119 | ### Install software into the environment
120 |
121 | We can now install software into our environment.
122 | Let's install sourmash, which we will use in a later lesson.
123 |
124 | ```
125 | conda install -y sourmash
126 | ```
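To check that the installation worked, you can ask sourmash to print its version information (we'll start using sourmash properly in a later lesson):
```
sourmash info
```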
127 |
128 | ## Deactivating and Exiting
129 |
130 | When you'd like to leave your environment, you can type `conda deactivate` and you will return to the base environment.
131 |
132 | When you log out of farm by typing `exit`, when you end a `tmux` or `screen` session, or when an `srun` job ends, your environment will automatically be deactivated.
133 | To restart the environment, you can run `conda activate nsurp-env`.
134 |
135 | ## Additional Resources
136 |
137 | [This tutorial](https://angus.readthedocs.io/en/2019/conda_tutorial.html) covers the basics of conda including a brief introduction to conda and why it is useful, installation and setup, creating environments, and installing software.
138 |
139 | These videos cover the material in the above tutorial:
140 |
141 | + [video 1](https://www.youtube.com/watch?v=Ef1QwhELuMs)
142 | + [video 2](https://www.youtube.com/watch?v=MOlYlvBBa9c) (there were some technical issues with this recording...sorry!)
143 |
--------------------------------------------------------------------------------
/doc/10.workflows-and-repeatability.md:
--------------------------------------------------------------------------------
1 | Workflows, Automation, and Repeatability
2 | ===
3 |
4 | For everything we have done so far, we have copied and pasted a lot of commands
5 | to accomplish what we want. This works! But can also be time consuming, and is
6 | more prone to error. We will show you next how to put all of these commands into
7 | a shell script.
8 |
9 | A **shell script** is a text file full of shell commands, that run just as if you're
10 | running them interactively at the command line.
11 |
12 | ## Writing a shell script
13 |
14 | Let's put some of our commands from the quality trimming module into one script.
15 |
16 | We'll call it `run-qc.sh`. The `.sh` at the end of the file name tells you that this is a shell script.
17 |
18 | First, cd into the `2020-NSURP` directory
19 |
20 | ```
21 | cd ~/2020-NSURP
22 | ```
23 |
24 | Now, use `nano` to create and edit a file called `run-qc.sh`
25 |
26 | `nano run-qc.sh` will open the file. Now add the following text:
27 |
28 | ```
29 | cd ~/2020-NSURP
30 | mkdir -p quality
31 | cd quality
32 |
33 | ln -s ~/2020-NSURP/raw_data/*.fastq.gz ./
34 |
35 | printf "I see $(ls -1 *.fastq.gz | wc -l) files here.\n"
36 |
37 | for infile in *_R1.fastq.gz
38 | do
39 | name=$(basename ${infile} _R1.fastq.gz)
40 | fastp --in1 ${name}_R1.fastq.gz --in2 ${name}_R2.fastq.gz --out1 ${name}_1.trim.fastq.gz --out2 ${name}_2.trim.fastq.gz --detect_adapter_for_pe \
41 | --qualified_quality_phred 4 --length_required 31 --correction --json ${name}.trim.json --html ${name}.trim.html
42 | done
43 |
44 | ```
45 |
46 | This is now a shell script that you can use to execute all of those commands in *one* go, including running `fastp` on all six samples!
47 | Exit `nano` and try it out!
48 |
49 | Run:
50 | ```
51 | cd ~/2020-NSURP
52 | bash run-qc.sh
53 | ```
54 |
55 | ### Re-running the shell script
56 |
57 | Suppose you wanted to re-run the script. How would you do that?
58 |
59 | Well, note that the `quality` directory is created at the top of the script, and everything is executed in that directory. So if you remove the quality directory like so,
60 |
61 | ```
62 | rm -rf quality
63 | ```
64 |
65 | > The `-rf` here means that you'd like to remove the whole directory "recursively" (`r`) and that you'd like file deletion to happen *without* asking for permission for each file (`f`)
66 |
67 |
68 | You can then do:
69 | ```
70 | bash run-qc.sh
71 | ```
72 |
73 | ### Some tricks for writing shell scripts
74 |
75 | #### Make it executable
76 |
77 | You can get rid of the `bash` part of the command above with
78 | some magic:
79 |
80 | Put
81 | ```
82 | #! /bin/bash
83 | ```
84 | at the top of the file, and then run
85 |
86 | ```
87 | chmod +x ~/2020-NSURP/run-qc.sh
88 | ```
89 |
90 | at the command line.
91 |
92 | You can now run
93 | ```
94 | ./run-qc.sh
95 | ```
96 | instead of `bash run-qc.sh`.
97 |
98 | You might be thinking, ok, why is this important? Well, you can do the same with R scripts and Python scripts (but put `#! /usr/bin/env Rscript` or `#! /usr/bin/env python` at the top, instead of `#! /bin/bash`). This basically annotates the script with the language it's written in, so you don't have to know or remember yourself.
99 |
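For example, the top of an executable R script might look like this (just a sketch - the `cat` line stands in for your real code):
```
#! /usr/bin/env Rscript
# everything below the "shebang" line above is normal R code
cat("hello from R\n")
```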
100 | So: it's not necessary but it's a nice trick.
101 |
102 | You can also always *force* a script to be run in a particular language by specifying `bash ` or `Rscript `, too.
103 |
104 | ## Automation with Workflow Systems!
105 |
106 | Automation via shell script is wonderful, but there are a few problems here.
107 |
108 | First, a shell script re-runs the entire workflow each time, recomputing everything from scratch.
109 | If you're running a workflow that takes 4 days and you change a command at the end, you either have to re-run the whole thing or manually go in and run just the steps that depend on the changed command.
110 |
111 | Second, it's very _explicit_ and not very _generalizable_.
112 | If you want to run it on a different dataset, you're going to have to change a lot of commands.
113 |
114 | You can read more about using workflow systems to streamline data-intensive biology in our preprint [here](https://www.biorxiv.org/content/10.1101/2020.06.30.178673v1).
115 |
116 | ## Snakemake
117 |
118 | Snakemake is one of several workflow systems that help solve these problems.
119 |
120 | If you want to learn snakemake, we recommend working through a tutorial, such as the one [here](https://hackmd.io/7k6JKE07Q4aCgyNmKQJ8Iw?view). It's also worth checking out the snakemake documentation [here](https://snakemake.readthedocs.io/en/stable/).
121 |
122 | Here, we'll demo how to run the same steps above, but in Snakemake.
123 |
124 | First, let's install snakemake in our conda environment:
125 | ```
126 | conda install -y snakemake-minimal
127 | ```
128 |
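You can check that the installation worked by asking for the version:
```
snakemake --version
```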
129 | We're going to automate the same set of commands for trimming, but in snakemake.
130 |
131 | Open a file called `Snakefile` using `nano`:
132 |
133 | ```
134 | nano Snakefile
135 | ```
136 |
137 | Here is the Snakefile we would need for a single sample, `CSM7KOJE`:
138 | ```
139 | rule all:
140 |     input:
141 |         "quality/CSM7KOJE_1.trim.fastq.gz",
142 |         "quality/CSM7KOJE_2.trim.fastq.gz"
143 | 
144 | rule trim_reads:
145 |     input:
146 |         in1="raw_data/CSM7KOJE_R1.fastq.gz",
147 |         in2="raw_data/CSM7KOJE_R2.fastq.gz",
148 |     output:
149 |         out1="quality/CSM7KOJE_1.trim.fastq.gz",
150 |         out2="quality/CSM7KOJE_2.trim.fastq.gz",
151 |         json="quality/CSM7KOJE.fastp.json",
152 |         html="quality/CSM7KOJE.fastp.html"
153 |     shell:
154 |         """
155 |         fastp --in1 {input.in1} --in2 {input.in2} \
156 |             --out1 {output.out1} --out2 {output.out2} \
157 |             --detect_adapter_for_pe --qualified_quality_phred 4 \
158 |             --length_required 31 --correction \
159 |             --json {output.json} --html {output.html}
160 |         """
161 | ```
162 |
163 | We can run it like this:
164 | ```
165 | cd ~/2020-NSURP
166 | snakemake -n
167 | ```
168 | > the `-n` flag tells snakemake to do a "dry run" - that is, just check that the input files exist and that all files specified in rule `all` can be created from the rules provided within the Snakefile.
169 |
170 | you should see "Nothing to be done."
171 |
172 | That's because the trimmed files already exist!
173 |
174 | Let's fix that:
175 |
176 | ```
177 | rm quality/CSM7KOJE*.trim.fastq.gz
178 | ```
179 |
180 | and now, when you run `snakemake`, you should see the fastp being run. Yay w00t! Then if you run `snakemake` again, you will see that it doesn't need to do anything - all the files are "up to date".
181 |
182 |
183 | ### Running all files at once
184 |
185 | Snakemake wouldn't be very useful if it could only trim one file at a time, so let's modify the Snakefile to run more files at once:
186 |
187 | ```
188 | SAMPLES = ["CSM7KOJE", "CSM7KOJ0"]
189 | rule all:
190 |     input:
191 |         expand("quality/{sample}_1.trim.fastq.gz", sample=SAMPLES),
192 |         expand("quality/{sample}_2.trim.fastq.gz", sample=SAMPLES)
193 | 
194 | rule trim_reads:
195 |     input:
196 |         in1="raw_data/{sample}_R1.fastq.gz",
197 |         in2="raw_data/{sample}_R2.fastq.gz",
198 |     output:
199 |         out1="quality/{sample}_1.trim.fastq.gz",
200 |         out2="quality/{sample}_2.trim.fastq.gz",
201 |         json="quality/{sample}.fastp.json",
202 |         html="quality/{sample}.fastp.html"
203 |     shell:
204 |         """
205 |         fastp --in1 {input.in1} --in2 {input.in2} \
206 |             --out1 {output.out1} --out2 {output.out2} \
207 |             --detect_adapter_for_pe --qualified_quality_phred 4 \
208 |             --length_required 31 --correction \
209 |             --json {output.json} --html {output.html}
210 |         """
211 | ```
212 | Try another dryrun:
213 | ```
214 | snakemake -n
215 | ```
216 |
217 | Now actually run the workflow:
218 | ```
219 | snakemake -j 1
220 | ```
221 | > the `-j 1` tells snakemake to run a single job at a time. You can increase this number if you have access to more CPUs (e.g. you're in an `srun` session where you asked for more CPUs with the `-n` parameter).
222 |
223 | Again, we see there's nothing to be done - the files exist!
224 | Try removing the quality trimmed files and running again.
225 |
226 | ```
227 | rm quality/*.trim.fastq.gz
228 | ```
229 |
230 | ### Adding an environment
231 |
232 | We've been using a conda environment throughout our modules.
233 | We can export the installed package names to a file that we can use to re-install all packages in a single step (like on a different computer).
234 | ```
235 | conda env export -n nsurp-env -f ~/2020-NSURP/nsurp-environment.yaml
236 | ```
237 |
238 | We can use this environment in our snakemake rule as well!
239 |
240 | ```
241 | SAMPLES = ["CSM7KOJE", "CSM7KOJ0"]
242 | 
243 | rule all:
244 |     input:
245 |         expand("quality/{sample}_1.trim.fastq.gz", sample=SAMPLES),
246 |         expand("quality/{sample}_2.trim.fastq.gz", sample=SAMPLES)
247 | 
248 | rule trim_reads:
249 |     input:
250 |         in1="raw_data/{sample}_R1.fastq.gz",
251 |         in2="raw_data/{sample}_R2.fastq.gz",
252 |     output:
253 |         out1="quality/{sample}_1.trim.fastq.gz",
254 |         out2="quality/{sample}_2.trim.fastq.gz",
255 |         json="quality/{sample}.fastp.json",
256 |         html="quality/{sample}.fastp.html"
257 |     conda: "nsurp-environment.yaml"
258 |     shell:
259 |         """
260 |         fastp --in1 {input.in1} --in2 {input.in2} \
261 |             --out1 {output.out1} --out2 {output.out2} \
262 |             --detect_adapter_for_pe --qualified_quality_phred 4 \
263 |             --length_required 31 --correction \
264 |             --json {output.json} --html {output.html}
265 |         """
266 | ```
267 |
268 | Here, we just have a single environment, so it was pretty easy to just run the Snakefile while within our `nsurp-env` environment. Using conda environments with snakemake becomes more useful as you use more tools, because it helps to keep different tools (which likely have different software dependencies) in separate conda environments.
269 |
270 | Run snakemake with `--use-conda` to have snakemake use the conda environment for this step.
271 | ```
272 | snakemake -j 1 --use-conda
273 | ```
274 |
275 |
276 | ## Why Automate with Workflow Systems?
277 |
278 | Workflow systems contain powerful infrastructure for workflow management that can coordinate runtime behavior, self-monitor progress and resource usage, and compile reports documenting the results of a workflow.
279 | These features ensure that the steps for data analysis are documented and repeatable from start to finish.
280 | When paired with proper software management, fully-contained workflows are scalable, robust to software updates, and executable across platforms, meaning they will likely still execute the same set of commands with little investment by the user after weeks, months, or years.
281 |
282 | Check out our [workflows preprint](https://www.biorxiv.org/content/10.1101/2020.06.30.178673v1) for a guide.
283 |
--------------------------------------------------------------------------------
/doc/12.angus-github.md:
--------------------------------------------------------------------------------
1 | # Version Control with Github
2 |
3 |
4 | Learning objectives
5 |
6 | + Learn about version control
7 | + Learn about GitHub repositories
8 | + Create local repositories
9 | + Back up your work online using Git
10 |
11 | ## Setup
12 |
13 | You’ll need to sign up for a free account on [GitHub.com](https://github.com/). It’s as simple as signing up for any other social network. Keep the email you picked handy; we’ll be referencing it again in the lesson.
14 |
15 | `Git` is installed on many systems, but if you don't already have it, instructions to install Git for Windows, Mac, or Linux can be found [here](https://git-scm.com/downloads).
16 |
17 | ## What is Github?
18 |
19 | GitHub is a code hosting platform for version control and collaboration. It lets you and others work together on projects from anywhere. GitHub is now the largest online host of collaborative projects in the world.
20 |
21 | ## What Is Git?
22 |
23 | Why use something like Git? Say you and a coworker are both updating pages on the same website. You make your changes, save them, and upload them back to the website. So far, so good. The problem comes when your coworker is working on the same page as you at the same time. One of you is about to have your work overwritten and erased.
24 |
25 | A version control application like Git keeps that from happening. You and your coworker can each upload your revisions to the same page, and Git will save two copies. Later, you can merge your changes together without losing any work along the way. You can even revert to an earlier version at any time, because Git keeps a “snapshot” of every change ever made.
26 |
27 | ## Git terms
28 |
29 | #### **Repository:**
30 | A directory or storage space where your projects can live. Sometimes GitHub users shorten this to “repo.” It can be local to a folder on your computer, or it can be a storage space on GitHub or another online host. You can keep code files, text files, image files, you name it, inside a repository.
31 |
32 | #### **Version Control:**
33 | Basically, the purpose Git was designed to serve. When you have a Microsoft Word file, you either overwrite every saved file with a new save, or you save multiple versions. With Git, you don’t have to. It keeps “snapshots” of every point in time in the project’s history, so you can never lose or overwrite it.
34 |
35 | #### **Commit:**
36 | This is the command that gives Git its power. When you commit, you are taking a “snapshot” of your repository at that point in time, giving you a checkpoint to which you can reevaluate or restore your project to any previous state.
37 |
38 | #### **Branch:**
39 | How do multiple people work on a project at the same time without Git getting them confused? Usually, they “branch off” of the main project with their own versions full of changes they themselves have made. After they’re done, it’s time to “merge” that branch back with the “master,” the main branch of the project.
40 |
41 | 
42 |
43 | ## Git-Specific Commands
44 |
45 | `git init`: Initializes a new Git repository. Until you run this command inside a repository or directory, it’s just a regular folder. Only after you input this does it accept further Git commands.
46 |
47 | `git config`: Short for “configure,” this is most useful when you’re setting up Git for the first time.
48 |
49 | `git help`: Forgot a command? Type this into the command line to bring up the 21 most common git commands. You can also be more specific and type “git help init” or another term to figure out how to use and configure a specific git command.
50 |
51 | `git status`: Check the status of your repository. See which files are inside it, which changes still need to be committed, and which branch of the repository you’re currently working on.
52 |
53 | `git add`: This does not add new files to your repository. Instead, it brings new files to Git’s attention. After you add files, they’re included in Git’s “snapshots” of the repository.
54 |
55 | `git commit`: Git’s most important command. After you make any sort of change, you input this in order to take a “snapshot” of the repository. Usually it goes `git commit -m "Message here."` The `-m` indicates that the following section of the command should be read as a message.
56 |
57 | `git branch`: Working with multiple collaborators and want to make changes on your own? This command will let you build a new branch, or timeline of commits, of changes and file additions that are completely your own. Your title goes after the command. If you wanted a new branch called “cats,” you’d type `git branch cats`.
58 |
59 | `git checkout`: Literally allows you to “check out” a repository that you are not currently inside. This is a navigational command that lets you move to the repository you want to check. You can use this command as `git checkout master` to look at the master branch, or `git checkout cats` to look at another branch.
60 |
61 | `git merge`: When you’re done working on a branch, you can merge your changes back to the master branch, which is visible to all collaborators. `git merge cats` would take all the changes you made to the “cats” branch and add them to the master.
62 |
63 | `git push`: If you’re working on your local computer, and want your commits to be visible online on GitHub as well, you “push” the changes up to GitHub with this command.
64 |
65 | `git pull`: If you’re working on your local computer and want the most up-to-date version of your repository to work with, you “pull” the changes down from GitHub with this command.
66 |
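To see how the commands above fit together, here is a sketch of a typical branch-and-merge cycle. The branch name `cats` and the file `cat-facts.txt` are only illustrative:

```
git branch cats                 # create a new branch called "cats"
git checkout cats               # switch to that branch
git add cat-facts.txt           # stage a (hypothetical) new file on the branch
git commit -m "Add cat facts"   # snapshot the change on the "cats" branch
git checkout master             # switch back to the master branch
git merge cats                  # bring the "cats" commits into master
```

Each command is explained in more detail in the sections below, so don't worry if this doesn't all make sense yet.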
67 | ## Setting Up GitHub And Git For The First Time
68 |
69 | **It’s time to introduce yourself to Git. Type in the following code:**
70 |
71 | ```
72 | git config --global user.name "Your Name Here"
73 | ```
74 |
75 | Next, tell it your email, and make sure it’s the same email you used when you signed up for your GitHub.com account:
76 |
77 | ```
78 | git config --global user.email "your_email@youremail.com"
79 | ```
80 |
81 | ## Creating Your Online Repository
82 |
83 | Now that you’re all set up, it’s time to create a place for your project to live. Both Git and GitHub refer to this as a repository, or “repo” for short, a digital directory or storage space where you can access your project, its files, and all the versions of its files that Git saves.
84 |
85 | - On your Github profile, click the plus button and select a "New Repository".
86 |
87 | 
88 |
89 | - Give your repository a name & fill out the necessary information for your repository to be distinct and recognizable.
90 |
91 | - Don’t worry about clicking the checkbox next to “Initialize this repository with a README.” A Readme file is usually a text file that explains a bit about the project. But we can make our own Readme file locally for practice.
92 |
93 | - Click the green “Create Repository” button and you’re set. You now have an online space for your project to live in.
94 |
95 | 
96 |
97 | ## Creating Your Local Repository
98 |
99 | To begin, let's create a new directory called MyProject.
100 |
101 | ```
102 | mkdir ~/MyProject
103 | ```
104 |
105 | Then we will move into this new directory.
106 |
107 | ```
108 | cd ~/MyProject
109 | ```
110 |
111 | To create a local repository, we will first initialize a new repository for "MyProject" by entering the following command:
112 |
113 | ```
114 | git init
115 | ```
116 |
117 | `touch` is a multi-purpose command, but one of its key uses is to create new, empty files. In our case, we will create a new file called Readme.txt.
118 |
119 | ```
120 | touch Readme.txt
121 | ```
122 |
123 | We can check the status of our new repository by using `git status`.
124 |
125 | ```
126 | git status
127 | ```
128 |
129 | When we want Git to track a file, we use `git add` followed by the file we want Git to "see". If we do not use `git add`, Git will not "see" this file.
130 |
131 | ```
132 | git add Readme.txt
133 | ```
134 |
135 | Lastly, to have Git track the current "snapshot" of our file, we enter `git commit`. The `-m` flag allows us to add a personal message with the files we are committing. In the following example, our message is "Add Readme.txt". Examples of other messages could include version information, changes made to a document, document descriptions, etc.
136 |
137 | ```
138 | git commit -m "Add Readme.txt"
139 | ```
140 |
141 | Now Git has a "snapshot" of this version of Readme.txt which you can return to at any time in the future!
142 |
143 | 
144 |
145 |
146 | ## Connect Your Local Repository To Your GitHub Repository Online
147 |
148 | 
149 |
150 | This setup also makes it easy to have multiple collaborators working on the same project. Each of you can work alone on your own computers, but upload or “push” your changes up to the GitHub repository when they’re ready.
151 |
152 | To tell Git the address of your remote repo on GitHub, type the following, replacing the address of the repo with your own:
153 |
154 | ```
155 | git remote add origin https://github.com/username/myproject.git
156 | ```
157 |
158 | Git now knows there’s a remote repository and it’s where you want your local repository changes to go. To confirm, type this to check:
159 |
160 | ```
161 | git remote -v
162 | ```
163 |
164 | Great, Git is able to connect with our remote on GitHub. So, let's go ahead and push our files to GitHub:
165 |
166 | ```
167 | git push origin master
168 | ```
169 |
170 | **You will be prompted for your Github username and password at this point**
171 |
172 | You should see output like the following as Git sends packets of data to your GitHub repo. Pushing backs up all of the commits you've made since the last time you pushed, online. FOR FREE!
173 |
174 | ```
175 | Counting objects: 3, done.
176 | Writing objects: 100% (3/3), 217 bytes | 217.00 KiB/s, done.
177 | Total 3 (delta 0), reused 0 (delta 0)
178 | To https://github.com/sateeshbio5/angus_test.git
179 | * [new branch] master -> master
180 |
181 | ```
182 |
183 | > Note: To avoid having to type your username and password each time you push/pull from your GitHub repos, read about secure login with SSH [here](https://help.github.com/articles/connecting-to-github-with-ssh/)
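For example, once you have added an SSH key to your GitHub account, one option is to point your existing remote at the SSH address instead of the HTTPS one (replace the repository path with your own):

```
git remote set-url origin git@github.com:username/myproject.git
git remote -v
```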
184 |
185 | 
186 |
187 |
188 | ## Collaborating via GitHub
189 |
190 | - **GitHub Issues:** Issues are a great way to keep track of tasks, enhancements, and bugs for your projects. They’re kind of like email—except they can be shared and discussed with all. Read more about Mastering Issues on Github [here](https://guides.github.com/features/issues/)
191 |
192 | 
193 |
194 | - **GitHub Pull-Requests:** Pull requests let you tell others about changes you've pushed to a branch in a repository on GitHub. Once a pull request is opened, you can discuss and review the potential changes with collaborators and add follow-up commits before your changes are merged into the base branch.
195 |
196 | 
197 |
198 | Look at others' repositories:
199 |
200 | - [Hadley Wickham (ggplot2)](https://github.com/hadley)
201 |
202 | - [Yihui Xie (knitr)](https://github.com/yihui)
203 |
204 | - [ANGUS 2019](https://angus.readthedocs.io/en/2019/)
205 |
206 | ## Host Websites & Blogs on GitHub
207 |
208 | - GitHub Pages is an awesome feature that lets you host websites/blogs for you and your projects.
209 |
210 | - Hosted directly from your GitHub repository. Just edit, push, and your changes are live.
211 |
212 | - Read more about GitHub Pages [here](https://pages.github.com/)
213 |
214 |
215 | ## Sources for this tutorial & Additional Git Resources
216 |
217 | - **Introductory tutorial by Lauren Orsini [here](https://readwrite.com/2013/09/30/understanding-github-a-journey-for-beginners-part-1/)**
218 |
219 | - [Pro Git](https://git-scm.com/book)
220 |
221 | - [Try Git](https://www.codeschool.com/courses/try-git)
222 |
223 | - [Github Guides](https://www.youtube.com/GitHubGuides)
224 |
225 | - [Github Reference](http://gitref.org/)
226 |
227 | - [Git - Simple Guide](https://rogerdudler.github.io/git-guide)
228 |
229 | - [Github Hello World](https://guides.github.com/activities/hello-world/)
230 |
--------------------------------------------------------------------------------
/doc/08.taxonomic-discovery-with-sourmash.md:
--------------------------------------------------------------------------------
1 | Taxonomic Discovery with Sourmash
2 | ===
3 |
4 | Until now, we've performed general pre-processing steps on our sequencing data;
5 | sequence quality analysis and trimming usually occur at the start of any sequencing data analysis pipeline.
6 | Now we will begin performing analysis that makes sense for metagenomic sequencing data.
7 |
8 | We are working with publicly-available data, but let's pretend that this is a brand new sample that we just got back from our sequencing core.
9 | One of the first things we often want to do with new metagenome sequencing samples is figure out their approximate species composition.
10 | This allows us to tap in to all of the information known about these species and relate our community to existing literature.
11 |
12 | We can determine the approximate composition of our sample using `sourmash`.
13 |
14 | ## Introduction to sourmash
15 |
16 | Please read [this tutorial](https://angus.readthedocs.io/en/2019/sourmash.html) for an introduction to how sourmash works.
17 |
18 | tl;dr (but actually please read it): sourmash breaks nucleotide sequences down into k-mers, systematically subsamples those k-mers into a representative "signature", and then enables searches for those k-mers in databases.
19 | This makes it really fast to make comparisons. Here, we will compare our metagenome sample against a pre-prepared database that contains all microbial sequences in GenBank.
20 |
21 | ## Workspace Setup
22 |
23 | If you're starting a new work session on FARM, be sure to follow the instructions [here](05.starting-a-work-session.md).
24 | You can just do the part to enter a `tmux` session, since we'll be using a larger `srun` session than usual.
25 |
26 | ## Starting with sourmash
27 |
28 | Sourmash doesn't have a big memory or CPU footprint, and can be run on most laptops.
29 | Below is a recommended `srun` command to start an interactive session in which to run the `sourmash` commands.
30 |
31 | ```
32 | srun -p bmh -J sourmash24 -t 24:00:00 --mem=16gb -c 1 --pty bash
33 | ```
34 |
35 | ### Install sourmash
36 |
37 | *Be sure you've set up conda channels properly, as in the [Install Conda](02.conda.md) section*
38 |
39 | ```
40 | conda activate nsurp-env
41 | conda install -y sourmash
42 | ```
43 |
44 | Next, let's create a directory in which to store our sourmash signatures
45 |
46 | ```
47 | cd ~/2020-NSURP
48 | mkdir -p sourmash
49 | cd sourmash
50 | ```
51 |
52 | ## What data to use?
53 |
54 | We could run sourmash with our adapter trimmed or k-mer trimmed data.
55 | In fact, doing so would make sourmash faster because there would be fewer k-mers in the sample.
56 |
57 | We are currently comparing our sample against a database of trusted DNA sequences, so any k-mers in our sample that contain adapter sequences or errors will not match the trusted reference sequences in the database.
58 | However, even though we very lightly trimmed our reads, there is a chance that we removed a very low abundance organism that was truly present in the sample.
59 | Given this trade-off, we often use raw reads for reference data comparisons, and quality-controlled reads for all other comparisons.
60 |
61 |
62 | ## Generate a sourmash signature
63 |
64 | Next, let's make sourmash signatures from our reads.
65 |
66 | Remember from the [Quick Insights from Sequencing Data with sourmash](https://angus.readthedocs.io/en/2019/sourmash.html) tutorial that a k-mer size of 21 is approximately specific at the genus level, k=31 at the species level, and k=51 at the strain level. We will calculate our signature with all three k-mer sizes so we can choose which one we want to use later.
67 |
68 | ```
69 | sourmash compute -o CSM7KOJE.raw.sig --merge CSM7KOJE --scaled 2000 -k 21,31,51 --track-abundance ~/2020-NSURP/raw_data/CSM7KOJE_*fastq.gz
70 | ```
71 |
72 | You should see output that looks like this:
73 |
74 | ```
75 | == This is sourmash version 3.4.1. ==
76 | == Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==
77 |
78 | setting num_hashes to 0 because --scaled is set
79 | computing signatures for files: /home/ntpierce/2020-NSURP/raw_data/CSM7KOJE_R1.fastq.gz, /home/ntpierce/2020-NSURP/raw_data/CSM7KOJE_R2.fastq.gz
80 | Computing signature for ksizes: [21, 31, 51]
81 | Computing only nucleotide (and not protein) signatures.
82 | Computing a total of 3 signature(s).
83 | Tracking abundance of input k-mers.
84 | ... reading sequences from /home/ntpierce/2020-NSURP/raw_data/CSM7KOJE_R1.fastq.gz
85 | ... /home/ntpierce/2020-NSURP/raw_data/CSM7KOJE_R1.fastq.gz 9704045 sequences
86 | ... reading sequences from /home/ntpierce/2020-NSURP/raw_data/CSM7KOJE_R2.fastq.gz
87 | ... /home/ntpierce/2020-NSURP/raw_data/CSM7KOJE_R2.fastq.gz 9704045 sequences
88 | calculated 1 signatures for 19408090 sequences taken from 2 files
89 | saved signature(s) to CSM7KOJE.raw.sig. Note: signature license is CC0.
90 | ```
91 |
92 | The output file, `CSM7KOJE.raw.sig`, holds a representative subset of k-mers from our original sample, as well as their abundance information.
93 | The k-mers are "hashed", or transformed, into numbers to make selecting, storing, and looking up the k-mers more efficient.
94 |
95 | ## Sourmash gather
96 |
97 | `sourmash gather` is a method for estimating the taxonomic composition of known sequences in a metagenome.
98 |
99 | Please go read through the sourmash documentation on [Breaking down metagenomic samples with gather and lca](https://sourmash.readthedocs.io/en/latest/classifying-signatures.html#).
100 | Check out Appendix A and B in this documentation for a good overview of how sourmash gather works.
101 |
102 | Running gather on our IBD samples can give us an idea of the microbes present in each sample.
103 | `gather` results provide strain-level specificity to matches in its output -- e.g. all strains that match any sequences (above a threshold) in your metagenome will be reported, along with the percent of each strain that matches.
104 | This is useful both to estimate the amount of metagenome sample that is known, and to estimate the closest strain relative to the organisms in your metagenomes.
105 |
106 | ### Download and unzip the database:
107 |
108 | ```
109 | mkdir -p ~/2020-NSURP/databases/
110 | cd ~/2020-NSURP/databases/
111 | curl -L https://osf.io/jgu93/download -o genbank-k31.sbt.zip
112 | cd ~/2020-NSURP/sourmash
113 | ```
114 |
115 | ### Run sourmash gather
116 |
117 | First, let's run a very quick search:
118 |
119 | ```
120 | sourmash gather --num-results 10 CSM7KOJE.raw.sig ~/2020-NSURP/databases/genbank-k31.sbt.zip
121 | ```
122 |
123 | > - the `--num-results 10` is a way of shortening the search. In this case, we ask for only the top 10 results
124 |
125 | We see an output that looks like this:
126 |
127 | ```
128 | == This is sourmash version 3.4.1. ==
129 | == Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==
130 |
131 | selecting default query k=31.
132 | loaded query: CSM7KOJE... (k=31, DNA)
133 | loaded 1 databases.
134 |
135 |
136 | overlap p_query p_match avg_abund
137 | --------- ------- ------- ---------
138 | 6.4 Mbp 9.5% 63.6% 17.4 CBWF010000001.1 Klebsiella pneumoniae...
139 | 5.3 Mbp 25.6% 90.6% 56.5 KB851045.1 Clostridium clostridioform...
140 | 5.1 Mbp 3.6% 74.4% 8.5 GG668320.1 Clostridium hathewayi DSM ...
141 | 4.4 Mbp 2.0% 83.9% 5.4 LBDB01000001.1 Vibrio parahaemolyticu...
142 | 3.2 Mbp 5.0% 80.5% 18.6 JTBP01000001.1 Proteus mirabilis stra...
143 | 4.8 Mbp 1.0% 33.6% 4.2 JRSL01000930.1 Escherichia coli strai...
144 | 2.7 Mbp 0.8% 59.5% 3.5 FUNQ01000052.1 Clostridioides diffici...
145 | 2.5 Mbp 3.5% 33.9% 16.3 CZAT01000001.1 Flavonifractor plautii...
146 | 4.9 Mbp 2.3% 45.1% 11.7 KQ087951.1 Escherichia coli strain BI...
147 | 2.2 Mbp 3.3% 64.6% 18.1 FCEY01000001.1 Clostridium sp. AT5 ge...
148 |
149 | found 10 matches total;
150 | (truncated gather because --num-results=10)
151 | the recovered matches hit 56.8% of the query
152 | ```
153 |
154 | The shortened search will be quite quick.
155 |
156 | The two columns to pay attention to are `p_query` and `p_match`.
157 | `p_query` is the percent of the metagenome sample that is
158 | (estimated to be) from the named organism. `p_match` is the percent
159 | of the database match that is found in the query.
160 | These metrics are affected by both evolutionary distance and by low coverage of the
161 | organism's gene set (low sequencing coverage, or little expression).
162 |
163 |
164 | Now, let's run the full `gather` analysis.
165 | This will take a long time to run.
166 | Sourmash will also output a CSV with all the results information that we will use later to visualize our results.
167 |
168 | ```
169 | sourmash gather -o CSM7KOJE_x_genbank-k31.gather.csv CSM7KOJE.raw.sig ~/2020-NSURP/databases/genbank-k31.sbt.zip
170 | ```
171 |
172 | When sourmash is finished running, it tells us the percent of our sequence that was unclassified; i.e., it doesn't match any sequence in the database.
173 |
174 | In a later module, we may use additional steps prior to `gather` to improve the percent of sequence in the metagenome that is classifiable.
175 | These include, for example, using `bbduk` to remove additional human genome k-mers or using assembly-style programs such as `megahit` or [spacegraphcats](https://link.springer.com/article/10.1186/s13059-020-02066-4), to build longer contiguous gene sequences.
176 |
177 | ## Other Methods for Taxonomic Discovery and Classification
178 |
179 | There are many tools, such as Kraken and Kaiju, that can do taxonomic classification of individual reads from metagenomes.
180 | These seem to perform well (albeit with high false positive rates) in situations where you don’t necessarily have the genome sequences that are in the metagenome.
181 | Sourmash, by contrast, can estimate which known genomes are actually present, so that you can extract them and map/align to them.
182 | It seems to have a very low false positive rate and is quite sensitive to strains.
183 |
184 | ## Detecting contamination or incorrect data
185 |
186 | Sourmash `gather` taxonomic discovery can help uncover contamination or errors in your sequencing samples.
187 | We recommend doing sourmash gather immediately after receiving your data from the sequencing facility.
188 | If your environmental metagenome has a tremendous amount of mouse sequence in it... maybe the sequencing facility sent you the wrong data?
189 |
190 | ## Challenge: sourmash gather
191 |
192 | ### Gather with trimmed data
193 |
194 | Above, we ran `sourmash gather` on our untrimmed data.
195 | 44% of the sample did not match any sequence in a GenBank assembly.
196 | A substantial proportion of this sequence could be due to k-mers with errors.
197 | Run `sourmash gather` again on the adapter- and k-mer-trimmed data.
198 | How much less of the sequence is unclassifiable when the errors and adapters are removed?
199 | How many species are no longer detected after k-mer and error trimming?
200 |
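A sketch of what this could look like, assuming you produced `CSM7KOJE.kmertrim.fq.gz` in the k-mer trimming module (adjust paths and sample names to match your own files):

```
cd ~/2020-NSURP/sourmash
sourmash compute -o CSM7KOJE.kmertrim.sig --merge CSM7KOJE-kmertrim --scaled 2000 -k 21,31,51 --track-abundance ~/2020-NSURP/kmer-trim/CSM7KOJE.kmertrim.fq.gz
sourmash gather -o CSM7KOJE.kmertrim_x_genbank-k31.gather.csv CSM7KOJE.kmertrim.sig ~/2020-NSURP/databases/genbank-k31.sbt.zip
```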
201 | ### Gather at different ksizes
202 |
203 | The GenBank reference databases built with k=21 and k=51 are available for download.
204 |
205 | k=21
206 | ```
207 | cd ~/2020-NSURP/databases/
208 | curl -L https://osf.io/dm7n4/download -o genbank-k21.sbt.zip
209 | ```
210 |
211 | k=51
212 | ```
213 | cd ~/2020-NSURP/databases/
214 | curl -L https://osf.io/2uvsc/download -o genbank-k51.sbt.zip
215 | ```
216 |
217 | How do you expect the gather results to differ for each? Why?
218 |
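Because the signature we computed already contains sketches at k=21, k=31, and k=51, you can select the k-mer size when you run gather. A sketch, using the databases downloaded above:

```
cd ~/2020-NSURP/sourmash
sourmash gather -k 21 -o CSM7KOJE_x_genbank-k21.gather.csv CSM7KOJE.raw.sig ~/2020-NSURP/databases/genbank-k21.sbt.zip
sourmash gather -k 51 -o CSM7KOJE_x_genbank-k51.gather.csv CSM7KOJE.raw.sig ~/2020-NSURP/databases/genbank-k51.sbt.zip
```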
219 | ### Test Gather parameters
220 |
221 | By running `sourmash gather --help`, you can see all the options for the `gather` program.
222 |
223 | #### scaled
224 |
225 | The `scaled` option provides a chance to downsample the query to the specified scaled factor.
226 |
227 | ```
228 | --scaled FLOAT downsample query to the specified scaled factor
229 | ```
230 |
231 | Try running gather with a `scaled` value of 50000. How do the results change, and why?
232 |
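A sketch of the command (run from your `sourmash` directory, using the k=31 database as before):

```
sourmash gather --scaled 50000 CSM7KOJE.raw.sig ~/2020-NSURP/databases/genbank-k31.sbt.zip
```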
233 | #### base pair threshold for matches
234 |
235 | The `threshold-bp` option lets you only find matches that have at least this many base pairs in common (default 50,000 bp).
236 |
237 | Increasing the threshold makes gather quicker (at the expense of losing some of the smaller matches):
238 | ```
239 | --threshold-bp 10000000
240 | ```
241 | What happens if you run gather with the threshold above?
242 |
243 |
244 | Decreasing the threshold will take more time, but be more thorough. A threshold of 0 bp does an exhaustive search for all matches:
245 | ```
246 | --threshold-bp 0
247 | ```
248 | The full gather took quite a long time on our samples, so there's no need to run this one!
249 | But do keep it in mind as a way to make sure we get absolutely **all** of the matches we can get using gather.
250 |
251 |
--------------------------------------------------------------------------------
/doc/07.quality-control.md:
--------------------------------------------------------------------------------
1 | Quality Control the Data
2 | ===
3 |
4 | If you're starting a new work session on FARM, be sure to follow the instructions [here](05.starting-a-work-session.md).
5 |
6 |
7 | After [downloading sequencing data](06.download-assess-ibd-data.md), the next step in many pipelines is to perform quality control trimming on the reads.
8 | However, deciding when and how to trim data is pipeline dependent.
9 | Below, we define a few types of quality control, explore some use cases, and discuss how trimming recommendations may change with different applications.
10 | Although this project focuses on metagenomic sequencing, we include other applications in this discussion.
11 |
12 | ## Types of Quality Control
13 |
14 | + **Adapter and barcode trimming**: Adapter sequences are added to a sample library to aid in the physical process of sequencing.
15 | They are ubiquitous within a certain chemistry, and so are present across all sequenced samples.
16 | Barcodes are unique nucleotide sequences used to identify a specific sample when multiple samples are sequenced in a single lane.
17 | After barcoded samples are separated from one another in a process called demultiplexing, barcodes are no longer needed in a sequence.
18 | It is generally a good idea to remove adapters and barcodes from sequencing samples before proceeding with any downstream application.
19 | However, if you are using a pipeline that involves matching between reads and a quality reference, you may get similar results with or without adapter trimming, so adapter trimming is sometimes skipped for quick estimates.
20 | + **Quality trimming**: Quality trimming removes low-quality bases from sequencing reads. The user can set the stringency cutoff for "low quality" by indicating a phred score at which to trim.
21 | + **K-mer trimming**: K-mer trimming removes k-mers that occur very few times in a sequencing dataset. In reads with sufficient sequencing depth, we expect real k-mers to occur multiple times. When a single sequencing error occurs in a read, this produces *k* erroneous k-mers. K-mer trimming trims a read to remove all of these k-mers. K-mer trimming does not rely on information from the sequencer like phred scores, but instead on the biological signal in the reads themselves.
22 |
23 | ## When and how to trim?
24 |
25 | Trimming is a balance of removing artificial or incorrect nucleotides and retaining true nucleotides in sequencing data.
26 | What and when to trim therefore changes with the sequencing application, and with the sequencing data itself.
27 | Below we explore some trimming use cases to help develop an intuition for what type of trimming is necessary and when.
28 |
29 | + **Single-species genomic sequencing for assembly**:
30 | Let's imagine we have just sequenced an *Escherichia coli* isolate with 100X coverage and would like to assemble the isolate.
31 | We would first want to remove adapters and barcodes to prevent these sequences from ending up in our final assembly.
32 | Then, stringent quality and k-mer trimming may be appropriate, because we have high coverage data; even if we were to stringently trim and were only left with 50% of our original number of reads, we would still have 50X coverage of very high quality data.
33 | 50X coverage is sufficient to achieve a good bacterial assembly in most cases.
34 | + **_de novo_ RNA-sequencing assembly** Now let's imagine we have sequenced the transcriptome of our favorite species which does not currently have a reference transcriptome.
35 | Because RNA transcripts have different abundance profiles, we can't use average coverage in the same way as we used it for single-species genomic sequencing.
36 | We need to be more careful when we k-mer and error trim so as not to accidentally remove low-abundance reads that represent true transcripts. We would likely use light quality trimming (e.g. a phred score of ~5). For k-mer trimming, we would only trim reads that contain high-abundance k-mers.
37 | + **Metagenome *de novo* assembly** Trimming metagenomic reads for *de novo* assembly is similar to trimming RNA-sequencing reads for *de novo* transcriptome assembly. Because there are often low-abundance organisms that have low coverage in our sequencing datasets, we need to be careful not to accidentally remove these reads during trimming.
38 | + **Metagenome read mapping**
39 | In reference-based analyses, including mapping of metagenomic reads to a set of reference genomes, reads will often map even when they contain adapters and barcodes.
40 | However, in some cases, the presence of adapters and barcodes does prevent mapping, so it is safer to remove all barcodes and adapters.
41 |
42 |
43 | ## References about trimming
44 |
45 | Many scientific studies have explored the trimming parameter space in an effort to make recommendations for different applications.
46 | We include some of these studies below.
47 |
48 | + [On the optimal trimming of high-throughput mRNA sequence data](https://www.frontiersin.org/articles/10.3389/fgene.2014.00013/full)
49 | + [An Extensive Evaluation of Read Trimming Effects on Illumina NGS Data Analysis](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0085024)
50 |
51 |
52 | ## Quality and Adapter trimming with Fastp
53 |
54 | We will use [fastp](https://academic.oup.com/bioinformatics/article/34/17/i884/5093234) to do quality trimming of our reads.
55 |
56 | In the [Download and Visual Assessment Module](06.download-assess-ibd-data.md), we saw using FastQC that the Illumina Universal Adapter was present in our samples.
57 |
58 | 
59 |
60 | We also saw that the sequence read quality dropped dramatically toward the end of the read.
61 |
62 | 
63 |
64 | We will remove both of these sequences using fastp.
65 | Fastp also creates its own FastQC-style `html` reports for the files that we can look at after running.
66 |
67 |
68 | ### Run fastp
69 |
70 | **Reminder, make sure you've followed the [Starting a Work Session](05.starting-a-work-session.md) steps to get your Farm session set up.**
71 | **You should be within your `nsurp-env` conda environment.**
72 |
73 |
74 | Install fastp:
75 |
76 | ```
77 | conda install -y fastp
78 | ```
79 |
80 | We can now trim our data!
81 | Let's set up our directory structure:
82 |
83 | ```
84 | cd ~/2020-NSURP
85 | mkdir -p trim
86 | cd trim
87 | ```
88 |
89 | Run fastp on the `CSM7KOJE` sample with the following command:
90 |
91 | ```
92 | fastp --in1 ~/2020-NSURP/raw_data/CSM7KOJE_R1.fastq.gz \
93 | --in2 ~/2020-NSURP/raw_data/CSM7KOJE_R2.fastq.gz \
94 | --out1 CSM7KOJE_1.trim.fastq.gz \
95 | --out2 CSM7KOJE_2.trim.fastq.gz \
96 | --detect_adapter_for_pe \
97 | --qualified_quality_phred 4 \
98 | --length_required 31 --correction \
99 | --json CSM7KOJE.trim.json \
100 | --html CSM7KOJE.trim.html
101 | ```
102 |
103 | > **Command Breakdown**
104 |
105 | > - `--in1`, `--in2` - the read1 and read2 input file names
106 | > - `--out1`, `--out2` - the read1 and read2 output file names
107 | > - `--detect_adapter_for_pe` - Auto detect the adapters for our paired end (PE) reads, and remove them during trimming
108 | > - `--length_required` - discard reads shorter than the `length_required` parameter (default is 15)
109 | > - `--correction` - enable base correction if the paired end reads overlap (only for PE data),
110 | > - `--qualified_quality_phred` - the quality value at which a base is considered "qualified". The default of 15 means phred quality >= Q15 is qualified (int [=15])
111 | > - `--html`, `--json` - file name for the fastp trimming report printed to html and/or json format
112 |
113 | We change the Phred quality score cutoff to `4` to be more lenient in our trimming.
114 | Recall from our FastQC lesson that a quality score of 10 indicates a 1 in 10 chance that the base is inaccurate.
115 | A score of 20 is a 1 in 100 chance that the base is inaccurate. 30 is 1 in 1,000. And 40 is 1 in 10,000.
116 | By using a cutoff of 4, we only remove bases that are very likely to be incorrect, keeping as much data as possible.
117 |
118 | As done in [downloading sequencing data](06.download-assess-ibd-data.md), you can use `scp` to copy the html report to your computer:
119 |
120 | ```
121 | scp -P 2022 -i /path/to/key/file username@farm.cse.ucdavis.edu:~/2020-NSURP/trim/*.html ./
122 | ```
123 |
124 | If you're on a Mac using `zsh`, you may need to replace `scp` with `noglob scp` in the command above.
125 |
126 | If you're on Windows, you may need to move the files from the download location on your Linux shell over to the Windows side of your computer before opening.
127 |
128 | Once the file is on your local computer, double click on it and it will open in your browser.
129 | You can now explore the fastp trimming report.
130 |
131 |
132 | ## Why (or why not) do k-mer trimming?
133 |
134 | Even after quality trimming with fastp, our reads will still contain errors. Why?
135 |
136 | First, fastp trims based solely on the quality score, which is a statistical statement about the correctness of a base - a Q score of 30 means that, of 1000 bases with that Q score, 1 of those bases will be wrong. So, a base can have a high Q score and still be wrong (and many bases will have a low Q score and still be correct)!
137 |
138 | Second, we trimmed very lightly - only bases that had a very low quality were removed. This was intentional because we want to retain as much coverage as possible for our downstream techniques (many of which do not suffer too much if some errors remain).
139 |
140 | An alternative to trimming based on the quality scores is to trim based on k-mer abundance - this is known as k-mer spectral error trimming. K-mer spectral error trimming always beats quality score trimming in terms of eliminating errors; e.g. look at this table from [Zhang et al., 2014](https://journals.plos.org/plosone/article?id=10.1371%2Fjournal.pone.0101271):
141 |
142 | 
143 |
144 | The basic logic is this: if you see low abundance k-mers in a high coverage data set, those k-mers are almost certainly the result of errors. (Caveat: strain variation could also create them.)
145 |
146 | In metatranscriptomic data sets we do have the problem that we may have very low and very high coverage data. So we don’t necessarily want to get rid of all low-abundance k-mers, because they may represent truly low abundance (but useful) data.
147 |
148 | As part of the khmer project in our lab, we have developed an approach that sorts reads into high abundance and low abundance reads, and only error trims the high abundance reads.
149 |
150 | 
151 |
152 | This does mean that many errors may get left in the data set, because we have no way of figuring out if they are errors or simply low coverage, but that’s OK (and you can always trim them off if you really care).
153 |
154 |
155 | ## Kmer trimming with khmer
156 |
157 | Next, let's k-mer trim our data.
158 | This will take 20GB of RAM and a few hours to complete.
159 | We didn't ask for quite that much RAM when we initially got our computer, so we'll need a different one.
160 |
161 | First, exit your current `srun` session
162 |
163 | ```
164 | exit
165 | ```
166 |
167 | Next, use this `srun` command to get a larger computer that can handle the k-mer trimming analysis:
168 |
169 | ```
170 | srun -p bmh -J khmer -t 20:00:00 --mem=21gb -c 1 --pty bash
171 | ```
172 |
173 | Since we changed computers, our conda environment was automatically deactivated.
174 |
175 | Activate your project environment again:
176 |
177 | ```
178 | conda activate nsurp-env
179 | ```
180 |
181 | ### Install khmer
182 |
183 | We need to install the software we will use to perform k-mer trimming, `khmer`.
184 | Make sure you activate the conda environment you are using for this project with `conda activate env_name`.
185 |
186 | ```
187 | conda install -y khmer
188 | ```
189 |
190 | ### Using khmer for k-mer trimming
191 |
192 | Once `khmer` is installed, we can use it for k-mer trimming.
193 | Let's get our files and directories set up:
194 | ```
195 | cd ~/2020-NSURP
196 | mkdir -p kmer-trim
197 | cd kmer-trim
198 | ```
199 |
200 | Now we can run k-mer trimming!
201 | The first line of this command interleaves our paired end reads, putting them in one file where forward and reverse reads alternate on each line.
202 | The second line of this command performs the k-mer trimming.
203 |
204 | Note that these commands are connected by the pipe (`|`) character.
205 | This character means that the first half of the command (before the `|`) is executed first, and the output is passed ("piped") to the second half of the command (after the `|`).
206 |
207 | ```
208 | interleave-reads.py ~/2020-NSURP/trim/CSM7KOJE_1.trim.fastq.gz ~/2020-NSURP/trim/CSM7KOJE_2.trim.fastq.gz | \
209 | trim-low-abund.py --gzip -C 3 -Z 18 -M 20e9 -V - -o CSM7KOJE.kmertrim.fq.gz
210 | ```
211 | > Note: Here, we are referencing the trimmed files using an absolute path: `~/2020-NSURP/trim/`.
212 | > That is, to access these files, we go to our home directory (`~`), then descend into the `2020-NSURP` folder, then descend again into the `trim` folder.
213 |
214 |
215 | ### Assess changes in kmer abundance
216 |
217 | To see how many k-mers we removed, you can examine the distribution as above,
218 | or use the `unique-kmers.py` script. Let's compare kmers for one sample.
219 |
220 | ```
221 | unique-kmers.py ../trim/CSM7KOJE_1.trim.fastq.gz ../trim/CSM7KOJE_2.trim.fastq.gz
222 | unique-kmers.py CSM7KOJE.kmertrim.fq.gz
223 | ```
224 |
225 | > Note, here we are using a relative path, `../trim/`.
226 | > That is, to access the `CSM7KOJE_*.trim.fastq.gz` files, we go up one directory (`../`), then down into `trim`.
227 |
228 |
229 | The raw adapter-trimmed inputs have an estimated 164426731 unique 32-mers.
230 | ```
231 | Estimated number of unique 32-mers in ../trim/CSM7KOJE_1.trim.fastq.gz: 83127191
232 | Estimated number of unique 32-mers in ../trim/CSM7KOJE_2.trim.fastq.gz: 80110484
233 | Total estimated number of unique 32-mers: 98077936
234 | ```
235 |
236 | The k-mer trimmed file (kmer output) has an estimated 163890994 unique 32-mers.
237 | ```
238 | Estimated number of unique 32-mers in CSM7KOJE.kmertrim.fq.gz: 163890994
239 | Total estimated number of unique 32-mers: 163890994
240 | ```
241 |
242 | Note that the second number is smaller than the first, with a little over 500,000 low-abundance k-mers having been removed as likely errors.
243 | These are pretty small sample datasets that are already relatively clean - often the difference in unique k-mers is MUCH larger!
244 |
245 | ### Challenge: quality control
246 |
247 | Make sure you do `fastp` and `khmer` trimming on each of the 6 datasets.
248 | Keep track of the commands you use in a [HackMD](https://hackmd.io) lab notebook.
249 | Use backticks to create code blocks and be sure to write notes describing the purpose of each step and any problems you encountered.
250 | There's no need to count unique k-mers for every dataset, but feel free if you'd like to look at the differences :).
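If you'd rather not retype the `fastp` command six times, a small shell loop is one option. This is only a sketch; adjust the paths and output names to match your own directory layout:

```
cd ~/2020-NSURP/trim
for sample in CSM7KOJE CSM7KOJG CSM7KOJO HSM5MD5B HSM5MD5D HSM6XRSX
do
  fastp --in1 ~/2020-NSURP/raw_data/${sample}_R1.fastq.gz \
        --in2 ~/2020-NSURP/raw_data/${sample}_R2.fastq.gz \
        --out1 ${sample}_1.trim.fastq.gz \
        --out2 ${sample}_2.trim.fastq.gz \
        --detect_adapter_for_pe \
        --qualified_quality_phred 4 \
        --length_required 31 --correction \
        --json ${sample}.trim.json \
        --html ${sample}.trim.html
done
```

A similar loop works for the `interleave-reads.py | trim-low-abund.py` k-mer trimming step.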
251 |
--------------------------------------------------------------------------------
/doc/06.download-assess-ibd-data.md:
--------------------------------------------------------------------------------
1 | Download and Visually Assess the Data
2 | ===
3 |
4 | Metagenomics is the analysis of genetic material from environmental samples ("environment" here meaning anything from human gut to open ocean).
5 | Metagenomics (DNA sequencing) and metatranscriptomics (RNA sequencing) can be used to assess the composition and functional potential of microbial communities.
6 |
7 | Human-associated microbial communities, such as the trillions of microorganisms that colonize the human gut, have co-evolved with humans and play important roles both in human biology and disease.
8 | Gut symbionts contribute to human digestive and metabolic functions, immune system regulation, and regulation of the intestinal epithelial barrier, including providing protection against pathogens.
9 |
10 |
11 | Inflammatory bowel disease (IBD) is an umbrella term used for diseases (Crohn's disease, Ulcerative Colitis) characterized by chronic inflammation of the intestines.
12 | These diseases impact about 3 million people in the United States.
13 |
14 | IBD is thought to be caused by a combination of genetic and environmental factors that alter gut homeostasis and trigger immune-mediated inflammation.
15 | In particular, IBD is associated with an alteration of the composition of gut microbiota ("dysbiosis"), though the exact impact of the microbial community is still under investigation.
16 |
17 | Here, we will compare metagenome samples from patients with Inflammatory bowel disease (IBD) to samples from patients without IBD.
18 | We will characterize the microbial community associated with IBD vs non-IBD and assess the results in the context of current community findings for IBD.
19 |
20 |
21 | ## Background Reading
22 |
23 | Here are some articles that contain good background info on the human microbiome and IBD.
24 |
25 | - [The human microbiome in evolution](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5744394/)
26 | - [Host–microbiota interactions in inflammatory bowel disease](https://www.nature.com/articles/s41577-019-0268-7)
27 | - [Microbial genes and pathways in inflammatory bowel disease](https://www.nature.com/articles/s41579-019-0213-6)
28 |
29 |
30 | ## Using FARM for downloads and analysis:
31 |
32 | Follow the instructions on [Starting a Work Session on FARM](05.starting-a-work-session.md) to start a `tmux` session, get access to a compute node (via an `srun` interactive session).
33 |
34 |
35 | ## Download the data
36 |
37 | Now that we have a computer, let's download the data.
38 |
39 | _note that you can also run these steps (and most analyses) on your personal computer_
40 |
41 | 1. Make a project directory
42 |
43 | ```
44 | cd
45 | mkdir -p 2020-NSURP/raw_data
46 | ```
47 |
48 | 2. download samples to `raw_data` directory
49 |
50 | ```
51 | cd 2020-NSURP/raw_data
52 | ```
53 |
54 | Now, download a tar file (which contains the R1 and R2 read files) for each of the following sample accessions using `wget`:
55 |
56 |
57 | ```
58 | # patient with Crohns disease
59 | wget https://ibdmdb.org/tunnel/static/HMP2/WGS/1818/CSM7KOJO.tar
60 | wget https://ibdmdb.org/tunnel/static/HMP2/WGS/1818/CSM7KOJG.tar
61 | wget https://ibdmdb.org/tunnel/static/HMP2/WGS/1818/CSM7KOJE.tar
62 |
63 | # patient with no IBD
64 | wget https://ibdmdb.org/tunnel/static/HMP2/WGS/1818/HSM5MD5B.tar
65 | wget https://ibdmdb.org/tunnel/static/HMP2/WGS/1818/HSM5MD5D.tar
66 | wget https://ibdmdb.org/tunnel/static/HMP2/WGS/1818/HSM6XRSX.tar
67 | ```
68 |
69 | if you do `ls` now, you should see the following:
70 |
71 | ```
72 | CSM7KOJE.tar CSM7KOJO.tar HSM5MD5D.tar
73 | CSM7KOJG.tar HSM5MD5B.tar HSM6XRSX.tar
74 | ```
75 |
76 | Untar each set of files, repeating for each of the six `.tar` files (or use the loop shown after this example):
77 | ```
78 | tar xf CSM7KOJO.tar
79 | ```
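To untar all six archives in one go, you could loop over them. A small sketch, run from inside `raw_data`:

```
for tarfile in *.tar
do
  tar xf ${tarfile}
done
```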
80 |
81 |
82 |
83 | Now, let's remove write permission on the files so they are difficult to modify or delete accidentally:
84 |
85 | ```
86 | chmod u-w *fastq.gz
87 | ```
88 |
89 | ## FASTQ format
90 |
91 | Although it looks complicated (and it is), we can understand the
92 | [fastq](https://en.wikipedia.org/wiki/FASTQ_format) format with a little decoding. Some rules about the format
93 | include...
94 |
95 | |Line|Description|
96 | |----|-----------|
97 | |1|Always begins with '@' and then information about the read|
98 | |2|The actual DNA sequence|
99 | |3|Always begins with a '+' and sometimes the same info in line 1|
100 | |4|Has a string of characters which represent the quality scores; must have same number of characters as line 2|
101 |
102 | We can view the first complete read in a fastq file by using `head` to look at the first four lines.
103 | Because our files are gzipped, we first temporarily decompress them with `zcat`.
104 |
105 | ```
106 | zcat CSM7KOJE_R1.fastq.gz | head -n 4
107 | ```
108 |
109 | The first four lines of the file look something like this:
110 |
111 | *Note: this is a different dataset, so yours will look slightly different, though the formatting is the same*
112 |
113 | ```
114 | @SRR2584863.1 HWI-ST957:244:H73TDADXX:1:1101:4712:2181/1
115 | TTCACATCCTGACCATTCAGTTGAGCAAAATAGTTCTTCAGTGCCTGTTTAACCGAGTCACGCAGGGGTTTTTGGGTTACCTGATCCTGAGAGTTAACGGTAGAAACGGTCAGTACGTCAGAATTTACGCGTTGTTCGAACATAGTTCTG
116 | +
117 | CCCFFFFFGHHHHJIJJJJIJJJIIJJJJIIIJJGFIIIJEDDFEGGJIFHHJIJJDECCGGEGIIJFHFFFACD:BBBDDACCCCAA@@CA@C>C3>@5(8&>C:9?8+89<4(:83825C(:A#########################
118 | ```
122 |
123 | Line 4 shows the quality for each nucleotide in the read.
124 | Quality is interpreted as the probability of an incorrect base call (e.g. 1 in 10) or, equivalently, the base call accuracy (e.g. 90%).
125 | To make it possible to line up each individual nucleotide with its quality score, the numerical score is converted into a code where each individual character represents the numerical quality score for an individual nucleotide. For example, in the read above, the quality score line is:
126 |
127 | ```
128 | CCCFFFFFGHHHHJIJJJJIJJJIIJJJJIIIJJGFIIIJEDDFEGGJIFHHJIJJDECCGGEGIIJFHFFFACD:BBBDDACCCCAA@@CA@C>C3>@5(8&>C:9?8+89<4(:83825C(:A#########################
129 | ```
130 |
131 | The numerical value assigned to each of these characters depends on the sequencing platform that generated the reads. The sequencing machine used to generate our data uses the standard Sanger quality PHRED score encoding, using Illumina version 1.8 onwards.
132 | Each character is assigned a quality score between 0 and 41 as shown in the chart below.
133 |
134 | ```
135 | Quality encoding: !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJ
136 | | | | | |
137 | Quality score: 01........11........21........31........41
138 | ```
139 |
140 |
141 | Each quality score represents the probability that the corresponding nucleotide call is incorrect.
142 | This quality score is logarithmically based, so a quality score of 10 reflects a base call accuracy of 90%, but a quality score of 20 reflects a base call accuracy of 99%.
143 | These probability values are the results from the base calling algorithm and depend on how much signal was captured for the base incorporation.
144 |
145 | Looking back at our example read:
146 |
147 | ```
148 | @SRR2584863.1 HWI-ST957:244:H73TDADXX:1:1101:4712:2181/1
149 | TTCACATCCTGACCATTCAGTTGAGCAAAATAGTTCTTCAGTGCCTGTTTAACCGAGTCACGCAGGGGTTTTTGGGTTACCTGATCCTGAGAGTTAACGGTAGAAACGGTCAGTACGTCAGAATTTACGCGTTGTTCGAACATAGTTCTG
150 | +
151 | CCCFFFFFGHHHHJIJJJJIJJJIIJJJJIIIJJGFIIIJEDDFEGGJIFHHJIJJDECCGGEGIIJFHFFFACD:BBBDDACCCCAA@@CA@C>C3>@5(8&>C:9?8+89<4(:83825C(:A#########################
152 | ```
153 |
154 | we can now see that there is a range of quality scores, but that the end of the sequence is very poor (`#` = a quality score of 2).
155 | How does the first read in `CSM7KOJE_R1.fastq.gz` compare to this example?
156 |
157 | ## Assessing Quality with FastQC
158 |
159 | For the most part, you won't be assessing the quality of all your reads by visually inspecting your FASTQ files.
160 | Rather, you'll be using a software program to assess read quality and filter out poor quality reads.
161 | We'll first use a program called [FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) to visualize the quality of our reads.
162 |
163 | FastQC has a number of features which can give you a quick impression of any problems your data may have, so you can take these issues into consideration before moving forward with your analyses.
164 | Rather than looking at quality scores for each individual read, FastQC looks at quality collectively across all reads within a sample.
165 | The image below shows one FastQC-generated plot that indicates a very high quality sample:
166 |
167 | 
168 |
169 | The x-axis displays the base position in the read, and the y-axis shows quality scores. In this
170 | example, the sample contains reads that are 40 bp long. This is much shorter than the reads we
171 | are working with in our workflow. For each position, there is a box-and-whisker plot showing
172 | the distribution of quality scores for all reads at that position. The horizontal red line
173 | indicates the median quality score and the yellow box shows the 1st to
174 | 3rd quartile range. This means that 50% of reads have a quality score that falls within the
175 | range of the yellow box at that position. The whiskers show the absolute range, which covers
176 | the lowest (0th quartile) to highest (4th quartile) values.
177 |
178 | For each position in this sample, the quality values do not drop much lower than 32. This
179 | is a high quality score. The plot background is also color-coded to identify good (green),
180 | acceptable (yellow), and bad (red) quality scores.
181 |
182 | Now let's take a look at a quality plot on the other end of the spectrum.
183 |
184 | 
185 |
186 | Here, we see positions within the read in which the boxes span a much wider range.
187 | Also, quality scores drop quite low into the "bad" range, particularly on the tail end of the reads.
188 | The FastQC tool produces several other diagnostic plots to assess sample quality, in addition to the one plotted above.
189 |
190 | ## Running FastQC
191 |
192 | We will now assess the quality of the reads that we downloaded. First, make sure you're still in the `raw_data` directory
193 |
194 | ```
195 | cd ~/2020-NSURP/raw_data
196 | ```
197 |
198 | Next, activate the conda environment we created in the [Install Conda](02.conda.md) lesson.
199 | ```
200 | conda activate nsurp-env
201 | ```
202 |
203 | Now, use conda to install fastqc.
204 |
205 | ```
206 | conda install fastqc
207 | ```
208 |
209 | FastQC can accept multiple file names as input, and it works on both zipped and unzipped files, so we can use the \*.fastq* wildcard to run FastQC on all of the FASTQ files in this directory.
210 |
211 | ```
212 | fastqc *.fastq*
213 | ```
214 |
215 | The FastQC program has created several new files within our directory.
216 | For each input FASTQ file, FastQC has created a `.zip` file and a
217 | `.html` file. The `.zip` file extension indicates that this is
218 | actually a compressed set of multiple output files. We'll be working
219 | with these output files soon. The `.html` file is a stable webpage
220 | displaying the summary report for each of our samples.
221 |
222 | ## Transferring data from Farm to your computer
223 |
224 | To transfer a file from a remote server to our own machines, we will use `scp`.
225 | To learn more about `scp`, see the bottom of [this tutorial](https://datacarpentry.org/shell-genomics/05-writing-scripts/).
226 |
227 | Now we can transfer our HTML files to our local computer using `scp`. The `./` indicates that you're transferring files to the directory you're currently working from.
228 |
229 | ```
230 | scp -P 2022 -i /path/to/key/file username@farm.cse.ucdavis.edu:~/2020-NSURP/raw_data/*.html ./
231 | ```
232 |
233 | If you're on a Mac using `zsh`, you may need to replace `scp` with `noglob scp` in the command above.
234 | If you're on Windows, you may need to move the files from the download location on your Linux shell over to the Windows side of your computer before opening.
235 |
236 | Once the file is on your local computer, double click on it and it will open in your browser.
237 | You can now explore the FastQC output.
238 |
239 | ## Decoding the FastQC Output
240 |
241 | We've now looked at quite a few "Per base sequence quality" FastQC graphs, but there are nine other graphs that we haven't talked about!
242 | Below we have provided a brief overview of interpretations for each of these plots.
243 | For more information, please see the FastQC documentation [here](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/)
244 |
245 | + [**Per tile sequence quality**](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/3%20Analysis%20Modules/12%20Per%20Tile%20Sequence%20Quality.html): the machines that perform sequencing are divided into tiles. This plot displays patterns in base quality along these tiles. Consistently low scores are often found around the edges, but hot spots can also occur in the middle if an air bubble was introduced at some point during the run.
246 | + [**Per sequence quality scores**](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/3%20Analysis%20Modules/3%20Per%20Sequence%20Quality%20Scores.html): a density plot of quality for all reads at all positions. This plot shows what quality scores are most common.
247 | + [**Per base sequence content**](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/3%20Analysis%20Modules/4%20Per%20Base%20Sequence%20Content.html): plots the proportion of each base position over all of the reads. Typically, we expect to see each base roughly 25% of the time at each position, but this often fails at the beginning or end of the read due to quality or adapter content.
248 | + [**Per sequence GC content**](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/3%20Analysis%20Modules/5%20Per%20Sequence%20GC%20Content.html): a density plot of average GC content in each of the reads.
249 | + [**Per base N content**](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/3%20Analysis%20Modules/6%20Per%20Base%20N%20Content.html): the percent of times that 'N' occurs at a position in all reads. If there is an increase at a particular position, this might indicate that something went wrong during sequencing.
250 | + [**Sequence Length Distribution**](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/3%20Analysis%20Modules/7%20Sequence%20Length%20Distribution.html): the distribution of sequence lengths of all reads in the file. If the data is raw, there is often one sharp peak; however, if the reads have been trimmed, there may be a distribution of shorter lengths.
251 | + [**Sequence Duplication Levels**](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/3%20Analysis%20Modules/8%20Duplicate%20Sequences.html): A distribution of duplicated sequences. In sequencing, we expect most reads to only occur once. If some sequences are occurring more than once, it might indicate enrichment bias (e.g. from PCR). If the samples are high coverage (or RNA-seq or amplicon), this might not be true.
252 | + [**Overrepresented sequences**](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/3%20Analysis%20Modules/9%20Overrepresented%20Sequences.html): A list of sequences that occur more frequently than would be expected by chance.
253 | + [**Adapter Content**](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/3%20Analysis%20Modules/10%20Adapter%20Content.html): a graph indicating where adapter sequences occur in the reads.
254 | + [**K-mer Content**](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/3%20Analysis%20Modules/11%20Kmer%20Content.html): a graph showing any sequences which may show a positional bias within the reads.
255 |
256 | ## Extra Info
257 |
258 | If you ever need to download >10 accessions from the SRA, the `sra-toolkit` is a great tool to do this with!
259 | However, we find `sra-toolkit` cumbersome when only a couple accessions need to be downloaded.
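A rough sketch of what that looks like; the accession here is only an example, and `sra-tools` is the bioconda package name for the toolkit:

```
conda install -y sra-tools                # install the sra-toolkit from bioconda
prefetch SRR1211157                       # download the .sra file for one accession
fasterq-dump --split-files SRR1211157     # convert it to paired fastq files
```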
260 |
261 |
262 |
--------------------------------------------------------------------------------
/Snakefile:
--------------------------------------------------------------------------------
1 | ## NSURP IBD Analysis ##
2 | import os  # os.path.join is used below to build output paths
3 | import pandas as pd
4 |
5 | #sample_info = pd.read_csv("ihmp-CD-nonIBD-tiny_subset.csv")
6 | sample_info = pd.read_csv("ihmp-CD-nonIBD-subset.csv")
7 | # need to filter out any "External ID" with underscores in them
8 | sample_info = sample_info[~sample_info["External ID"].str.contains("_")]
9 |
10 | SAMPLES = sample_info["External ID"].tolist()
11 |
12 | #patient_id = sample_info["Participant ID"]
13 | #sample_id = sample_info["External ID"]
14 | #diagnosis = sample_info["diagnosis"]
15 |
16 | #SAMPLES = ["SRR1211157", "SRR1211428", "SRR1211440", "SRR1211568", "SRR1757110", "SRR1765025"]
17 |
18 | #samples_csv= "ibd_samples.csv"
19 | # to do: read the samples from csv instead
20 |
21 | out_dir = "iHMP_project"
22 | logs_dir = os.path.join(out_dir, "logs")
23 |
24 | rule all:
25 | input:
26 | expand(os.path.join(out_dir, "gather", "{alphabet}", "{sample}_x_genbank-k{ksize}.csv"), sample=SAMPLES, ksize=[31], alphabet="dna"), #[21,31,51]
27 | #expand(os.path.join(out_dir, "gather", "{alphabet}", "{sample}_x_gtdb_pep.rep_genus-k{ksize}.csv"), sample=SAMPLES, alphabet="protein", ksize=33),
28 | #expand(os.path.join(out_dir, "gather", "{alphabet}", "{sample}_x_gtdb_pep-k{ksize}.csv"), sample=SAMPLES, alphabet="protein", ksize=33),
29 | #expand(os.path.join(out_dir, "gather", "{alphabet}", "{sample}_x_gtdb_pep.rep_genus-k{ksize}.csv"), sample=SAMPLES, alphabet="dayhoff", ksize=57),
30 | #expand(os.path.join(out_dir, "gather", "{alphabet}", "{sample}_x_gtdb_pep-k{ksize}.csv"), sample=SAMPLES, alphabet="dayhoff", ksize=57),
31 |
32 |
33 |         #expand(os.path.join(out_dir, "megahit", "{sample}_contigs.fa"), sample=SAMPLES),
34 | #expand(os.path.join(out_dir, "gather_to_tax", "{alphabet}", "{sample}_x_gtdb_pep.rep_genus-k{ksize}.gather_summary.csv"), sample=SAMPLES, alphabet="protein", ksize=33),
35 | #expand(os.path.join(out_dir, "gather_to_tax", "{alphabet}", "{sample}_x_gtdb_pep.rep_genus-k{ksize}.gather_summary.csv"), sample=SAMPLES, alphabet="dayhoff", ksize=57),
36 | #expand(os.path.join(out_dir, "compare","{sample}.{alphabet}-k{ksize}.compare.np.matrix.pdf"), sample=SAMPLES, alphabet="dna", ksize=[21,31,51]),
37 | #expand(os.path.join(out_dir, "compare","{sample}.{alphabet}-k{ksize}.compare.np.matrix.pdf"), sample=SAMPLES, alphabet="protein", ksize=[33]),
38 | #expand(os.path.join(out_dir, "compare","{sample}.{alphabet}-k{ksize}.compare.np.matrix.pdf"), sample=SAMPLES, alphabet="dayhoff", ksize=[57]),
39 |
40 | #rule download_hostseqs:
41 | ## http://seqanswers.com/forums/archive/index.php/t-42552.html
42 | # https://drive.google.com/file/d/0B3llHR93L14wd0pSSnFULUlhcUk/edit?usp=sharing
43 | # output: "databases/hg19_main_mask_ribo_animal_allplant_allfungus.fa.gz"
44 | # params:
45 | # download_link="https://osf.io/p6dg4/download"
46 | # log: os.path.join(logs_dir, "download_hostseqs.log")
47 | # shell:
48 | # """
49 | #        wget -O {output} {params.download_link} 2> {log}
50 | # """
51 |
52 | localrules: download_reads
53 | rule download_reads:
54 | output:
55 | tar=os.path.join(out_dir, "raw_data", "{sample}.tar"),
56 | params:
57 | download_link = lambda w: f"https://ibdmdb.org/tunnel/static/HMP2/WGS/1818/{w.sample}.tar"
58 | resources:
59 | mem_mb=lambda wildcards, attempt: attempt *4000,
60 | runtime=600,
61 | shell:
62 | """
63 | curl -L {params.download_link} -o {output.tar}
64 | """
65 |
66 | localrules: untar_reads
67 | rule untar_reads:
68 | input: os.path.join(out_dir, "raw_data", "{sample}.tar"),
69 | output:
70 | r1=os.path.join(out_dir, "raw_data", "{sample}_R1.fastq.gz"),
71 | r2=os.path.join(out_dir, "raw_data", "{sample}_R2.fastq.gz"),
72 | params:
73 | output_dir= os.path.join(out_dir, "raw_data")
74 | resources:
75 | mem_mb=lambda wildcards, attempt: attempt *4000,
76 | runtime=600,
77 | shell:
78 | """
79 | tar -xf {input} --directory {params.output_dir}
80 | """
81 |
82 | rule trim_reads:
83 | input:
84 | in1=os.path.join(out_dir, "raw_data", "{sample}_R1.fastq.gz"),
85 | in2=os.path.join(out_dir, "raw_data", "{sample}_R2.fastq.gz"),
86 | output:
87 | out1=os.path.join(out_dir, "trim", "{sample}_1.trim.fastq.gz"),
88 | out2=os.path.join(out_dir, "trim", "{sample}_2.trim.fastq.gz"),
89 | json=os.path.join(out_dir, "trim", "{sample}.fastp.json"),
90 | html=os.path.join(out_dir, "trim", "{sample}.fastp.html")
91 | conda: 'envs/fastp-env.yml'
92 | resources:
93 | mem_mb=lambda wildcards, attempt: attempt *10000,
94 | runtime=600,
95 | shell:
96 | """
97 | fastp --in1 {input.in1} --in2 {input.in2} \
98 | --out1 {output.out1} --out2 {output.out2} \
99 | --detect_adapter_for_pe --qualified_quality_phred 4 \
100 | --length_required 31 --correction \
101 | --json {output.json} --html {output.html}
102 | """
103 |
104 | rule remove_host:
105 | input:
106 | r1=os.path.join(out_dir, "trim", "{sample}_1.trim.fastq.gz"),
107 | r2=os.path.join(out_dir, "trim", "{sample}_2.trim.fastq.gz"),
108 | human='databases/hg19_main_mask_ribo_animal_allplant_allfungus.fa.gz'
109 | output:
110 | r1 = os.path.join(out_dir, "bbduk", "{sample}_1.nohost.fq.gz"),
111 | r2 = os.path.join(out_dir, "bbduk", "{sample}_2.nohost.fq.gz"),
112 | human_r1=os.path.join(out_dir, "bbduk", "{sample}_1.human.fq.gz"),
113 | human_r2=os.path.join(out_dir, "bbduk", "{sample}_2.human.fq.gz")
114 | conda: 'envs/bbduk-env.yml'
115 | threads: 4
116 | resources:
117 | mem_mb=62500,
118 | runtime=6000,
119 | shell:
120 | """
121 |         bbduk.sh -Xmx64g t={threads} in={input.r1} in2={input.r2} out={output.r1} out2={output.r2} outm={output.human_r1} outm2={output.human_r2} k=31 ref={input.human}
122 | """
123 |
124 | rule kmer_trim:
125 | input:
126 | r1 = os.path.join(out_dir, "bbduk", "{sample}_1.nohost.fq.gz"),
127 | r2 = os.path.join(out_dir, "bbduk", "{sample}_2.nohost.fq.gz"),
128 | output:
129 | os.path.join(out_dir, "kmer-trim", "{sample}.nohost.kmer-trim.pe.fastq.gz"),
130 | conda: 'envs/khmer-env.yml'
131 | resources:
132 | mem_mb=lambda wildcards, attempt: attempt *20000,
133 | runtime=600,
134 | shell:
135 | """
136 | interleave-reads.py {input.r1} {input.r2} |
137 | trim-low-abund.py --gzip -C 3 -Z 18 -M 20e9 -V - -o {output}
138 | """
139 |
140 | ksizesD={"dna": "21,31,51", "protein": "33", "dayhoff": "57"}
141 | scaledD={"dna": "2000", "protein": "100", "dayhoff": "100"}
142 |
143 | rule sourmash_compute:
144 | input: rules.kmer_trim.output
145 | output: os.path.join(out_dir, "sourmash_signatures", "{alphabet}", "{sample}.nohost.kmer-trim.pe.sig")
146 | params:
147 | scaled= lambda w: scaledD[w.alphabet], #2000,
148 | k=lambda w: ksizesD[w.alphabet], #"21,31,51,33,57",
149 | alpha_cmd= lambda w: "--" + w.alphabet, # --dna --protein --dayhoff",
150 | abund_cmd= "--track-abundance"
151 | log: os.path.join(logs_dir, "sourmash", "{alphabet}", "{sample}.nohost.kmer-trim.pe.compute.log")
152 | conda: "envs/sourmash-env.yml"
153 | resources:
154 | mem_mb=lambda wildcards, attempt: attempt *5000,
155 | runtime=600,
156 | shell:
157 | """
158 | sourmash compute -k {params.k} --scaled={params.scaled} \
159 | {input} -o {output} {params.alpha_cmd} {params.abund_cmd} --merge={wildcards.sample:q} 2> {log}
160 | """
161 |
162 | rule sourmash_compare:
163 | input: expand(os.path.join(out_dir, "sourmash_signatures", "{{alphabet}}", "{sample}.nohost.kmer-trim.pe.sig"), sample=SAMPLES)
164 | output:
165 | csv = os.path.join(out_dir, "compare", "{sample}.{alphabet}-k{ksize}.compare.csv"),
166 | np = os.path.join(out_dir, "compare", "{sample}.{alphabet}-k{ksize}.compare.np")
167 | params:
168 | alpha_cmd = lambda wildcards: "--" + wildcards.alphabet
169 | conda: "envs/sourmash-env.yml"
170 | resources:
171 | mem_mb=lambda wildcards, attempt: attempt *5000,
172 | runtime=60,
173 | shell:
174 | """
175 |         sourmash compare {input} -k {wildcards.ksize} -o {output.np} --csv {output.csv} --ignore-abundance {params.alpha_cmd}
176 | """
177 |
178 | localrules: sourmash_plot
179 |
180 | rule sourmash_plot:
181 | input: rules.sourmash_compare.output.np
182 | output:
183 | matrix = os.path.join(out_dir, "compare","{sample}.{alphabet}-k{ksize}.compare.np.matrix.pdf")
184 | conda: "envs/sourmash-env.yml"
185 | #resources:
186 | # mem_mb=lambda wildcards, attempt: attempt *1000,
187 | # runtime=60,
188 | shell:
189 | """
190 |         sourmash plot --pdf --labels --output-dir $(dirname {output.matrix}) {input}
191 | """
192 |
193 | rule sourmash_gather_genbank:
194 | input:
195 | sig=rules.sourmash_compute.output,
196 | ref="databases/genbank-k{ksize}.sbt.zip"
197 | output:
198 | csv=os.path.join(out_dir, "gather", "{alphabet}", "{sample}_x_genbank-k{ksize}.csv")
199 | conda: "envs/sourmash-env.yml"
200 | params:
201 | #threshold="10000000",
202 | alpha_cmd=lambda wildcards: "--" + wildcards.alphabet
203 | resources:
204 | mem_mb=lambda wildcards, attempt: attempt *10000,
205 | runtime=6000,
206 | shell:
207 | """
208 | sourmash gather {input.sig} {input.ref} -o {output.csv} {params.alpha_cmd} -k {wildcards.ksize}
209 | """
210 | # --threshold-bp {params.threshold}
211 |
212 |
213 | pepD_full={"protein": "databases/gtdb_pep.protein_scaled100_k11.index.sbt.zip", "dayhoff":"databases/gtdb_pep.dayhoff_scaled100_k19.index.sbt.zip"}
214 | pepD={"protein": "databases/gtdb_pep.rep_genus.protein_scaled100_k11.sbt.zip", "dayhoff":"databases/gtdb_pep.rep_genus.dayhoff_scaled100_k19.sbt.zip"}
215 |
216 | rule gather_gtdb_pep_rep:
217 | input:
218 | query=os.path.join(out_dir, "sourmash_signatures", "{alphabet}", "{sample}.nohost.kmer-trim.pe.sig"),
219 | db=lambda w: pepD[w.alphabet]
220 | output:
221 | csv = os.path.join(out_dir, "gather", "{alphabet}", "{sample}_x_gtdb_pep.rep_genus-k{ksize}.csv"),
222 | matches = os.path.join(out_dir, "gather", "{alphabet}", "{sample}_x_gtdb_pep.rep_genus-k{ksize}.matches"),
223 | unassigned = os.path.join(out_dir, "gather", "{alphabet}", "{sample}_x_gtdb_pep.rep_genus-k{ksize}.unassigned")
224 | params:
225 | alpha_cmd = lambda w: "--" + w.alphabet,
226 | scaled = 100,
227 | resources:
228 | mem_mb=lambda wildcards, attempt: attempt *10000,
229 | runtime=600,
230 | log: os.path.join(logs_dir, "gather", "{alphabet}-k{ksize}", "{sample}_x_gtdb_pep.rep_genus.{alphabet}-k{ksize}.gather.log")
231 | benchmark: os.path.join(logs_dir, "gather","{alphabet}-k{ksize}", "{sample}_x_gtdb_pep.rep_genus.{alphabet}-k{ksize}.gather.benchmark")
232 | conda: "envs/sourmash-env.yml"
233 | shell:
234 | # --ignore-abundance to turn abund off
235 | """
236 | sourmash gather {input.query} {input.db} -o {output.csv} {params.alpha_cmd} \
237 | --save-matches {output.matches} \
238 | --output-unassigned {output.unassigned} \
239 | --scaled {params.scaled} \
240 | -k {wildcards.ksize} 2> {log}
241 | touch {output}
242 | """
243 |
244 | rule gather_gtdb_pep_full:
245 | input:
246 | query=os.path.join(out_dir, "sourmash_signatures", "{alphabet}", "{sample}.nohost.kmer-trim.pe.sig"),
247 | db=lambda w: pepD_full[w.alphabet]
248 | output:
249 | csv = os.path.join(out_dir, "gather", "{alphabet}", "{sample}_x_gtdb_pep-k{ksize}.csv"),
250 | matches = os.path.join(out_dir, "gather", "{alphabet}", "{sample}_x_gtdb_pep-k{ksize}.matches"),
251 | unassigned = os.path.join(out_dir, "gather", "{alphabet}", "{sample}_x_gtdb_pep-k{ksize}.unassigned")
252 | params:
253 | alpha_cmd = lambda w: "--" + w.alphabet,
254 | scaled = 100,
255 | resources:
256 | mem_mb=lambda wildcards, attempt: attempt *10000,
257 | runtime=6000,
258 | log: os.path.join(logs_dir, "gather", "{alphabet}-k{ksize}", "{sample}_x_gtdb_pep.{alphabet}-k{ksize}.gather.log")
259 | benchmark: os.path.join(logs_dir, "gather","{alphabet}-k{ksize}", "{sample}_x_gtdb_pep.{alphabet}-k{ksize}.gather.benchmark")
260 | conda: "envs/sourmash-env.yml"
261 | shell:
262 | # --ignore-abundance to turn abund off
263 | """
264 | sourmash gather {input.query} {input.db} -o {output.csv} {params.alpha_cmd} \
265 | --save-matches {output.matches} \
266 | --output-unassigned {output.unassigned} \
267 | --scaled {params.scaled} \
268 | -k {wildcards.ksize} 2> {log}
269 | touch {output}
270 | """
271 |
272 | rule gather_to_tax_pep_rep:
273 | input:
274 | gather_csv = os.path.join(out_dir, "gather", "{alphabet}", "{sample}_x_gtdb_pep.rep_genus-k{ksize}.csv"),
275 | lineages_csv = "databases/gtdb-lineages.protein-filenames.representative-at-genus.csv"
276 | output:
277 | gather_tax = os.path.join(out_dir, "gather_to_tax", "{alphabet}", "{sample}_x_gtdb_pep.rep_genus-k{ksize}.gather_summary.csv"),
278 | top_matches = os.path.join(out_dir, "gather_to_tax", "{alphabet}", "{sample}_x_gtdb_pep.rep_genus-k{ksize}.gather_tophits.csv"),
279 | log: os.path.join(logs_dir, "gather_to_tax", "{alphabet}-k{ksize}", "{sample}_x_gtdb_pep.rep_genus.{alphabet}-k{ksize}.gather-to-tax.log")
280 | benchmark: os.path.join(logs_dir, "gather_to_tax", "{alphabet}-k{ksize}","{sample}_x_gtdb_pep.rep_genus.{alphabet}-k{ksize}.gather-to-tax.benchmark")
281 | group: "gather"
282 | resources:
283 | mem_mb=lambda wildcards, attempt: attempt *10000,
284 | runtime=200,
285 | conda: "envs/sourmash-env.yml"
286 | shell:
287 | """
288 | python scripts/gather-to-tax.py {input.gather_csv} {input.lineages_csv} --tophits-csv {output.top_matches} > {output.gather_tax} 2> {log}
289 | """
290 |
291 |
292 | #rule gather_to_tax_dna:
293 | # input:
294 | #        gather_csv = os.path.join(out_dir, "gather", "{sample}_x_genbank-k{ksize}.csv"),
295 | # #lineages_csv = lambda w: refInfo[moltype_map[w.alphabet]][w.db_name]["lineages_csv"]
296 | # output:
297 | # gather_tax = os.path.join(gather_dir, "{db_name}.{alphabet}-k{ksize}", "{genome}_x_{db_name}.{alphabet}-k{ksize}.gather_summary.csv"),
298 | # top_matches = os.path.join(gather_dir, "{db_name}.{alphabet}-k{ksize}", "{genome}_x_{db_name}.{alphabet}-k{ksize}.gather_tophits.csv")
299 | # log: os.path.join(logs_dir, "gather_to_tax", "{db_name}.{alphabet}-k{ksize}", input_type, "{genome}_x_{db_name}.{alphabet}-k{ksize}.gather-to-tax.log")
300 | # benchmark: os.path.join(logs_dir, "gather_to_tax", "{db_name}.{alphabet}-k{ksize}","{genome}_x_{db_name}.{alphabet}-k{ksize}.gather-to-tax.benchmark")
301 | # resources:
302 | # mem_mb=lambda wildcards, attempt: attempt *10000,
303 | # runtime=200,
304 | # conda: "envs/sourmash-env.yml"
305 | # shell:
306 | # """
307 | # python scripts/gather-to-tax.py {input.gather_csv} {input.lineages_csv} --tophits-csv {output.top_matches} > {output.gather_tax} 2> {log}
308 | # """
309 |
310 | #rule megahit_assemble:
311 | # input:
312 | # os.path.join(out_dir, "kmer-trim", "{sample}.nohost.kmer-trim.pe.fastq.gz")
313 | # output:
314 | #        os.path.join(out_dir, "megahit", "{sample}_contigs.fa")
315 | # message:
316 | # """### Assembling read data with MEGAHIT ### """
317 | # params:
318 | # megahit_dir=os.path.join(out_dir, "megahit")
319 | # threads: 10
320 | ## resources:
321 | # mem_mb=20000,
322 | # runtime=6000
323 | # log: os.path.join(logs_dir, "megahit", "{sample}_megahit.log")
324 | # benchmark: os.path.join(logs_dir, "megahit", "{sample}_megahit.benchmark")
325 | # conda: "envs/megahit-env.yaml"
326 | # shell:
327 | # """
328 | # megahit --12 {input} -o {params.megahit_dir} --out_prefix {wildcards.sample} -t {threads}
329 | # """
330 |
--------------------------------------------------------------------------------