├── README.md
├── _config.yml
├── activities
    └── practice_exercise.md
├── answer_key
    ├── exp_design_table_answer_key.xlsx
    └── salmon_all_samples.sbatch
├── assets
    ├── css
    │   └── style.scss
    └── images
    │   └── dna-sequence-1600x800.jpg
├── data
    └── exp_design_table.xlsx
├── fastqc
    ├── Icons
    │   ├── error.png
    │   ├── fastqc_icon.png
    │   ├── tick.png
    │   └── warning.png
    ├── Images
    │   ├── duplication_levels copy.png
    │   ├── duplication_levels.png
    │   ├── kmer_profiles.png
    │   ├── per_base_gc_content.png
    │   ├── per_base_n_content.png
    │   ├── per_base_quality.png
    │   ├── per_base_sequence_content.png
    │   ├── per_sequence_gc_content.png
    │   ├── per_sequence_quality.png
    │   └── sequence_length_distribution.png
    └── Mov10oe_1-fastqc_report.html
├── img
    ├── FastQC_contam.png
    ├── FastQC_seq_qual.png
    ├── FileZilla_click.gif
    ├── FileZilla_information.gif
    ├── Filezilla_step1.png
    ├── Filezilla_step2.png
    ├── Gene_products.png
    ├── Gene_structure.png
    ├── IGV_mov10.png
    ├── O2_login.gif
    ├── O2_primary-storage.png
    ├── Overrepresented_sequences_adaptor_only.png
    ├── Overrepresented_sequences_homopolymers.png
    ├── Per_base_sequence_content_bad.png
    ├── Per_sequence_GC_content_bad.png
    ├── Per_sequence_GC_content_good.png
    ├── Per_sequence_quality_scores_bad.png
    ├── Per_sequence_quality_scores_good.png
    ├── QC_workflow_Sept2018.png
    ├── README.md
    ├── RNA-seq_library_prep.png
    ├── RNAseqWorkflow.png
    ├── R_screenshot.png
    ├── R_screenshot2.png
    ├── Rstudio_interface.png
    ├── SAM_file.png
    ├── Slide1.jpg
    ├── alignment_STAR_step1.ai
    ├── alignment_STAR_step1.png
    ├── alignment_STAR_step2.ai
    ├── alignment_STAR_step2.png
    ├── alignment_STAR_step3.ai
    ├── alignment_STAR_step3.png
    ├── alignment_STAR_step4.ai
    ├── alignment_STAR_step4.png
    ├── alignment_STAR_step5.ai
    ├── alignment_STAR_step5.png
    ├── alignmentfree_workflow_aug2017.png
    ├── alignmentfree_workflow_june2017.png
    ├── bad_quality.png
    ├── base_calling.png
    ├── batch_effect.png
    ├── batch_effect_pca.png
    ├── bioconductor_logo.png
    ├── bitwiseflags.png
    ├── cigar_strings.png
    ├── clonal_amplification.png
    ├── cluster_generation.png
    ├── complete_wd_setup.png
    ├── confounded_batch.png
    ├── confounded_design.png
    ├── console.png
    ├── corr_map.png
    ├── count-fig1.png
    ├── count-fig2.png
    ├── count-matrix.png
    ├── count_matrix.png
    ├── counts-workflow.jpg
    ├── counts-workflow.png
    ├── counts_view.png
    ├── cran_packages.png
    ├── data-lifecycle-base.png
    ├── data_life_cycle_gouldv2.png
    ├── de_norm_counts_var.png
    ├── de_replicates_img.png
    ├── de_replicates_img2.png
    ├── de_variation.png
    ├── demultiplexing.png
    ├── drawings.pptx
    ├── environment.png
    ├── exp_design.png
    ├── factors.png
    ├── factors_both.png
    ├── factors_new.png
    ├── factors_sm.png
    ├── factors_sm_intact.png
    ├── fastqc_GC.png
    ├── fastqc_basic_stats.png
    ├── fastqc_duplication.png
    ├── fastqc_over-represented_sequences.png
    ├── fastqc_per_base_sequence_content.png
    ├── fastqc_per_sequence_quality_scores.png
    ├── fastqc_results.png
    ├── fastqc_summary.png
    ├── feature-overlap.png
    ├── filezilla_login.png
    ├── filezilla_setup.png
    ├── flow_cell_oligos.png
    ├── flow_cells.png
    ├── full_workflow_2019.png
    ├── full_workflow_Sept2018.png
    ├── full_workflow_qualimap_2019.png
    ├── gProfiler.png
    ├── gene_expression2.png
    ├── gene_expression_cells.png
    ├── genemania.png
    ├── getwd.png
    ├── good_quality.png
    ├── gvng.jpg
    ├── igv_screenshot.png
    ├── illumina_platforms.png
    ├── illumina_sequencing.png
    ├── illumina_sequencing_process.png
    ├── libraryprep_step1-2.png
    ├── libraryprep_step3.png
    ├── libraryprep_step4-5.png
    ├── libraryprep_step6.png
    ├── long_read_tech.png
    ├── metadata_batch.png
    ├── multiqc_GC_content.png
    ├── multiqc_alignment_scores.png
    ├── multiqc_alignment_scores1.png
    ├── multiqc_columns.png
    ├── multiqc_coverage_profile.png
    ├── multiqc_coverage_profile1.png
    ├── multiqc_duplicates.png
    ├── multiqc_table.png
    ├── multiqc_table1.png
    ├── multithreaded_hpc_3samples.png
    ├── non_confounded_design.png
    ├── paired-end_data.png
    ├── paired_end_reads.png
    ├── pca_plot.png
    ├── permission-directory.png
    ├── placeholder.png
    ├── pseudo_count_comparison-cufflinks.png
    ├── pseudo_count_comparison-sailfish.png
    ├── pseudo_count_comparison-sailfish_sm.png
    ├── pseudo_count_comparison-star.png
    ├── pseudo_count_comparison-star_sm.png
    ├── pseudo_count_comparison.gif
    ├── pseudo_count_comparison.png
    ├── qc_cycles_lost.png
    ├── qc_manifold_burst.png
    ├── qc_overclustering.png
    ├── qc_phasing.png
    ├── qc_read2_failed.png
    ├── qc_signal_decay.png
    ├── qc_troubleshooting.png
    ├── qualimap_coverage_profile.png
    ├── qualimap_genomic _origin.png
    ├── qualimap_genomic_feature.png
    ├── qualimap_genomic_origin.png
    ├── qualimap_genomic_origin1.png
    ├── qualimap_junctions.png
    ├── qualimap_read_alignment.png
    ├── qualimap_transcript_coverage.png
    ├── r_starting_how_it_should_like.png
    ├── replicates.png
    ├── rnaseq_salmon_workflow.png
    ├── rnaseq_workflow.png
    ├── rnaseq_workflow_FASTQC.png
    ├── rnaseq_workflow_trimming.png
    ├── rrna.png
    ├── rstudio_logo.png
    ├── salmon_plot_multiqc.png
    ├── salmon_plot_multiqc1.png
    ├── salmon_quasialignment.png
    ├── salmon_rstudio.png
    ├── salmon_workflow.png
    ├── salmon_workflow_subset.png
    ├── sam_bam.png
    ├── sam_bam2.png
    ├── sam_bam3.png
    ├── sbs_image.png
    ├── scratch3_best-practice.png
    ├── scratch_recommended_practice.png
    ├── serial_hpc_3samples.png
    ├── star.png
    ├── teachin-team.png
    ├── union.png
    ├── vim_insert.png
    ├── vim_postsave.png
    ├── vim_quit.png
    ├── vim_save.png
    ├── vim_spider.png
    ├── vim_spider_number.png
    ├── why_R.png
    ├── workflow_align_qualimap.png
    ├── workflow_alignment.png
    ├── workflow_salmon.png
    └── wrap_option.png
├── lectures
    ├── 2_day
    │   ├── HPC_intro_O2.pdf
    │   ├── Intro_to_workshop.pdf
    │   ├── RNAseq-analysis-methods.pdf
    │   ├── Wrap_up.pdf
    │   └── rna-seq_design.pdf
    ├── Intro_to_workshop.pdf
    ├── RNA-seq_troubleshooting.pdf
    ├── alignment_quantification.pdf
    ├── expression_quantification.pdf
    ├── workflow_overview.pdf
    ├── workshop_intro_slides.pdf
    ├── workshop_wrapup.pdf
    └── workshop_wrapup_slides.pdf
├── lessons
    ├── 01_intro-to-RNAseq.md
    ├── 02_experimental_planning_considerations.md
    ├── 03_working_on_HPC.md
    ├── 04a_data_organization.md
    ├── 04b_data_organization.md
    ├── 05_qc_running_fastqc_interactively.md
    ├── 06_qc_running_fastqc_sbatch.md
    ├── 07_qc_fastqc_assessment.md
    ├── 08_quasi_alignment_salmon.md
    ├── 09_quasi_alignment_salmon_sbatch.md
    ├── 10_QC_Qualimap.md
    ├── 11_multiQC.md
    ├── 12_automating_workflow.md
    ├── 2day_rnaseq_workflow.md
    ├── DE_analysis.md
    ├── QC_STAR_and_Qualimap_run.md
    ├── STAR Alignment Strategy.md
    ├── STAR_alignment.md
    ├── STAR_alignment_strategy.md
    ├── alignment_quality.md
    ├── counting_reads.md
    ├── fastqc-troubleshooting.md
    ├── more_bash_cluster.md
    ├── rnaseq_workflow.md
    ├── sam.md
    ├── shell_review.md
    ├── shell_review_answer_key.md
    ├── test.md
    └── working_on_HPC_noExercises.md
├── multiqc
    └── multiqc_report_rnaseq.html
├── schedule
    ├── README.md
    └── links-to-lessons.md
└── scripts
    ├── PE-rnaseq_analysis_on_allfiles_for-slurm.sh
    ├── PE-rnaseq_analysis_on_input_file.sh
    ├── mov10_fastqc.run
    ├── rnaseq_analysis_on_allfiles_for-slurm.sh
    ├── rnaseq_analysis_on_input_file.sh
    ├── salmon_all_files_PE.sh
    └── star_genome_index.run


/README.md:
--------------------------------------------------------------------------------
 1 | # Introduction to bulk RNA-seq: From reads to count matrix
 2 | 
 3 | | Audience | Computational skills required | Duration |
 4 | :----------|:----------|:----------|
 5 | | Biologists | [Shell for Bioinformatics](https://hbctraining.github.io/Shell-for-bioinformatics/) | 3-session online workshop (~7.5 hours of trainer-led time) |
 6 | 
 7 | ### Description
 8 | 
 9 | This repository has teaching materials for a 3-day **Introduction to bulk RNA-seq: From reads to count matrix** workshop. This workshop focuses on teaching basic computational skills to enable the effective use of a high-performance computing environment to implement an RNA-seq data analysis workflow. In addition to running the RNA-seq workflow from FASTQ files to count data using Salmon, the workshop covers best practice guidelines for RNA-seq experimental design and data organization/management.
10 | 
11 | > **Pre-requisite for this workshop:** The *Basic Data Skills* [Shell for Bioinformatics](https://hbctraining.github.io/Shell-for-bioinformatics/) workshop or a working knowledge of the command line and cluster computing.
12 | 
13 | **Note for Trainers:** Please note that the schedule linked below assumes that learners will spend 3-4 hours between classes reading through and completing exercises from selected lessons.
14 | 
15 | > These materials were developed for a trainer-led workshop, but are also amenable to self-guided learning.
16 | 
17 | ### Learning Objectives
18 | 
19 | 1.	Utilize the command line interface (bash) and HPC for analyzing high-throughput sequencing data.
20 | 2.	Understand best practices for designing an RNA-seq experiment
21 | 3.	Perform read-level QC on bulk RNA-seq data
22 | 4.	Quantify reads from bulk RNA-seq to generate a counts matrix
23 | 
24 | ### Lessons
25 | 
26 | * [Workshop schedule (trainer-led learning)](schedule/)
27 | * [Self-learning](schedule/links-to-lessons.md)
28 | 
29 | ### Installation Requirements
30 | 
31 | ***All:***
32 | 
33 | * [FileZilla Client](https://filezilla-project.org/download.php?type=client) (make sure you get 'FileZilla Client')
34 | 
35 | ***Mac users:***
36 | 
37 | * Plain text editor like [Sublime text](http://www.sublimetext.com/) or similar
38 | 
39 | ***Windows users:***
40 | 
41 | * [GitBash](https://git-scm.com/download/win)
42 | * Plain text editor like [Notepad++](http://notepad-plus-plus.org/) or similar
43 | 
44 | 
45 | ---
46 | 
47 | ### Citation
48 | 
49 | To cite material from this course in your publications, please use:
50 | 
51 | > Mary E. Piper, Meeta Mistry, Jihe Liu, William J. Gammerdinger, & Radhika S. Khetani. (2022, January 10). hbctraining/Intro-to-rnaseq-hpc-salmon-flipped: Introduction to RNA-seq using Salmon Lessons from HCBC (first release). Zenodo. https://doi.org/10.5281/zenodo.5833880. RRID:SCR_025373.
52 | 
53 | 
54 | A lot of time and effort went into the preparation of these materials. Citations help us understand the needs of the community, gain recognition for our work, and attract further funding to support our teaching activities. Thank you for citing this material if it helped you in your data analysis.
55 | 
56 | ---
57 | 
58 | *These materials have been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.*
59 | 
60 | * *Some materials used in these lessons were derived from work that is Copyright © Data Carpentry (http://datacarpentry.org/). 
61 | All Data Carpentry instructional material is made available under the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0).*
62 | 


--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-cayman  # Jekyll theme; assets/css/style.scss imports it via site.theme
2 | title: Introduction to RNA-Seq using high-performance computing  # site title (presumably rendered by the theme - confirm)
3 | google_analytics: UA-150953419-1  # analytics tracking ID; NOTE(review): "UA-" looks like a Universal Analytics property - confirm it still collects data
4 | 


--------------------------------------------------------------------------------
/activities/practice_exercise.md:
--------------------------------------------------------------------------------
 1 | _**To perform this exercise you will need an O2 account. You can request an account by following the instructions on [O2's account request page](https://harvardmed.service-now.com/stat?id=service_catalog_cards&sys_id=5165e1dbdb209050b642f27139961979&sysparm_category=991a7f2edb890c10b642f2713996196a).**_
 2 | 
 3 | ## Running the RNA-seq workflow
 4 | 
 5 | We have downloaded the raw FASTQ files from the SRA for the sequencing data used in the paper: [Silencing SMOC2 ameliorates kidney fibrosis by inhibiting fibroblast to myofibroblast transformation](https://pubmed.ncbi.nlm.nih.gov/28422762/). The paper explores kidney fibrosis in wildtype and SMOC2-overexpressing mice. 
 6 | 
 7 | >_**NOTE:** If you are interested in downloading other datasets from the SRA, we have [materials](https://hbctraining.github.io/Accessing_public_genomic_data/lessons/downloading_from_SRA.html) available detailing how to do this._
 8 | 
 9 | ### Set-up
10 | 1. Copy the compressed experimental data folder from `/n/groups/hbctraining/kidney_fibrosis_rnaseq.tar.gz` to your own `/n/scratch3/users/ecommonsID` directory.
11 | 2. Extract the directory using the command `tar -xzvf kidney_fibrosis_rnaseq.tar.gz`. This command may take a while to run.
12 | 3. Look inside the directory; you should find the following:
13 | 
14 |     - a `raw_fastq` folder containing the raw fastq files
15 |     - a `meta` folder with a metadata file containing information about each of the samples
16 | 4. Create a `reference_data` folder and download the transcriptome FASTA file for mouse to the folder. 
17 | 
18 |     - For Ensembl references, go to [http://useast.ensembl.org/info/data/ftp/index.html](http://useast.ensembl.org/info/data/ftp/index.html)
19 |     - Find the mouse species row and click on the *FASTA* link in the **cDNA (FASTA)** column. 
20 |     - Right-click on the link for the `*cdna.all.fa.gz` file to copy it.
21 |     - Navigate to the `reference_data` folder and run the command `wget <paste contents of link>`. This should download the transcriptome FASTA file to the directory.
22 |     - Extract the `*cdna.all.fa.gz` file by running the code: `gzip -d *cdna.all.fa.gz`.
23 | 5. Set up any additional folders you expect to need (e.g. results) for your project, i.e. create subdirectories and additional directories wherever you feel they are necessary.
24 | 
25 | ### Analysis
26 | Using the workflow and submission scripts we generated in class, parallelize the RNA-Seq analysis of all files in this dataset. For each FASTQ file you will need to perform the following:
27 | 
28 |   - Run FastQC
29 |   - Generate abundance estimates with Salmon
30 |   - Evaluate the MultiQC report
31 |  
32 |   **HINT: You will need to create a mouse index for Salmon.** 
33 | 


--------------------------------------------------------------------------------
/answer_key/exp_design_table_answer_key.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/answer_key/exp_design_table_answer_key.xlsx


--------------------------------------------------------------------------------
/answer_key/salmon_all_samples.sbatch:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Slurm batch script: run `salmon quant` on every .fq file found in
 4 | # ~/rnaseq/raw_data, one sample after another inside a single job.
 5 | # Each sample's output goes to ~/rnaseq/results/salmon/<sample>_salmon.
 6 | 
 7 | #SBATCH -p short
 8 | #SBATCH -c 6
 9 | #SBATCH -t 0-12:00
10 | #SBATCH --mem 8G
11 | #SBATCH --job-name salmon_in_serial
12 | #SBATCH -o %j.out
13 | #SBATCH -e %j.err
14 | #SBATCH --mail-type=END
15 | #SBATCH --mail-user=xyz10@harvard.edu
16 | 
17 | # Load Salmon module
18 | module load salmon/1.8.0
19 | 
20 | # Change directory to where the Salmon results will be output.
21 | # Stop if it is missing so results are not written to the wrong place.
22 | cd ~/rnaseq/results/salmon || exit 1
23 | 
24 | # Main script for running salmon with for loop
25 | for fq in ~/rnaseq/raw_data/*.fq
26 | do
27 |     # If no .fq files exist the pattern stays unexpanded; skip it
28 |     [ -e "$fq" ] || continue
29 | 
30 |     # create a prefix for the output directory (file name without .fq)
31 |     samplename=$(basename "$fq" .fq)
32 | 
33 |     # run salmon; -p matches the 6 cores requested with '#SBATCH -c 6' above.
34 |     # The last option has no trailing backslash, so the command ends here.
35 |     salmon quant -i /n/groups/hbctraining/rna-seq_2019_02/reference_data/salmon_index \
36 |         -l A \
37 |         -r "$fq" \
38 |         -o "${samplename}_salmon" \
39 |         --seqBias \
40 |         --useVBOpt \
41 |         --validateMappings \
42 |         -p 6
43 | done
44 | 


--------------------------------------------------------------------------------
/assets/css/style.scss:
--------------------------------------------------------------------------------
1 | ---
2 | ---
3 | 
4 | @import "{{ site.theme }}";
5 | 
6 | .page-header { color: #fff; text-align: center; background-image: url("../images/dna-sequence-1600x800.jpg"); }
7 | 
8 | .main-content h1, .main-content h2, .main-content h3, .main-content h4, .main-content h5, .main-content h6 { margin-top: 2rem; margin-bottom: 1rem; font-weight: normal; color: #000000; }
9 | 


--------------------------------------------------------------------------------
/assets/images/dna-sequence-1600x800.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/assets/images/dna-sequence-1600x800.jpg


--------------------------------------------------------------------------------
/data/exp_design_table.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/data/exp_design_table.xlsx


--------------------------------------------------------------------------------
/fastqc/Icons/error.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/fastqc/Icons/error.png


--------------------------------------------------------------------------------
/fastqc/Icons/fastqc_icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/fastqc/Icons/fastqc_icon.png


--------------------------------------------------------------------------------
/fastqc/Icons/tick.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/fastqc/Icons/tick.png


--------------------------------------------------------------------------------
/fastqc/Icons/warning.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/fastqc/Icons/warning.png


--------------------------------------------------------------------------------
/fastqc/Images/duplication_levels copy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/fastqc/Images/duplication_levels copy.png


--------------------------------------------------------------------------------
/fastqc/Images/duplication_levels.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/fastqc/Images/duplication_levels.png


--------------------------------------------------------------------------------
/fastqc/Images/kmer_profiles.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/fastqc/Images/kmer_profiles.png


--------------------------------------------------------------------------------
/fastqc/Images/per_base_gc_content.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/fastqc/Images/per_base_gc_content.png


--------------------------------------------------------------------------------
/fastqc/Images/per_base_n_content.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/fastqc/Images/per_base_n_content.png


--------------------------------------------------------------------------------
/fastqc/Images/per_base_quality.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/fastqc/Images/per_base_quality.png


--------------------------------------------------------------------------------
/fastqc/Images/per_base_sequence_content.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/fastqc/Images/per_base_sequence_content.png


--------------------------------------------------------------------------------
/fastqc/Images/per_sequence_gc_content.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/fastqc/Images/per_sequence_gc_content.png


--------------------------------------------------------------------------------
/fastqc/Images/per_sequence_quality.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/fastqc/Images/per_sequence_quality.png


--------------------------------------------------------------------------------
/fastqc/Images/sequence_length_distribution.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/fastqc/Images/sequence_length_distribution.png


--------------------------------------------------------------------------------
/fastqc/Mov10oe_1-fastqc_report.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Strict//EN">
  2 | <html>
  3 | <head><title>Mov10_oe_1.fastq FastQC Report</title>
  4 | 
  5 | <style type="text/css">
  6 | 
  7 |  @media screen {
  8 |   div.summary {
  9 |     width: 18em;
 10 |     position:fixed;
 11 |     top: 3em;
 12 |     margin:1em 0 0 1em;
 13 |   }
 14 |   
 15 |   div.main {
 16 |     display:block;
 17 |     position:absolute;
 18 |     overflow:auto;
 19 |     height:auto;
 20 |     width:auto;
 21 |     top:4.5em;
 22 |     bottom:2.3em;
 23 |     left:18em;
 24 |     right:0;
 25 |     border-left: 1px solid #CCC;
 26 |     padding:0 0 0 1em;
 27 |     background-color: white;
 28 |     z-index:1;
 29 |   }
 30 |   
 31 |   div.header {
 32 |     background-color: #EEE;
 33 |     border:0;
 34 |     margin:0;
 35 |     padding: 0.5em;
 36 |     font-size: 200%;
 37 |     font-weight: bold;
 38 |     position:fixed;
 39 |     width:100%;
 40 |     top:0;
 41 |     left:0;
 42 |     z-index:2;
 43 |   }
 44 | 
 45 |   div.footer {
 46 |     background-color: #EEE;
 47 |     border:0;
 48 |     margin:0;
 49 | 	padding:0.5em;
 50 |     height: 1.3em;
 51 | 	overflow:hidden;
 52 |     font-size: 100%;
 53 |     font-weight: bold;
 54 |     position:fixed;
 55 |     bottom:0;
 56 |     width:100%;
 57 |     z-index:2;
 58 |   }
 59 |   
 60 |   img.indented {
 61 |     margin-left: 3em;
 62 |   }
 63 |  }
 64 |  
 65 |  @media print {
 66 | 	img {
 67 | 		max-width:100% !important;
 68 | 		page-break-inside: avoid;
 69 | 	}
 70 | 	h2, h3 {
 71 | 		page-break-after: avoid;
 72 | 	}
 73 | 	div.header {
 74 |       background-color: #FFF;
 75 |     }
 76 | 	
 77 |  }
 78 |  
 79 |  body {    
 80 |   font-family: sans-serif;   
 81 |   color: #000;   
 82 |   background-color: #FFF;
 83 |   border: 0;
 84 |   margin: 0;
 85 |   padding: 0;
 86 |   }
 87 |   
 88 |   div.header {
 89 |   border:0;
 90 |   margin:0;
 91 |   padding: 0.5em;
 92 |   font-size: 200%;
 93 |   font-weight: bold;
 94 |   width:100%;
 95 |   }    
 96 |   
 97 |   #header_title {
 98 |   display:inline-block;
 99 |   float:left;
100 |   clear:left;
101 |   }
102 |   #header_filename {
103 |   display:inline-block;
104 |   float:right;
105 |   clear:right;
106 |   font-size: 50%;
107 |   margin-right:2em;
108 |   text-align: right;
109 |   }
110 | 
111 |   div.header h3 {
112 |   font-size: 50%;
113 |   margin-bottom: 0;
114 |   }
115 |   
116 |   div.summary ul {
117 |   padding-left:0;
118 |   list-style-type:none;
119 |   }
120 |   
121 |   div.summary ul li img {
122 |   margin-bottom:-0.5em;
123 |   margin-top:0.5em;
124 |   }
125 | 	  
126 |   div.main {
127 |   background-color: white;
128 |   }
129 |       
130 |   div.module {
131 |   padding-bottom:1.5em;
132 |   padding-top:1.5em;
133 |   }
134 | 	  
135 |   div.footer {
136 |   background-color: #EEE;
137 |   border:0;
138 |   margin:0;
139 |   padding: 0.5em;
140 |   font-size: 100%;
141 |   font-weight: bold;
142 |   width:100%;
143 |   }
144 | 
145 | 
146 |   a {
147 |   color: #000080;
148 |   }
149 | 
150 |   a:hover {
151 |   color: #800000;
152 |   }
153 |       
154 |   h2 {
155 |   color: #800000;
156 |   padding-bottom: 0;
157 |   margin-bottom: 0;
158 |   clear:left;
159 |   }
160 | 
161 |   table { 
162 |   margin-left: 3em;
163 |   text-align: center;
164 |   }
165 |   
166 |   th { 
167 |   text-align: center;
168 |   background-color: #000080;
169 |   color: #FFF;
170 |   padding: 0.4em;
171 |   }      
172 |   
173 |   td { 
174 |   font-family: monospace; 
175 |   text-align: left;
176 |   background-color: #EEE;
177 |   color: #000;
178 |   padding: 0.4em;
179 |   }
180 | 
181 |   img {
182 |   padding-top: 0;
183 |   margin-top: 0;
184 |   border-top: 0;
185 |   }
186 | 
187 |   
188 |   p {
189 |   padding-top: 0;
190 |   margin-top: 0;
191 |   }
192 |   
193 | </style>
194 | 
195 | </head>
196 | <body>
197 | <div class="header">
198 | <div id="header_title"><img src="Icons/fastqc_icon.png" alt="FastQC">FastQC Report</div>
199 | <div id="header_filename">
200 | Wed 30 Sep 2015<br />
201 | Mov10_oe_1.fastq
202 | </div>
203 | </div>
204 | <div class="summary">
205 | <h2>Summary</h2>
206 | <ul>
207 | <li><img src="Icons/tick.png" alt="[PASS]"> <a href="#M0">Basic Statistics</a></li>
208 | <li><img src="Icons/tick.png" alt="[PASS]"> <a href="#M1">Per base sequence quality</a></li>
209 | <li><img src="Icons/tick.png" alt="[PASS]"> <a href="#M2">Per sequence quality scores</a></li>
210 | <li><img src="Icons/error.png" alt="[FAIL]"> <a href="#M3">Per base sequence content</a></li>
211 | <li><img src="Icons/error.png" alt="[FAIL]"> <a href="#M4">Per base GC content</a></li>
212 | <li><img src="Icons/tick.png" alt="[PASS]"> <a href="#M5">Per sequence GC content</a></li>
213 | <li><img src="Icons/tick.png" alt="[PASS]"> <a href="#M6">Per base N content</a></li>
214 | <li><img src="Icons/tick.png" alt="[PASS]"> <a href="#M7">Sequence Length Distribution</a></li>
215 | <li><img src="Icons/error.png" alt="[FAIL]"> <a href="#M8">Sequence Duplication Levels</a></li>
216 | <li><img src="Icons/tick.png" alt="[PASS]"> <a href="#M9">Overrepresented sequences</a></li>
217 | <li><img src="Icons/warning.png" alt="[WARNING]"> <a href="#M10">Kmer Content</a></li>
218 | </ul>
219 | </div>
220 | <div class="main">
221 | <div class="module"><h2 id="M0"><img src="Icons/tick.png" alt="[OK]"> Basic Statistics</h2>
222 | <table>
223 | <tr>
224 | <th>Measure</th>
225 | <th>Value</th>
226 | </tr>
227 | <tr>
228 | <td>Filename</td>
229 | <td>Mov10_oe_1.fastq</td>
230 | </tr>
231 | <tr>
232 | <td>File type</td>
233 | <td>Conventional base calls</td>
234 | </tr>
235 | <tr>
236 | <td>Encoding</td>
237 | <td>Sanger / Illumina 1.9</td>
238 | </tr>
239 | <tr>
240 | <td>Total Sequences</td>
241 | <td>39971841</td>
242 | </tr>
243 | <tr>
244 | <td>Filtered Sequences</td>
245 | <td>0</td>
246 | </tr>
247 | <tr>
248 | <td>Sequence length</td>
249 | <td>100</td>
250 | </tr>
251 | <tr>
252 | <td>%GC</td>
253 | <td>47</td>
254 | </tr>
255 | </table>
256 | </div>
257 | <div class="module"><h2 id="M1"><img src="Icons/tick.png" alt="[OK]"> Per base sequence quality</h2>
258 | <p><img class="indented" src="Images/per_base_quality.png" alt="Per base quality graph"></p>
259 | </div>
260 | <div class="module"><h2 id="M2"><img src="Icons/tick.png" alt="[OK]"> Per sequence quality scores</h2>
261 | <p><img class="indented" src="Images/per_sequence_quality.png" alt="Per Sequence quality graph"></p>
262 | </div>
263 | <div class="module"><h2 id="M3"><img src="Icons/error.png" alt="[FAIL]"> Per base sequence content</h2>
264 | <p><img class="indented" src="Images/per_base_sequence_content.png" alt="Per base sequence content"></p>
265 | </div>
266 | <div class="module"><h2 id="M4"><img src="Icons/error.png" alt="[FAIL]"> Per base GC content</h2>
267 | <p><img class="indented" src="Images/per_base_gc_content.png" alt="Per base GC content graph"></p>
268 | </div>
269 | <div class="module"><h2 id="M5"><img src="Icons/tick.png" alt="[OK]"> Per sequence GC content</h2>
270 | <p><img class="indented" src="Images/per_sequence_gc_content.png" alt="Per sequence GC content graph"></p>
271 | </div>
272 | <div class="module"><h2 id="M6"><img src="Icons/tick.png" alt="[OK]"> Per base N content</h2>
273 | <p><img class="indented" src="Images/per_base_n_content.png" alt="N content graph"></p>
274 | </div>
275 | <div class="module"><h2 id="M7"><img src="Icons/tick.png" alt="[OK]"> Sequence Length Distribution</h2>
276 | <p><img class="indented" src="Images/sequence_length_distribution.png" alt="Sequence length distribution"></p>
277 | </div>
278 | <div class="module"><h2 id="M8"><img src="Icons/error.png" alt="[FAIL]"> Sequence Duplication Levels</h2>
279 | <p><img class="indented" src="Images/duplication_levels.png" alt="Duplication level graph"></p>
280 | </div>
281 | <div class="module"><h2 id="M9"><img src="Icons/tick.png" alt="[OK]"> Overrepresented sequences</h2>
282 | <p>No overrepresented sequences</p>
283 | </div>
284 | <div class="module"><h2 id="M10"><img src="Icons/warning.png" alt="[WARN]"> Kmer Content</h2>
285 | <p><img class="indented" src="Images/kmer_profiles.png" alt="Kmer graph"></p>
286 | <table>
287 | <tr>
288 | <th>Sequence</th>
289 | <th>Count</th>
290 | <th>Obs/Exp Overall</th>
291 | <th>Obs/Exp Max</th>
292 | <th>Max Obs/Exp Position</th>
293 | </tr>
294 | <tr>
295 | <td>AAAAA</td>
296 | <td>16795015</td>
297 | <td>4.1748657</td>
298 | <td>6.059911</td>
299 | <td>2</td>
300 | </tr>
301 | <tr>
302 | <td>CTGGG</td>
303 | <td>8376590</td>
304 | <td>2.5479658</td>
305 | <td>6.4841547</td>
306 | <td>1</td>
307 | </tr>
308 | <tr>
309 | <td>TTCTT</td>
310 | <td>12161990</td>
311 | <td>2.543816</td>
312 | <td>5.0711346</td>
313 | <td>6</td>
314 | </tr>
315 | <tr>
316 | <td>TCTTC</td>
317 | <td>10938540</td>
318 | <td>2.529536</td>
319 | <td>5.185696</td>
320 | <td>7</td>
321 | </tr>
322 | <tr>
323 | <td>CTTCT</td>
324 | <td>10885845</td>
325 | <td>2.5173504</td>
326 | <td>5.0255194</td>
327 | <td>1</td>
328 | </tr>
329 | <tr>
330 | <td>CTCCA</td>
331 | <td>8804215</td>
332 | <td>2.377327</td>
333 | <td>8.132946</td>
334 | <td>1</td>
335 | </tr>
336 | <tr>
337 | <td>GGCAG</td>
338 | <td>7360785</td>
339 | <td>2.3646495</td>
340 | <td>9.048611</td>
341 | <td>1</td>
342 | </tr>
343 | <tr>
344 | <td>TCCAG</td>
345 | <td>8433860</td>
346 | <td>2.3336692</td>
347 | <td>5.7122536</td>
348 | <td>7</td>
349 | </tr>
350 | <tr>
351 | <td>CTCCT</td>
352 | <td>8862290</td>
353 | <td>2.2658334</td>
354 | <td>6.7507205</td>
355 | <td>1</td>
356 | </tr>
357 | <tr>
358 | <td>CAGGA</td>
359 | <td>7528755</td>
360 | <td>2.2545867</td>
361 | <td>5.9135337</td>
362 | <td>1</td>
363 | </tr>
364 | <tr>
365 | <td>CTTCA</td>
366 | <td>9153500</td>
367 | <td>2.235553</td>
368 | <td>6.1804776</td>
369 | <td>1</td>
370 | </tr>
371 | <tr>
372 | <td>CCCAG</td>
373 | <td>7216920</td>
374 | <td>2.207828</td>
375 | <td>6.119858</td>
376 | <td>1</td>
377 | </tr>
378 | <tr>
379 | <td>GCCAG</td>
380 | <td>6455370</td>
381 | <td>2.023714</td>
382 | <td>6.2788043</td>
383 | <td>1</td>
384 | </tr>
385 | <tr>
386 | <td>CTGCA</td>
387 | <td>7241750</td>
388 | <td>2.0038092</td>
389 | <td>5.1436768</td>
390 | <td>1</td>
391 | </tr>
392 | <tr>
393 | <td>CTTGG</td>
394 | <td>6897085</td>
395 | <td>1.8517264</td>
396 | <td>5.505673</td>
397 | <td>1</td>
398 | </tr>
399 | <tr>
400 | <td>CTGGA</td>
401 | <td>6511845</td>
402 | <td>1.8464246</td>
403 | <td>6.7122235</td>
404 | <td>1</td>
405 | </tr>
406 | <tr>
407 | <td>CTCAG</td>
408 | <td>6449570</td>
409 | <td>1.7846115</td>
410 | <td>7.22948</td>
411 | <td>1</td>
412 | </tr>
413 | <tr>
414 | <td>CTTTT</td>
415 | <td>8479045</td>
416 | <td>1.7734871</td>
417 | <td>5.9101095</td>
418 | <td>1</td>
419 | </tr>
420 | <tr>
421 | <td>TTTCA</td>
422 | <td>7934210</td>
423 | <td>1.7526736</td>
424 | <td>5.187293</td>
425 | <td>6</td>
426 | </tr>
427 | <tr>
428 | <td>TTCAG</td>
429 | <td>6830700</td>
430 | <td>1.7095337</td>
431 | <td>5.021524</td>
432 | <td>7</td>
433 | </tr>
434 | <tr>
435 | <td>CTTGA</td>
436 | <td>5609765</td>
437 | <td>1.4039677</td>
438 | <td>5.2207584</td>
439 | <td>1</td>
440 | </tr>
441 | <tr>
442 | <td>CTCAT</td>
443 | <td>4925100</td>
444 | <td>1.2028538</td>
445 | <td>5.1273108</td>
446 | <td>1</td>
447 | </tr>
448 | <tr>
449 | <td>CTCAA</td>
450 | <td>4489260</td>
451 | <td>1.1579475</td>
452 | <td>5.334822</td>
453 | <td>1</td>
454 | </tr>
455 | </table>
456 | </div>
457 | </div><div class="footer">Produced by <a href="http://www.bioinformatics.babraham.ac.uk/projects/fastqc/">FastQC</a> (version 0.10.1)</div>
458 | </body></html>


--------------------------------------------------------------------------------
/img/FastQC_contam.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/FastQC_contam.png


--------------------------------------------------------------------------------
/img/FastQC_seq_qual.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/FastQC_seq_qual.png


--------------------------------------------------------------------------------
/img/FileZilla_click.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/FileZilla_click.gif


--------------------------------------------------------------------------------
/img/FileZilla_information.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/FileZilla_information.gif


--------------------------------------------------------------------------------
/img/Filezilla_step1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/Filezilla_step1.png


--------------------------------------------------------------------------------
/img/Filezilla_step2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/Filezilla_step2.png


--------------------------------------------------------------------------------
/img/Gene_products.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/Gene_products.png


--------------------------------------------------------------------------------
/img/Gene_structure.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/Gene_structure.png


--------------------------------------------------------------------------------
/img/IGV_mov10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/IGV_mov10.png


--------------------------------------------------------------------------------
/img/O2_login.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/O2_login.gif


--------------------------------------------------------------------------------
/img/O2_primary-storage.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/O2_primary-storage.png


--------------------------------------------------------------------------------
/img/Overrepresented_sequences_adaptor_only.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/Overrepresented_sequences_adaptor_only.png


--------------------------------------------------------------------------------
/img/Overrepresented_sequences_homopolymers.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/Overrepresented_sequences_homopolymers.png


--------------------------------------------------------------------------------
/img/Per_base_sequence_content_bad.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/Per_base_sequence_content_bad.png


--------------------------------------------------------------------------------
/img/Per_sequence_GC_content_bad.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/Per_sequence_GC_content_bad.png


--------------------------------------------------------------------------------
/img/Per_sequence_GC_content_good.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/Per_sequence_GC_content_good.png


--------------------------------------------------------------------------------
/img/Per_sequence_quality_scores_bad.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/Per_sequence_quality_scores_bad.png


--------------------------------------------------------------------------------
/img/Per_sequence_quality_scores_good.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/Per_sequence_quality_scores_good.png


--------------------------------------------------------------------------------
/img/QC_workflow_Sept2018.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/QC_workflow_Sept2018.png


--------------------------------------------------------------------------------
/img/README.md:
--------------------------------------------------------------------------------
1 | ### All images for Session II of NGS Data Analysis Course
2 | 


--------------------------------------------------------------------------------
/img/RNA-seq_library_prep.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/RNA-seq_library_prep.png


--------------------------------------------------------------------------------
/img/RNAseqWorkflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/RNAseqWorkflow.png


--------------------------------------------------------------------------------
/img/R_screenshot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/R_screenshot.png


--------------------------------------------------------------------------------
/img/R_screenshot2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/R_screenshot2.png


--------------------------------------------------------------------------------
/img/Rstudio_interface.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/Rstudio_interface.png


--------------------------------------------------------------------------------
/img/SAM_file.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/SAM_file.png


--------------------------------------------------------------------------------
/img/Slide1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/Slide1.jpg


--------------------------------------------------------------------------------
/img/alignment_STAR_step1.ai:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/alignment_STAR_step1.ai


--------------------------------------------------------------------------------
/img/alignment_STAR_step1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/alignment_STAR_step1.png


--------------------------------------------------------------------------------
/img/alignment_STAR_step2.ai:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/alignment_STAR_step2.ai


--------------------------------------------------------------------------------
/img/alignment_STAR_step2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/alignment_STAR_step2.png


--------------------------------------------------------------------------------
/img/alignment_STAR_step3.ai:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/alignment_STAR_step3.ai


--------------------------------------------------------------------------------
/img/alignment_STAR_step3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/alignment_STAR_step3.png


--------------------------------------------------------------------------------
/img/alignment_STAR_step4.ai:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/alignment_STAR_step4.ai


--------------------------------------------------------------------------------
/img/alignment_STAR_step4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/alignment_STAR_step4.png


--------------------------------------------------------------------------------
/img/alignment_STAR_step5.ai:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/alignment_STAR_step5.ai


--------------------------------------------------------------------------------
/img/alignment_STAR_step5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/alignment_STAR_step5.png


--------------------------------------------------------------------------------
/img/alignmentfree_workflow_aug2017.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/alignmentfree_workflow_aug2017.png


--------------------------------------------------------------------------------
/img/alignmentfree_workflow_june2017.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/alignmentfree_workflow_june2017.png


--------------------------------------------------------------------------------
/img/bad_quality.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/bad_quality.png


--------------------------------------------------------------------------------
/img/base_calling.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/base_calling.png


--------------------------------------------------------------------------------
/img/batch_effect.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/batch_effect.png


--------------------------------------------------------------------------------
/img/batch_effect_pca.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/batch_effect_pca.png


--------------------------------------------------------------------------------
/img/bioconductor_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/bioconductor_logo.png


--------------------------------------------------------------------------------
/img/bitwiseflags.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/bitwiseflags.png


--------------------------------------------------------------------------------
/img/cigar_strings.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/cigar_strings.png


--------------------------------------------------------------------------------
/img/clonal_amplification.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/clonal_amplification.png


--------------------------------------------------------------------------------
/img/cluster_generation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/cluster_generation.png


--------------------------------------------------------------------------------
/img/complete_wd_setup.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/complete_wd_setup.png


--------------------------------------------------------------------------------
/img/confounded_batch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/confounded_batch.png


--------------------------------------------------------------------------------
/img/confounded_design.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/confounded_design.png


--------------------------------------------------------------------------------
/img/console.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/console.png


--------------------------------------------------------------------------------
/img/corr_map.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/corr_map.png


--------------------------------------------------------------------------------
/img/count-fig1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/count-fig1.png


--------------------------------------------------------------------------------
/img/count-fig2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/count-fig2.png


--------------------------------------------------------------------------------
/img/count-matrix.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/count-matrix.png


--------------------------------------------------------------------------------
/img/count_matrix.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/count_matrix.png


--------------------------------------------------------------------------------
/img/counts-workflow.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/counts-workflow.jpg


--------------------------------------------------------------------------------
/img/counts-workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/counts-workflow.png


--------------------------------------------------------------------------------
/img/counts_view.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/counts_view.png


--------------------------------------------------------------------------------
/img/cran_packages.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/cran_packages.png


--------------------------------------------------------------------------------
/img/data-lifecycle-base.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/data-lifecycle-base.png


--------------------------------------------------------------------------------
/img/data_life_cycle_gouldv2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/data_life_cycle_gouldv2.png


--------------------------------------------------------------------------------
/img/de_norm_counts_var.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/de_norm_counts_var.png


--------------------------------------------------------------------------------
/img/de_replicates_img.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/de_replicates_img.png


--------------------------------------------------------------------------------
/img/de_replicates_img2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/de_replicates_img2.png


--------------------------------------------------------------------------------
/img/de_variation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/de_variation.png


--------------------------------------------------------------------------------
/img/demultiplexing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/demultiplexing.png


--------------------------------------------------------------------------------
/img/drawings.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/drawings.pptx


--------------------------------------------------------------------------------
/img/environment.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/environment.png


--------------------------------------------------------------------------------
/img/exp_design.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/exp_design.png


--------------------------------------------------------------------------------
/img/factors.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/factors.png


--------------------------------------------------------------------------------
/img/factors_both.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/factors_both.png


--------------------------------------------------------------------------------
/img/factors_new.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/factors_new.png


--------------------------------------------------------------------------------
/img/factors_sm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/factors_sm.png


--------------------------------------------------------------------------------
/img/factors_sm_intact.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/factors_sm_intact.png


--------------------------------------------------------------------------------
/img/fastqc_GC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/fastqc_GC.png


--------------------------------------------------------------------------------
/img/fastqc_basic_stats.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/fastqc_basic_stats.png


--------------------------------------------------------------------------------
/img/fastqc_duplication.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/fastqc_duplication.png


--------------------------------------------------------------------------------
/img/fastqc_over-represented_sequences.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/fastqc_over-represented_sequences.png


--------------------------------------------------------------------------------
/img/fastqc_per_base_sequence_content.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/fastqc_per_base_sequence_content.png


--------------------------------------------------------------------------------
/img/fastqc_per_sequence_quality_scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/fastqc_per_sequence_quality_scores.png


--------------------------------------------------------------------------------
/img/fastqc_results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/fastqc_results.png


--------------------------------------------------------------------------------
/img/fastqc_summary.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/fastqc_summary.png


--------------------------------------------------------------------------------
/img/feature-overlap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/feature-overlap.png


--------------------------------------------------------------------------------
/img/filezilla_login.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/filezilla_login.png


--------------------------------------------------------------------------------
/img/filezilla_setup.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/filezilla_setup.png


--------------------------------------------------------------------------------
/img/flow_cell_oligos.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/flow_cell_oligos.png


--------------------------------------------------------------------------------
/img/flow_cells.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/flow_cells.png


--------------------------------------------------------------------------------
/img/full_workflow_2019.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/full_workflow_2019.png


--------------------------------------------------------------------------------
/img/full_workflow_Sept2018.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/full_workflow_Sept2018.png


--------------------------------------------------------------------------------
/img/full_workflow_qualimap_2019.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/full_workflow_qualimap_2019.png


--------------------------------------------------------------------------------
/img/gProfiler.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/gProfiler.png


--------------------------------------------------------------------------------
/img/gene_expression2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/gene_expression2.png


--------------------------------------------------------------------------------
/img/gene_expression_cells.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/gene_expression_cells.png


--------------------------------------------------------------------------------
/img/genemania.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/genemania.png


--------------------------------------------------------------------------------
/img/getwd.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/getwd.png


--------------------------------------------------------------------------------
/img/good_quality.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/good_quality.png


--------------------------------------------------------------------------------
/img/gvng.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/gvng.jpg


--------------------------------------------------------------------------------
/img/igv_screenshot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/igv_screenshot.png


--------------------------------------------------------------------------------
/img/illumina_platforms.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/illumina_platforms.png


--------------------------------------------------------------------------------
/img/illumina_sequencing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/illumina_sequencing.png


--------------------------------------------------------------------------------
/img/illumina_sequencing_process.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/illumina_sequencing_process.png


--------------------------------------------------------------------------------
/img/libraryprep_step1-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/libraryprep_step1-2.png


--------------------------------------------------------------------------------
/img/libraryprep_step3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/libraryprep_step3.png


--------------------------------------------------------------------------------
/img/libraryprep_step4-5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/libraryprep_step4-5.png


--------------------------------------------------------------------------------
/img/libraryprep_step6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/libraryprep_step6.png


--------------------------------------------------------------------------------
/img/long_read_tech.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/long_read_tech.png


--------------------------------------------------------------------------------
/img/metadata_batch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/metadata_batch.png


--------------------------------------------------------------------------------
/img/multiqc_GC_content.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/multiqc_GC_content.png


--------------------------------------------------------------------------------
/img/multiqc_alignment_scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/multiqc_alignment_scores.png


--------------------------------------------------------------------------------
/img/multiqc_alignment_scores1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/multiqc_alignment_scores1.png


--------------------------------------------------------------------------------
/img/multiqc_columns.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/multiqc_columns.png


--------------------------------------------------------------------------------
/img/multiqc_coverage_profile.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/multiqc_coverage_profile.png


--------------------------------------------------------------------------------
/img/multiqc_coverage_profile1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/multiqc_coverage_profile1.png


--------------------------------------------------------------------------------
/img/multiqc_duplicates.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/multiqc_duplicates.png


--------------------------------------------------------------------------------
/img/multiqc_table.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/multiqc_table.png


--------------------------------------------------------------------------------
/img/multiqc_table1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/multiqc_table1.png


--------------------------------------------------------------------------------
/img/multithreaded_hpc_3samples.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/multithreaded_hpc_3samples.png


--------------------------------------------------------------------------------
/img/non_confounded_design.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/non_confounded_design.png


--------------------------------------------------------------------------------
/img/paired-end_data.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/paired-end_data.png


--------------------------------------------------------------------------------
/img/paired_end_reads.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/paired_end_reads.png


--------------------------------------------------------------------------------
/img/pca_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/pca_plot.png


--------------------------------------------------------------------------------
/img/permission-directory.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/permission-directory.png


--------------------------------------------------------------------------------
/img/placeholder.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/placeholder.png


--------------------------------------------------------------------------------
/img/pseudo_count_comparison-cufflinks.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/pseudo_count_comparison-cufflinks.png


--------------------------------------------------------------------------------
/img/pseudo_count_comparison-sailfish.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/pseudo_count_comparison-sailfish.png


--------------------------------------------------------------------------------
/img/pseudo_count_comparison-sailfish_sm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/pseudo_count_comparison-sailfish_sm.png


--------------------------------------------------------------------------------
/img/pseudo_count_comparison-star.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/pseudo_count_comparison-star.png


--------------------------------------------------------------------------------
/img/pseudo_count_comparison-star_sm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/pseudo_count_comparison-star_sm.png


--------------------------------------------------------------------------------
/img/pseudo_count_comparison.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/pseudo_count_comparison.gif


--------------------------------------------------------------------------------
/img/pseudo_count_comparison.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/pseudo_count_comparison.png


--------------------------------------------------------------------------------
/img/qc_cycles_lost.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/qc_cycles_lost.png


--------------------------------------------------------------------------------
/img/qc_manifold_burst.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/qc_manifold_burst.png


--------------------------------------------------------------------------------
/img/qc_overclustering.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/qc_overclustering.png


--------------------------------------------------------------------------------
/img/qc_phasing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/qc_phasing.png


--------------------------------------------------------------------------------
/img/qc_read2_failed.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/qc_read2_failed.png


--------------------------------------------------------------------------------
/img/qc_signal_decay.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/qc_signal_decay.png


--------------------------------------------------------------------------------
/img/qc_troubleshooting.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/qc_troubleshooting.png


--------------------------------------------------------------------------------
/img/qualimap_coverage_profile.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/qualimap_coverage_profile.png


--------------------------------------------------------------------------------
/img/qualimap_genomic _origin.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/qualimap_genomic _origin.png


--------------------------------------------------------------------------------
/img/qualimap_genomic_feature.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/qualimap_genomic_feature.png


--------------------------------------------------------------------------------
/img/qualimap_genomic_origin.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/qualimap_genomic_origin.png


--------------------------------------------------------------------------------
/img/qualimap_genomic_origin1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/qualimap_genomic_origin1.png


--------------------------------------------------------------------------------
/img/qualimap_junctions.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/qualimap_junctions.png


--------------------------------------------------------------------------------
/img/qualimap_read_alignment.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/qualimap_read_alignment.png


--------------------------------------------------------------------------------
/img/qualimap_transcript_coverage.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/qualimap_transcript_coverage.png


--------------------------------------------------------------------------------
/img/r_starting_how_it_should_like.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/r_starting_how_it_should_like.png


--------------------------------------------------------------------------------
/img/replicates.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/replicates.png


--------------------------------------------------------------------------------
/img/rnaseq_salmon_workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/rnaseq_salmon_workflow.png


--------------------------------------------------------------------------------
/img/rnaseq_workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/rnaseq_workflow.png


--------------------------------------------------------------------------------
/img/rnaseq_workflow_FASTQC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/rnaseq_workflow_FASTQC.png


--------------------------------------------------------------------------------
/img/rnaseq_workflow_trimming.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/rnaseq_workflow_trimming.png


--------------------------------------------------------------------------------
/img/rrna.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/rrna.png


--------------------------------------------------------------------------------
/img/rstudio_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/rstudio_logo.png


--------------------------------------------------------------------------------
/img/salmon_plot_multiqc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/salmon_plot_multiqc.png


--------------------------------------------------------------------------------
/img/salmon_plot_multiqc1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/salmon_plot_multiqc1.png


--------------------------------------------------------------------------------
/img/salmon_quasialignment.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/salmon_quasialignment.png


--------------------------------------------------------------------------------
/img/salmon_rstudio.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/salmon_rstudio.png


--------------------------------------------------------------------------------
/img/salmon_workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/salmon_workflow.png


--------------------------------------------------------------------------------
/img/salmon_workflow_subset.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/salmon_workflow_subset.png


--------------------------------------------------------------------------------
/img/sam_bam.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/sam_bam.png


--------------------------------------------------------------------------------
/img/sam_bam2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/sam_bam2.png


--------------------------------------------------------------------------------
/img/sam_bam3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/sam_bam3.png


--------------------------------------------------------------------------------
/img/sbs_image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/sbs_image.png


--------------------------------------------------------------------------------
/img/scratch3_best-practice.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/scratch3_best-practice.png


--------------------------------------------------------------------------------
/img/scratch_recommended_practice.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/scratch_recommended_practice.png


--------------------------------------------------------------------------------
/img/serial_hpc_3samples.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/serial_hpc_3samples.png


--------------------------------------------------------------------------------
/img/star.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/star.png


--------------------------------------------------------------------------------
/img/teachin-team.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/teachin-team.png


--------------------------------------------------------------------------------
/img/union.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/union.png


--------------------------------------------------------------------------------
/img/vim_insert.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/vim_insert.png


--------------------------------------------------------------------------------
/img/vim_postsave.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/vim_postsave.png


--------------------------------------------------------------------------------
/img/vim_quit.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/vim_quit.png


--------------------------------------------------------------------------------
/img/vim_save.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/vim_save.png


--------------------------------------------------------------------------------
/img/vim_spider.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/vim_spider.png


--------------------------------------------------------------------------------
/img/vim_spider_number.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/vim_spider_number.png


--------------------------------------------------------------------------------
/img/why_R.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/why_R.png


--------------------------------------------------------------------------------
/img/workflow_align_qualimap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/workflow_align_qualimap.png


--------------------------------------------------------------------------------
/img/workflow_alignment.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/workflow_alignment.png


--------------------------------------------------------------------------------
/img/workflow_salmon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/workflow_salmon.png


--------------------------------------------------------------------------------
/img/wrap_option.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/img/wrap_option.png


--------------------------------------------------------------------------------
/lectures/2_day/HPC_intro_O2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/lectures/2_day/HPC_intro_O2.pdf


--------------------------------------------------------------------------------
/lectures/2_day/Intro_to_workshop.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/lectures/2_day/Intro_to_workshop.pdf


--------------------------------------------------------------------------------
/lectures/2_day/RNAseq-analysis-methods.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/lectures/2_day/RNAseq-analysis-methods.pdf


--------------------------------------------------------------------------------
/lectures/2_day/Wrap_up.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/lectures/2_day/Wrap_up.pdf


--------------------------------------------------------------------------------
/lectures/2_day/rna-seq_design.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/lectures/2_day/rna-seq_design.pdf


--------------------------------------------------------------------------------
/lectures/Intro_to_workshop.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/lectures/Intro_to_workshop.pdf


--------------------------------------------------------------------------------
/lectures/RNA-seq_troubleshooting.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/lectures/RNA-seq_troubleshooting.pdf


--------------------------------------------------------------------------------
/lectures/alignment_quantification.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/lectures/alignment_quantification.pdf


--------------------------------------------------------------------------------
/lectures/expression_quantification.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/lectures/expression_quantification.pdf


--------------------------------------------------------------------------------
/lectures/workflow_overview.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/lectures/workflow_overview.pdf


--------------------------------------------------------------------------------
/lectures/workshop_intro_slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/lectures/workshop_intro_slides.pdf


--------------------------------------------------------------------------------
/lectures/workshop_wrapup.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/lectures/workshop_wrapup.pdf


--------------------------------------------------------------------------------
/lectures/workshop_wrapup_slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-bulk-RNAseq/fa4bf4dbac6ac549f63932124195bf19718475e9/lectures/workshop_wrapup_slides.pdf


--------------------------------------------------------------------------------
/lessons/04a_data_organization.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: Data Management and Project Organization
 3 | author: Mary Piper, Meeta Mistry, Michael Steinbaugh, Radhika Khetani, Jihe Liu
 4 | date: November 9, 2020
 5 | duration: 35
 6 | ---
 7 | 
 8 | [Data management slides]: https://github.com/hbc/NGS-Data-Analysis-long-course/raw/Fall_2016/sessionI/slides/data_management.pdf
 9 | 
10 | ## Learning Objectives
11 | 
12 | - Explain the need for data management.
13 | - Describe strategies for managing your own projects.
14 | 
15 | 
16 | ## What is data management?
17 | 
18 | In this course we teach you how to independently analyze your own sequencing data, so naturally we should begin with what to do once you get the raw data back from the sequencing facility. Obviously, we begin with the analysis! Right?
19 | 
20 | Wrong. 
21 | 
22 | One of the most important parts of research that involves large amounts of data is how best to manage it. Once data is generated we tend to prioritize the analysis. **In the excitement to get a first look at new data, there are many important aspects that are often overlooked.**
23 | 
24 | Wait, don't leave this page just yet!
25 | 
26 | We know that data management can be hard to get excited about. However, **ignoring it can be detrimental to your research.** Here are just a few reasons **why data management should matter to you**:
27 | 
28 | * It will make your life easier. It's easier to analyze organized and well-documented data.
29 | * Your future self will thank you. Managing well from the get-go means it's easier to retrieve at a later date.
30 | * Data is precious. Thinking ahead about things like storage means you reduce risk of losing it.
31 | * Funding agencies are increasingly mandating that research projects are developed with a data management plan.
32 | 
33 | ### Data Lifecycle
34 | The data lifecycle displayed below, courtesy of the [HMS Data Management Working Group](https://datamanagement.hms.harvard.edu/), illustrates some things to consider beyond data creation and analysis. Below, we discuss components of the lifecycle and how they apply to any NGS experiment.
35 | 
36 | <p align="center">
37 | <img src="../img/data-lifecycle-base.png" width="600">
38 | </p>
39 | 
40 | _Image acquired from the [Harvard Biomedical Data Management Website](https://datamanagement.hms.harvard.edu/data-lifecycle)_
41 | 
42 | ### Plan and Design
43 | You should approach your sequencing project in much the same way you would approach any biological experiment; ideally, it begins with a good **experimental design**. You want to think about the experiment at the outset and collect appropriate samples such that you have enough statistical power to make the comparisons you need. In a later lesson, we delve more into the details of planning and the experimental design considerations. Planning for your computational work is just as important as planning when working on the bench. Every computational analysis you do is going to spawn many files and you will want to think about short-term storage options for your data and computational resources for analyzing it. 
44 | 
45 | ### Collect and Create
46 | The next step is preparing samples as required. During this stage it is important to keep track of how the experiment was performed, making sure to clearly document the source of starting materials and kits used. It is also best practice to include any information about any small variations within the experiment (across the different samples being prepared) or any changes relative to standard experiment protocols. This collection of information serves as the **metadata of the experiment** which will prove to be very useful during the analysis stage. 
47 | 
48 | ### Analyze and Collaborate
49 | Once you have the sequencing data back from the sequencing facility, it's time to analyze it. The process of data analysis should be well documented to ensure reproducibility and also for ease of collaboration. We will spend some more time on this component of the lifecycle later in class, as it applies to our dataset.
50 | 
51 | ### Evaluate and Archive
52 | When the analysis is complete you will want to think about which files are most pertinent to keep. Consider long-term storage options for your data that meet requirements of NIH, other funding agencies, and any guidelines from your institution.
53 | 
54 | ### Disseminate and share
55 | The results of your analysis will hopefully generate some exciting findings that will be beneficial to the scientific community. At this stage in the lifecycle you rely on your previous steps of documentation to turn those notes into a clear and concise methods section of your manuscript. 
56 | 
57 | ### Access and Reuse
58 | In addition to sharing information on the analysis, you should plan for sharing the data. It has become increasingly common for researchers to make their data available to others when they complete a study. While a major reason for sharing is compliance (with journals or research funding organizations), there are also important research benefits including reproducibility and data sharing and reuse.
59 | 
60 | 
61 | **Resources**
62 | 
63 | * The [HMS Data Management Working Group (DMWG)'s website](https://datamanagement.hms.harvard.edu/)
64 | * A guide from the [Harvard library](http://guides.library.harvard.edu/dmp).
65 | * **Sign-up** for the [DMWG quarterly newsletter](https://datamanagement.hms.harvard.edu/dmwg-newsletter) for helpful tips, classes and events related to data management
66 | 
67 | 
68 | ---
69 | 
70 | *This lesson has been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.*
71 | 
72 | * *The materials used in this lesson were derived from work that is Copyright © Data Carpentry (http://datacarpentry.org/). 
73 | All Data Carpentry instructional material is made available under the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0).*
74 | * *Adapted from the lesson by Tracy Teal. Original contributors: Paul Wilson, Milad Fatenejad, Sasha Wood and Radhika Khetani for Software Carpentry (http://software-carpentry.org/)*
75 | 
76 | 


--------------------------------------------------------------------------------
/lessons/04b_data_organization.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: Data Management and Project Organization - In-class
  3 | author: Mary Piper, Meeta Mistry, Michael Steinbaugh, Radhika Khetani, Jihe Liu
  4 | date: November 9, 2020
  5 | duration: 35
  6 | ---
  7 | 
  8 | [Data management slides]: https://github.com/hbc/NGS-Data-Analysis-long-course/raw/Fall_2016/sessionI/slides/data_management.pdf
  9 | [SRA]: http://www.ncbi.nlm.nih.gov/sra  "Sequence Read Archive"
 10 | 
 11 | ## Learning Objectives
 12 | 
 13 | - Describe the example RNA-seq experiment and its objectives.
 14 | - Demonstrate strategies for good data management and project organization.
 15 | 
 16 | ## The Dataset
 17 | 
 18 | The dataset we are using for this workshop is part of a larger study described in [Kenny PJ et al., *Cell Rep* 2014](http://www.ncbi.nlm.nih.gov/pubmed/25464849). The authors are investigating interactions between various genes involved in Fragile X syndrome, a disease of aberrant protein production, which results in cognitive impairment and autistic-like features. **The authors sought to show that RNA helicase MOV10 regulates the translation of RNAs involved in Fragile X syndrome.**
 19 | 
 20 | ### Raw data
 21 | 
 22 | From this study we are using the [RNA-seq](http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE50499) data which is publicly available in the [Sequence Read Archive (SRA)](https://www.ncbi.nlm.nih.gov/sra/?term=SRP029367).
 23 | 
 24 | > **NOTE:** If you are interested in how to obtain publicly available sequence data from the SRA we have some materials on this [linked here](https://hbctraining.github.io/Accessing_public_genomic_data/lessons/downloading_from_SRA.html).
 25 | 
 26 | ### Metadata
 27 | 
 28 | In addition to the raw sequence data we also need to collect **information about the data**, also known as **metadata**.  We are usually quick to want to begin analysis of the sequence data (FASTQ files), but how useful is it if we know nothing about the samples that this sequence data originated from? 
 29 | 
 30 | Some relevant metadata for our dataset is provided below:
 31 | 
 32 | * The RNA was extracted from **HEK293F cells** that were transfected with a **MOV10 transgene**, **MOV10 siRNA**, or an **irrelevant siRNA**.  (*For this workshop we won't be using the MOV10 knock down samples.*)
 33 | * The libraries for this dataset are **stranded** and were generated using the standard Tru-seq prep kit (using the dUTP method). 
 34 | * Sequencing was carried out on the **Illumina HiSeq-2500** and **100bp single end** reads were generated. 
 35 | * The full dataset was sequenced to **~40 million reads** per sample, but for this workshop we will be looking at a small subset on chr1 (~300,000 reads/sample).
 36 | * For each group we have three replicates as described in the figure below.
 37 | 
 38 | <p align="center">
 39 | <img src="../img/exp_design.png" width="700">
 40 | </p>
 41 | 
 42 | 
 43 | ## Implementing data management best practices
 44 | 
 45 | In a [previous lesson](04a_data_organization.md) we describe the data lifecycle and the **different aspects to consider when working on your own projects**. Here, we implement some of those strategies to get ourselves set up before we begin with any analysis.
 46 | 
 47 | <p align="center">
 48 | <img src="../img/data-lifecycle-base.png" width="500">
 49 | </p>
 50 | 
 51 | _Image acquired from the [Harvard Biomedical Data Management Website](https://datamanagement.hms.harvard.edu/data-lifecycle)_
 52 | 
 53 | ### Planning and organization
 54 | 
 55 | For each experiment you work on and analyze data for, it is considered best practice to get organized by creating a planned storage space (directory structure). We will start by creating a directory that we can use for the rest of the workshop. First, make sure that you are in your home directory.
 56 | 
 57 | ```bash
 58 | $ cd
 59 | $ pwd
 60 | ```
 61 | 
 62 | This should return `/home/rc_training`. Create the directory `rnaseq` and move into it.
 63 | 
 64 | ```bash
 65 | $ mkdir rnaseq
 66 | $ cd rnaseq
 67 | ```
 68 | 
 69 | Next, we will create a project directory and set up the following structure to keep our files organized. 
 70 | 
 71 | ```bash
 72 | rnaseq
 73 |   ├── logs
 74 |   ├── meta
 75 |   ├── raw_data  
 76 |   ├── results
 77 |   └── scripts
 78 | ```
 79 | 
 80 | *This is a generic structure and can be tweaked based on personal preference and the analysis workflow.*
 81 | 
 82 | - `logs`: to keep track of the commands run and the specific parameters used, but also to have a record of any standard output that is generated while running the command. 
 83 | - `meta`: for any information that describes the samples you are using, which we refer to as [metadata](https://datamanagement.hms.harvard.edu/metadata-overview). 
 84 | - `raw_data`: for any **unmodified** (raw) data obtained prior to computational analysis, e.g. FASTQ files from the sequencing center. We strongly recommend leaving this directory unmodified through the analysis.
 85 | - `results`: for output from the different tools you implement in your workflow. Create sub-folders specific to each tool/step of the workflow within this folder. 
 86 | - `scripts`: for scripts that you write and use to run analyses/workflow.
 87 | 
 88 | 
 89 | ```bash
 90 | $ mkdir logs meta raw_data results scripts
 91 | ``` 
 92 | 
 93 | > #### File naming conventions
 94 | > 
 95 | > Another aspect of staying organized is making sure that all the directories and filenames for an analysis are as consistent as possible. You want to avoid names like `alignment1.bam`, and rather have names like `20170823_kd_rep1_gmap-1.4.bam` which provide a basic level of information about the file. [This link](https://datamanagement.hms.harvard.edu/file-naming-conventions) and [this slideshow](http://www2.stat.duke.edu/~rcs46/lectures_2015/01-markdown-git/slides/naming-slides/naming-slides.pdf) have some good guidelines for file naming dos and don'ts.
 96 | 
 97 | 
 98 | ### Documentation
 99 | 
100 | In your lab notebook, you likely keep track of the different reagents and kits used for a specific protocol. Similarly, recording information about the tools used in the workflow is important for documenting your computational experiments. 
101 | 
102 | - **Make note of the software you use.** Do your research and find out what tools are best for the data you are working with. Don't just work with tools that you are able to easily install.
103 | - **Keep track of software versions.** Keep up with the literature and make sure you are using the most up-to-date versions.
104 | - **Record information on parameters used and summary statistics** at every step (e.g., how many adapters were removed, how many reads did not align)
105 |     - A general rule of thumb is to test on a single sample or a subset of the data before running your entire dataset through. This will allow you to debug quicker and give you a chance to also get a feel for the tool and the different parameters.
106 |     - Different tools have different ways of reporting log messages to the terminal. You might have to experiment a bit to figure out what output to capture. You can redirect standard output with the `>` symbol, which is equivalent to `1>` (standard out); other tools might require you to use `2>` to redirect the `standard error` instead.
107 |  
108 | #### README files
109 | 
110 | After setting up the directory structure it is useful to have a **[README file](https://datamanagement.hms.harvard.edu/readme-files) within your project directory**. This is a plain text file containing a short summary about the project and a description of the files/directories found within it. An example README is shown below. It can also be helpful to include a README within each sub-directory with any information pertaining to the analysis.
111 | 
112 | ```
113 | ## README ##
114 | ## This directory contains data generated during the Introduction to RNA-seq workshop
115 | ## Date: 
116 | 
117 | There are five subdirectories in this directory:
118 | 
119 | raw_data : contains raw data
120 | meta:  contains...
121 | logs:
122 | results:
123 | scripts:
124 | ```
125 | 
126 | *** 
127 | 
128 | **Exercise**
129 | 
130 | 1. Take a moment to create a README for the `rnaseq/` folder (hint: use `vim` to create the file). Give a short description of the project and brief descriptions of the types of files you will be storing within each of the sub-directories. 
131 | 
132 | *** 
133 | 
134 | 
135 | ### Obtaining data
136 | 
137 | Let's populate the `rnaseq/` project with some data. The FASTQ files are located on the O2 cluster in the `/n/groups` space. Copy them over from the path shown below, into your `raw_data` directory:
138 | 
139 | ```bash
140 | $ cp /n/groups/hbctraining/unix_lesson/raw_fastq/*.fq ~/rnaseq/raw_data/
141 | ```
142 | 
143 | > **NOTE**: When obtaining data from your sequencing facility, the data will not be stored on O2 and so a simple copy command (`cp`) will not suffice. The raw sequence data will likely be located on another remote computer/server that is hosted by the sequencing facility and you will be given login credentials to access it. To copy it over you can use commands like `rsync`, `wget` or `scp`. These are all commands that can help securely copy the data over to the appropriate location on O2. We have some information [linked here](more_bash_cluster.md#copying-files-to-and-from-the-cluster-) if you would like to learn more. 
144 | 
145 | Now the structure of `rnaseq/` should look like this:
146 | 
147 | ```bash
148 | rnaseq
149 |   ├── logs
150 |   ├── meta
151 |   ├── raw_data
152 |   │   ├── Irrel_kd_1.subset.fq
153 |   │   ├── Irrel_kd_2.subset.fq
154 |   │   ├── Irrel_kd_3.subset.fq
155 |   │   ├── Mov10_oe_1.subset.fq
156 |   │   ├── Mov10_oe_2.subset.fq
157 |   │   └── Mov10_oe_3.subset.fq
158 |   ├── README.txt
159 |   ├── results
160 |   └── scripts
161 | ```
162 | 
163 | Okay, we are all set to begin the analysis!
164 | 
165 | 
166 | ---
167 | 
168 | *This lesson has been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.*
169 | 
170 | * *The materials used in this lesson were derived from work that is Copyright © Data Carpentry (http://datacarpentry.org/). 
171 | All Data Carpentry instructional material is made available under the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0).*
172 | * *Adapted from the lesson by Tracy Teal. Original contributors: Paul Wilson, Milad Fatenejad, Sasha Wood and Radhika Khetani for Software Carpentry (http://software-carpentry.org/)*
173 | 
174 | 


--------------------------------------------------------------------------------
/lessons/05_qc_running_fastqc_interactively.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Quality control using FASTQC"
  3 | author: "Mary Piper, Radhika Khetani, Meeta Mistry, Jihe Liu"
  4 | date: Friday, October 30, 2020
  5 | duration: 45 minutes
  6 | ---
  7 | 
  8 | ## Learning Objectives:
  9 | 
 10 | * Describe the contents and format of a FASTQ file
 11 | * Create a quality report using FASTQC
 12 |  
 13 | ## Quality Control of FASTQ files
 14 | 
 15 | 
 16 | The first step in the RNA-Seq workflow is to take the FASTQ files received from the sequencing facility and assess the quality of the sequence reads. 
 17 | 
 18 | <p align="center">
 19 | <img src="../img/rnaseq_salmon_workflow.png" width="400">
 20 | </p>
 21 | 
 22 | ### Unmapped read data (FASTQ)
 23 | 
 24 | The [FASTQ](https://en.wikipedia.org/wiki/FASTQ_format) file format is the de facto file format for sequence reads generated from next-generation sequencing technologies. This file format evolved from FASTA in that it contains sequence data, but also contains quality information. Similar to FASTA, the FASTQ file begins with a header line. The difference is that the FASTQ header is denoted by a `@` character. For a single record (sequence read), there are four lines, each of which is described below:
 25 | 
 26 | |Line|Description|
 27 | |----|-----------|
 28 | |1|Always begins with '@', followed by information about the read|
 29 | |2|The actual DNA sequence|
 30 | |3|Always begins with a '+', and sometimes the same info as in line 1|
 31 | |4|Has a string of characters representing the quality scores; must have same number of characters as line 2|
 32 | 
 33 | Let's use the following read as an example:
 34 | 
 35 | ```
 36 | @HWI-ST330:304:H045HADXX:1:1101:1111:61397
 37 | CACTTGTAAGGGCAGGCCCCCTTCACCCTCCCGCTCCTGGGGGANNNNNNNNNNANNNCGAGGCCCTGGGGTAGAGGGNNNNNNNNNNNNNNGATCTTGG
 38 | +
 39 | @?@DDDDDDHHH?GH:?FCBGGB@C?DBEGIIIIAEF;FCGGI#########################################################
 40 | ```
 41 | 
 42 | Line 4 has characters encoding the quality of each nucleotide in the read. The legend below provides the mapping of quality scores (Phred-33) to the quality encoding characters. *Different quality encoding scales exist (differing by offset in the ASCII table), but note the most commonly used one is fastqsanger, which is the scale output by Illumina since mid-2011.* 
 43 |  ```
 44 |  Quality encoding: !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHI
 45 |                    |         |         |         |         |
 46 |     Quality score: 0........10........20........30........40                                
 47 | ```
 48 |  
 49 | Using the quality encoding character legend, the first nucleotide in the read (C) is called with a quality score of 31 (corresponding to encoding character `@`), and our Ns are called with a score of 2 (corresponding to encoding character `#`). **As you can tell by now, this is a bad read.** 
 50 | 
 51 | Each quality score represents the probability that the corresponding nucleotide call is incorrect. This quality score is logarithmically based and is calculated as:
 52 | 
 53 | 	Q = -10 x log10(P), where P is the probability that a base call is erroneous
 54 | 
 55 | These probability values are the results from the base calling algorithm and are dependent on how much signal was captured for the base incorporation. The score values can be interpreted as follows:
 56 | 
 57 | |Phred Quality Score |Probability of incorrect base call |Base call accuracy|
 58 | |:-------------------:|:---------------------------------:|:-----------------:|
 59 | |10	|1 in 10 |	90%|
 60 | |20	|1 in 100|	99%|
 61 | |30	|1 in 1000|	99.9%|
 62 | |40	|1 in 10,000|	99.99%|
 63 | 
 64 | Therefore, for the first nucleotide in the read (C), there is less than a 1 in 1000 chance that the base was called incorrectly, whereas for the end of the read there is greater than 50% probability that the base is called incorrectly.
 65 | 
 66 | ## Assessing quality with FastQC
 67 | 
 68 | Now that we understand what information is stored in a FASTQ file, the next step is to examine quality metrics for our data.
 69 | 
 70 | [FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) provides a simple way to do some quality checks on raw sequence data coming from high throughput sequencing pipelines. It provides a modular set of analyses, which you can use to obtain an impression of whether your data has any problems that you should be aware of before moving on to the next analysis.
 71 | 
 72 | FastQC does the following:
 73 | * accepts FASTQ files (or BAM files) as input
 74 | * generates summary graphs and tables to help assess your data
 75 | * generates an easy-to-view HTML-based report with the graphs and tables
 76 | 
 77 | ***
 78 | 
 79 | > NOTE: Before we run FastQC, **you should be on a compute node** in an interactive session. Please run the following `srun` command if you are not on a compute node.
 80 | > 
 81 | > ```bash
 82 | > $ srun --pty -p interactive -t 0-3:00 --mem 1G /bin/bash
 83 | > ```
 84 | >
 85 | > ***An interactive session is very useful to test tools and workflows.***
 86 | 
 87 | ### Run FastQC  
 88 | 
 89 | Change directories to `raw_data`.
 90 | 
 91 | ```bash
 92 | $ cd ~/rnaseq/raw_data
 93 | ```  
 94 | 
 95 | Before we start using software, we have to load the module for each tool. On O2, this is done using an **LMOD** system. 
 96 | 
 97 | If we check which modules we currently have loaded, we should not see FastQC.
 98 | 
 99 | ```bash
100 | $ module list
101 | ```
102 | 
103 | This is because the FastQC program is not in our $PATH (i.e. it's not in a directory that the shell will automatically check to run commands/programs).
104 | 
105 | ```bash
106 | $ echo $PATH
107 | ```
108 | 
109 | To run the FastQC program, we first need to load the appropriate module, so it puts the program into our path. To find the FastQC module to load we need to search the versions available:
110 | 
111 | ```bash
112 | $ module spider fastqc
113 | ```
114 | 
115 | Once we know which version we want to use (0.12.1), we can load the FastQC module:
116 | 
117 | ```bash
118 | $ module load fastqc/0.12.1
119 | ```
120 | 
121 | Once a module for a tool is loaded, you have essentially made it directly available to you like any other basic shell command.
122 | 
123 | ```bash
124 | $ module list
125 | 
126 | $ echo $PATH
127 | ```
128 | 
129 | Now, let's create a directory to store the output of FastQC:
130 | 
131 | ```bash
132 | $ mkdir ~/rnaseq/results/fastqc
133 | ```
134 | 
135 | We will need to specify this directory in the command to run FastQC. How do we know which argument to use?
136 | 
137 | ```bash
138 | $ fastqc --help
139 | ```
140 | 
141 | > **NOTE:** From the help manual, we know that `-o` (or `--outdir`) will create all output files in the specified output directory. Note that another argument, `-t`, specifies the number of files which can be processed simultaneously. We will use the `-t` argument later. You may explore other arguments as well based on your needs.
142 | 
143 | FastQC will accept multiple file names as input, so we can use the `*.fq` wildcard.
144 | 
145 | ```bash
146 | $ fastqc -o ~/rnaseq/results/fastqc/ *.fq
147 | ```
148 | 
149 | *Did you notice how each file was processed serially? How do we speed this up?*
150 | 
151 | FastQC has the capability of splitting up a single process to run on multiple cores! To do this, we will need to specify an additional argument `-t` indicating the number of cores. We will also need to exit the current interactive session, since we started this interactive session with only 1 core. A tool cannot use more cores than were requested for the session on a compute node. 
152 | 
153 | Exit the interactive session and start a new one with 6 cores:
154 | 
155 | ```bash
156 | $ exit  #exit the current interactive session (you will be back on a login node)
157 | 
158 | $ srun --pty -c 6 -p interactive -t 0-3:00 --mem 2G /bin/bash  #start a new one with 6 cores (-c 6) and 2GB RAM (--mem 2G)
159 | ```
160 | 
161 | Once you are on the compute node, check what job(s) you have running and what resources you are using.
162 | 
163 | ```bash
164 | $ O2squeue
165 | ```
166 | 
167 | Now that we are in a new interactive session with the appropriate resources, we will need to load the module again for this new session.
168 | 
169 | ```bash
170 | $ module load fastqc/0.12.1  #reload the module for the new (6-core) interactive session
171 | ```
172 | 
173 | We will also move into the `raw_data` directory (remember we are on a new compute node now):
174 | 
175 | ```bash
176 | $ cd ~/rnaseq/raw_data
177 | ```
178 | 
179 | Run FastQC and use the multi-threading functionality of FastQC to run 6 jobs at once (with an additional argument `-t`).
180 | 
181 | ```bash
182 | $ fastqc -o ~/rnaseq/results/fastqc/ -t 6 *.fq  #note the extra parameter we specified for 6 threads
183 | ```
184 | 
185 | *Do you notice a difference? Is there anything in the output that suggests this is no longer running serially?*
186 | 
187 | ---
188 | *This lesson has been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.*
189 | 
190 | * *The materials used in this lesson were derived from work that is Copyright © Data Carpentry (http://datacarpentry.org/). 
191 | All Data Carpentry instructional material is made available under the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0).*
192 | 


--------------------------------------------------------------------------------
/lessons/06_qc_running_fastqc_sbatch.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Quality control using FASTQC - script running"
  3 | author: "Mary Piper, Radhika Khetani, Meeta Mistry, Jihe Liu"
  4 | date: Friday, October 30, 2020
  5 | duration: 45 minutes
  6 | ---
  7 | 
  8 | ## Learning Objectives:
  9 | 
 10 | * Create and run a SLURM job submission script to automate quality assessment
 11 | 
 12 | ## Quality Control of FASTQ files
 13 | 
 14 | 
 15 | ### Performing quality assessment using job submission scripts
 16 | So far in our FASTQC analysis, we have been directly submitting commands to O2 using an interactive session (i.e. `srun --pty -c 6 -p interactive -t 0-12:00 --mem 6G --reservation=HBC /bin/bash`). However, there are many [more partitions available on O2](https://wiki.rc.hms.harvard.edu/display/O2/Using+Slurm+Basic#UsingSlurmBasic-Partitions(akaQueues)) than just the interactive partition. We can submit a command or series of commands to these partitions using job submission scripts. 
 17 | 
 18 | **Job submission scripts** for O2 are just regular shell scripts, but contain the Slurm **options/directives** for our job submission. These directives define the various resources we are requesting for our job (e.g., *number of cores, name of partition, runtime limit*).
 19 | 
 20 | Submission of the script using the `sbatch` command allows Slurm to run your job when it's your turn. Let's create a job submission script to automate what we have done in the [previous lesson](05_qc_running_fastqc_interactively.md).
 21 | 
 22 | Our script will do the following:
 23 | 
 24 | 1. Change directories to where the FASTQ files are located
 25 | 2. Load the FastQC module
 26 | 3. Run FastQC on all of our FASTQ files
 27 | 
 28 | Let's first change the directory to `~/rnaseq/scripts`, and create a script named `mov10_fastqc.run` using `vim`.
 29 | 
 30 | ```bash
 31 | $ cd ~/rnaseq/scripts
 32 | 
 33 | $ vim mov10_fastqc.run
 34 | ```
 35 | 
 36 | Once in the vim editor, press `i` to enter INSERT mode. The first thing we need in our script is the **shebang line**:
 37 | 
 38 | ```bash
 39 | #!/bin/bash
 40 | ```
 41 | 
 42 | Following the shebang line are the Slurm directives. For the script to run, we need to include options for **queue/partition (-p) and runtime limit (-t)**. To specify our options, we precede the option with `#SBATCH`. Some key resources to specify are:
 43 | 
 44 | |Resource|Flag|Description|
 45 | |:----:|:----:|:----:|
 46 | |partition|-p|partition name|
 47 | |time|-t|run limit in days-hours:minutes (D-HH:MM) format, after which the job will be killed|
 48 | |core|-c|number of cores requested -- this needs to be greater than or equal to the number of cores you plan to use to run your job|
 49 | |memory|--mem|memory limit per compute node for the job|
 50 | 
 51 | Let's specify those options as follows:
 52 | 
 53 | ```bash
 54 | #SBATCH -p short 		# partition name
 55 | #SBATCH -t 0-2:00 		# time limit
 56 | #SBATCH -c 6 		# number of cores
 57 | #SBATCH --mem 6G   # requested memory
 58 | #SBATCH --job-name rnaseq_mov10_fastqc 		# Job name
 59 | #SBATCH -o %j.out			# File to which standard output will be written
 60 | #SBATCH -e %j.err 		# File to which standard error will be written
 61 | ```
 62 | 
 63 | Now in the body of the script, we can include any commands we want to run. In this case, it will be the following:
 64 | 
 65 | ```bash
 66 | ## Change directories to where the fastq files are located
 67 | cd ~/rnaseq/raw_data
 68 | 
 69 | ## Load modules required for script commands
 70 | module load fastqc/0.12.1
 71 | 
 72 | ## Run FASTQC
 73 | fastqc -o ~/rnaseq/results/fastqc/ -t 6 *.fq
 74 | ```
 75 | 
 76 | > **NOTE:** These are the same commands we used when running FASTQC in the interactive session. Since we are writing them in a script, the `tab` completion function will **not work**, so please make sure you don't have any typos when writing the script!
 77 | 
 78 | Once done with your script, press `esc` to exit the INSERT mode. Then save and quit the script by typing `:wq`. You may double check your script by typing `less mov10_fastqc.run`. If everything looks good, submit the job!
 79 | 
 80 | ```bash
 81 | $ sbatch mov10_fastqc.run
 82 | ```
 83 | 
 84 | You should immediately see a prompt saying `Submitted batch job JobID`. Your job is assigned with that unique identifier `JobID`. You can check on the status of your job with:
 85 | 
 86 | ```bash
 87 | $ O2sacct
 88 | ```
 89 | 
 90 | Look for the row that corresponds to your `JobID`. The third column indicates the state of your job. Possible states include `PENDING`, `RUNNING`, `COMPLETED`. Once your job state is `RUNNING`, you should expect it to finish in less than two minutes. When the state is `COMPLETED`, that means your job is finished.
 91 | 
 92 | > **NOTE:** Other helpful options for checking/managing jobs are available as a [cheatsheet](https://wiki.rc.hms.harvard.edu/display/O2/O2+Command+CheatSheet) from HMS-RC.
 93 | 
 94 | Check out the output files in your directory:
 95 | ```bash
 96 | $ ls -lh ../results/fastqc/
 97 | ```
 98 | There should also be one standard error (`.err`) file and one standard output (`.out`) file from the job listed in `~/rnaseq/scripts`. You can move these over to your `logs` directory and give them more intuitive names:
 99 | 
100 | ```bash
101 | $ mv *.err ../logs/fastqc.err
102 | $ mv *.out ../logs/fastqc.out
103 | ```
104 | > **NOTE:** The `.err` and `.out` files store log information during the script running. They are helpful resources, especially when your script does not run as expected and you need to troubleshoot the script.
105 | 
106 | ***
107 | **Exercise**
108 | 1. Take a look at what's inside the `.err` and `.out` files. What do you observe? Do you remember where you saw this information when using the interactive session?
109 | 2. How would you change the `mov10_fastqc.run` script if you had 9 fastq files you wanted to run in parallel? 
110 | 
111 | ---
112 | *This lesson has been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.*
113 | 
114 | * *The materials used in this lesson were derived from work that is Copyright © Data Carpentry (http://datacarpentry.org/). 
115 | All Data Carpentry instructional material is made available under the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0).*
116 | 


--------------------------------------------------------------------------------
/lessons/09_quasi_alignment_salmon_sbatch.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Quantification of transcript abundance using Salmon"
  3 | author: "Mary Piper, Meeta Mistry, Radhika Khetani, Jihe Liu"
  4 | date: "November 16, 2020"
  5 | ---
  6 | 
  7 | Approximate time: 30 minutes
  8 | 
  9 | ## Learning Objectives
 10 | 
 11 | * Create a job submission script to run Salmon on all samples in the dataset
 12 | 
 13 |  
 14 | ## Running Salmon on multiple samples 
 15 | 
 16 | In class we talked in depth about how the Salmon algorithm works, and provided the command required to run Salmon on a single sample. In this lesson we walk through the steps required to **efficiently run Salmon on all samples** in the dataset. Unlike our experience with FastQC, where we could use one command and simply provide all files with the use of a wildcard (`*`), Salmon is only able to take a single file as input.
 17 | 
 18 | Rather than typing out the Salmon command six times, we will use **a for loop to iterate over all FASTQ files in our dataset** (inside the `raw_data` directory). Furthermore, rather than running this `for` loop interactively, we will put it inside a text file and create a **job submission script**.
 19 | 
 20 | ### Create a job submission script to run Salmon in serial
 21 | 
 22 | Let's start by opening up a text file in `vim`:
 23 | 
 24 | ```
 25 | $ vim salmon_all_samples.sbatch
 26 | ```
 27 | 
 28 | Begin the script starting with the **shebang line**. 
 29 | 
 30 | ```bash
 31 | #!/bin/bash
 32 | 
 33 | ```
 34 | ***
 35 | 
 36 | **Exercise 1**
 37 | 
 38 | 1. Add the Slurm directives ( i.e `#SBATCH`) to request specific resources for our job. The resources we need are listed below. 
 39 | 
 40 | > **NOTE:** Helpful resources include:
 41 | > * This [linked lesson](03_working_on_HPC.md#requesting-resources-from-slurm) 
 42 | > * [HMS-RC's O2 Wiki](https://wiki.rc.hms.harvard.edu/display/O2/Using+Slurm+Basic) 
 43 | 
 44 | * Your job will use the `short` partition
 45 | * Request 6 cores to take advantage of Salmon's multi-threading capabilities
 46 | * Request 12 hours of runtime
 47 | * Request 8G of memory 
 48 | * Give your job the name `salmon_in_serial`
 49 | * Add an email and request to be notified when the job is complete
 50 | 
 51 | ***
 52 | 
 53 | Now that we have the resources requested, we can begin to **add the commands into our shell script**. 
 54 | 
 55 | 
 56 | ***
 57 | 
 58 | **Exercise 2**
 59 | 
 60 | 1. Add a line of code required to load the Salmon module
 61 | 2. Add a line of code to change directories to where the Salmon results will be output (be sure to use a full path here).
 62 | 
 63 | > *Add comments to your script liberally, wherever you feel it's needed.*
 64 | 
 65 | ***
 66 | 
 67 | The last piece of the shell script is the **for loop** code provided below. **Copy and paste this into your script**.
 68 | 
 69 | ```bash
 70 | for fq in ~/rnaseq/raw_data/*.fq
 71 | 
 72 | do
 73 | 
 74 | # create a prefix for the output file
 75 | samplename=`basename $fq .fq`
 76 | 
 77 | # run salmon
 78 | salmon quant -i /n/groups/hbctraining/RNA_seq_part_1/reference_data/salmon/ref-transcripts \
 79 |  -l A \
 80 |  -r $fq \
 81 |  -o ${samplename}_salmon \
 82 |  --seqBias \
 83 |  --useVBOpt \
 84 |  --validateMappings
 85 | 
 86 | done
 87 | ```
 88 | 
 89 | Note that our for loop is iterating over all FASTQ files in the `raw_data` directory. For each file, a prefix is generated to name the output file and then the Salmon command is run with the same parameters as used in the single sample run.
 90 | 
 91 | ***
 92 | 
 93 | **Exercise 3**
 94 | 
 95 | 1. Add two additional parameters (as described below) to the current Salmon command (*remember to use "`\`" if splitting one command across multiple lines*): 
 96 | 
 97 | 	1.  `-p`: specifies the number of processors or cores we would like to use for **multi-threading**. What value will you provide here, knowing what we asked for in our Slurm directives?
 98 | 	1. `--numBootstraps`: specifies computation of bootstrapped abundance estimates. **Bootstraps are required for isoform level differential expression analysis for estimation of technical variance**. Here, you can set the value to 30.
 99 | 	
100 | > _**NOTE:** `--numBootstraps` is necessary if performing **isoform-level differential expression analysis** with Sleuth, but not for gene-level differential expression analysis. Due to the statistical procedure required to assign reads to gene isoforms, in addition to the random processes underlying RNA-Seq, there will be **technical variability in the abundance estimates** output from the pseudo-alignment tool [[2](https://rawgit.com/pachterlab/sleuth/master/inst/doc/intro.html), [3](https://www.nature.com/articles/nmeth.4324)] for the isoform level abundance estimates (not necessary for gene-level estimates). Therefore, **we would need technical replicates to distinguish technical variability from the biological variability** for gene isoforms._
101 | >
102 | > _The bootstraps estimate technical variation per gene by calculating the abundance estimates for all genes using a different sub-sample of reads during each round of bootstrapping. The variation in the abundance estimates output from each round of bootstrapping is used for the estimation of the technical variance for each gene._
103 | 
104 | 2. Save and close the script. This script is now ready to run.
105 | 
106 | ```
107 | $ sbatch salmon_all_samples.sbatch
108 | ```
109 | 
110 | 3. **After you have confirmed that the script runs as expected, copy and paste your final script to [Google forms](https://docs.google.com/forms/d/e/1FAIpQLScxaj3IIO4Bx7FCRw87cCeuTPQyhD_7WR2QU638y8IZDv5r1A/viewform?usp=sf_link).** 
111 | 
112 | ---
113 | 
114 | *This lesson has been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.*
115 | 


--------------------------------------------------------------------------------
/lessons/11_multiQC.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: MultiQC
  3 | authors: Radhika Khetani, Mary Piper, Jihe Liu, Meeta Mistry
  4 | ---
  5 | 
  6 | Approximate time: 30 minutes
  7 | 
  8 | ## Learning Objectives
  9 | * Run the multiQC tool to gather QC metrics from multiple tools for all samples
 10 | * Assess and compare QC metrics among samples
 11 | 
 12 | ## Documenting results and gathering QC metrics
 13 | 
 14 | As you go through the RNA-seq workflow (or any data analysis workflow), it is important to document the parameters you used for running the analysis. In addition, it is also very important to document the metrics/results at every step. Careful evaluation of metrics is a form of QC, and it will enable you to identify any issues with the data and/or the parameters you are using, as well as alert you to the presence of contamination or systematic biases, etc.  
 15 | 
 16 | There are several metrics you can evaluate in the RNA-seq workflow. Below are 3 important ones that you should keep track of for each sample:
 17 | 
 18 | * number of raw reads
 19 | * percentage of reads aligned to genome
 20 | * percentage of reads associated with genes 
 21 | 
 22 | An important QC step is to make sure that these metrics are consistent across the samples for a given experiment, and any outliers should be investigated further.
 23 | 
 24 | Manually tracking these metrics is tedious and error-prone. Many tools can help you with the documentation and QC assessment, some of which also have really nice visualizations to easily identify any issues, e.g. [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/), [Qualimap](http://qualimap.bioinfo.cipf.es/doc_html/index.html), [MultiQC](http://multiqc.info/). Some of these tools tend to focus on a single sample at a time, and on QC for a specific step in the workflow. MultiQC, on the other hand, is able to make a report from the output of many different tools (for RNA-seq analysis and other NGS workflows) and it is able to combine the information for multiple samples.
 25 | 
 26 | ### Tracking and aggregating results from workflow tools with *MultiQC*
 27 | 
 28 | In this lesson, we will be using MultiQC to aggregate results from several tools and generate a single HTML report with plots to visualize and compare QC metrics between the samples.
 29 | 
 30 | MultiQC can generate this report from 96 different bioinformatics tools, and these tools span various NGS analyses, e.g., basic QC, RNA-seq, ChIP-seq, variant calling, genome annotation, etc. We are going to use it to aggregate information from the results of [FastQC](http://multiqc.info/docs/#fastqc), [STAR](http://multiqc.info/docs/#star), [Qualimap](http://multiqc.info/docs/#qualimap), and [salmon](http://multiqc.info/docs/#salmon). MultiQC can parse the information from **specific output files** of these tools.
 31 | 
 32 | Start by creating a directory for our output called `multiqc_report`:
 33 | 
 34 | ```bash
 35 | $ cd ~/rnaseq/
 36 | 
 37 | $ mkdir results/multiqc_report
 38 | ```
 39 | 
 40 | Then navigate into that directory: 
 41 | 
 42 | ```bash
 43 | $ cd results/multiqc_report
 44 | ```
 45 | 
 46 | Next, load the three modules needed to run MultiQC: `gcc`, `python`, `multiqc`.
 47 | 
 48 | ```bash
 49 | $ cd ~/rnaseq/results/multiqc_report
 50 | 
 51 | $ module load gcc/9.2.0 python/2.7.12 multiqc/1.21
 52 | ```
 53 | ***
 54 | 
 55 | **Exercise**
 56 | 
 57 | How did we know which modules to load in addition to multiqc?
 58 | 
 59 | ***
 60 | 
 61 | We are going to run MultiQC on the following 4 outputs from our workflow:
 62 | 
 63 | * `.zip` files from FastQC
 64 | * `.Log.final.out` files from STAR
 65 | * `qualimap/*` directories from Qualimap
 66 | * `salmon/*` directories from salmon
 67 | 
 68 | To create a more meaningful report to look at we thought it best to run MultiQC on the full dataset instead of the subset we have been working with so far. We have run each of the tools mentioned above on the full dataset and stored the result in the directory `/n/groups/hbctraining/RNA_seq_part_1/full_dataset_results`. We will point to these files as input for our MultiQC analysis.
 69 | 
 70 | To run MultiQC, we can provide it two inputs at a minimum:
 71 | 
 72 | 1. a name for our output report and folder
 73 | 2. the paths to our results files
 74 | 
 75 | > **NOTE:** MultiQC has additional parameters we could include; use `multiqc -h` to find out more.
 76 | 
 77 | ```bash
 78 | $ multiqc -n multiqc_report_rnaseq \
 79 | /n/groups/hbctraining/RNA_seq_part_1/full_dataset_results/fastqc/*zip \
 80 | /n/groups/hbctraining/RNA_seq_part_1/full_dataset_results/STAR/*Log.final.out \
 81 | /n/groups/hbctraining/RNA_seq_part_1/full_dataset_results/qualimap/* \
 82 | /n/groups/hbctraining/RNA_seq_part_1/full_dataset_results/salmon/*
 83 | ```
 84 | 
 85 | > **NOTE**: You will see the progress of analysis printed out on the terminal as the tool runs. If you want to save this output into a log file (for future reference), you can use the `2>` operator to redirect it to a file. For example, at the end of the command, add `2> log.txt`. `2>` redirects the so-called standard error stream.
 86 | 
 87 | It takes a couple of minutes to generate the MultiQC report. The report provides nice visualizations across samples, which is very useful to determine consistency and to identify problematic samples.
 88 | 
 89 | The output of MultiQC is one HTML file (`multiqc_report_rnaseq.html`) and a data folder. Transfer the interactive HTML report over to your laptop using **FileZilla**, and visualize the outputs of the four tools we used to generate the report.
 90 | 
 91 | > *For a refresher on using Filezilla, please refer back to our [FastQC assessment lesson](07_qc_fastqc_assessment.md).*
 92 | 
 93 | ## Assessing the quality control metrics
 94 | 
 95 | The main metrics to explore first are:
 96 | 
 97 | * number of raw reads or total reads
 98 | * percentage of reads aligned to genome
 99 | * percentage of reads associated with genes 
100 | 
101 | > Note: If you don't see the same columns as ours, you may need to configure the columns, which is a button just underneath the 'General Statistics' heading. 
102 | 
103 | <p align="center">
104 | <img src="../img/multiqc_table1.png" width="750">
105 |   </p>
106 | 
107 | Using `Configure Columns` button, we are going to choose the following columns:
108 | 
109 | <p align="center">
110 | <img src="../img/multiqc_columns.png" width="600">
111 |   </p>
112 | 
113 | In the above image, the description column is helpful in interpreting the table. Upon perusal of the table, we can see input from FastQC, STAR, Qualimap and salmon. For example, the total number of raw reads is given in the `M Seqs` column on the far right of the table. 
114 | 
115 | STAR provides information about *uniquely mapping reads* in the `%Aligned` column. A good quality sample will have **at least 75% of the reads uniquely mapped**. Once the value starts to drop below 60%, it's advisable to start troubleshooting. Low number of uniquely mapping reads means that more reads are mapped to multiple locations. 
116 | 
117 | The 'STAR: Alignment Scores' plot visually represents this mapping information. The % uniquely mapped, multimapped, and unmapped reads can be easily compared between samples to get a nice overview of the quality of the samples.
118 | 
119 | <p align="center">
120 | <img src="../img/multiqc_alignment_scores1.png" width="600">
121 |   </p>
122 | 
123 | > NOTE: The thresholds suggested above will vary depending on the organism that you are working with. Much of what is discussed here is in the context of working with human or mouse data. For example, 75% of mapped reads holds true only if the genome is good or mature. For badly assembled genomes, we may not observe a high mapping rate, even if the actual sequences from the sample are good.
124 | 
125 | Salmon also provides a `%Aligned` column representing the percent of mapped reads. The percentage from Salmon is different from that of STAR, because STAR is based on the alignment to genome reference, while Salmon is based on the alignment to transcriptome reference. Since we will be using the salmon abundance estimates for downstream analysis, these numbers are particularly important for our analysis.
126 | 
127 | 
128 | ### Complexity
129 | 
130 | The complexity of the RNA-seq library can be explored with the `%Dups` column. If a large percentage of the library is duplicated, then this could indicate a library of either low complexity or over-amplification. If huge differences of `%Dups` exist between samples, this may lead to biases in the data, such as different %GC content.
131 | 
132 | ### Exploring biases
133 | 
134 | Within this report, we can also explore the bias metrics output by Qualimap and FastQC. The `5'-3' bias` column denotes whether our data has any 5' or 3' biases. These biases could be due to RNA degradation or different sample preparation techniques. Generally, we should explore our data more if we have biases approaching 0.5 or 2. 
135 | 
136 | The transcript position plot can also help identify 5' or 3' bias, in addition to other coverage issues. We generally expect roughly even coverage.
137 | 
138 | <p align="center">
139 | <img src="../img/multiqc_coverage_profile1.png" width="600">
140 |   </p>
141 | 
142 | In addition, we can see whether our different samples have differences in `%GC` column. GC bias could be caused by low-complexity libraries, differences in amplification, or library-specific issues. We expect to observe similar GC content across samples.
143 | 
144 | ### Contamination
145 | 
146 | We can also identify possible contamination of our samples by inspecting the percentage of reads that are exonic, intronic or intergenic. High levels of intergenic reads are indicative of DNA contamination (>30%). Also, if polyA selection of messenger RNAs was performed in library preparation, then high percentages of intronic reads would also be concerning. 
147 | 
148 | <p align="center">
149 | <img src="../img/qualimap_genomic_origin1.png" width="600">
150 |   </p>
151 | 
152 | Generally speaking, in a good library, we expect over 60% of reads to be mapped to exons for mouse or human organisms. For other organisms, the percentage depends on how well the genome is annotated.
153 | 
154 | ### Fragment length distribution 
155 | 
156 | The auxiliary directory generated from Salmon will contain a file called `fld.gz`. This file contains an approximation of the observed fragment length distribution. This is more meaningful for paired-end data, where the length can be estimated based on the location from both ends of the fragment. These plots can be compared to our expectations based on our knowledge of the size selection step performed during the library preparation stage.
157 | 
158 | > **NOTE:** For single end data (which is what we have), Salmon reports a fixed insert length distribution. Therefore, the values are identical for all samples, and we only observe one distribution curve in the plot.
159 | 
160 | <p align="center">
161 | <img src="../img/salmon_plot_multiqc1.png" width="600">
162 | </p>
163 | 
164 | ---
165 | *This lesson has been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.*
166 | 


--------------------------------------------------------------------------------
/lessons/STAR Alignment Strategy.md:
--------------------------------------------------------------------------------
 1 | title: "STAR alignment strategy"
 2 | author: "Meeta Mistry, Mary Piper"
 3 | date: Monday September 10, 2018
 4 | ---
 5 | 
 6 | Approximate time: 20 minutes
 7 | 
 8 | ### STAR Alignment Strategy
 9 | 
10 | STAR is shown to have high accuracy and outperforms other aligners by more than a factor of 50 in mapping speed, but it is memory intensive. The algorithm achieves this highly efficient mapping by performing a two-step process:
11 | 
12 | 1. Seed searching
13 | 2. Clustering, stitching, and scoring
14 | 
15 | #### Seed searching
16 | 
17 | For every read that STAR aligns, STAR will search for the longest sequence that exactly matches one or more locations on the reference genome. These longest matching sequences are called the Maximal Mappable Prefixes (MMPs):
18 | 
19 | ![STAR_step1](../img/alignment_STAR_step1.png)
20 | 	
21 | The different parts of the read that are mapped separately are called 'seeds'. So the first MMP that is mapped to the genome is called *seed1*.
22 | 
23 | STAR will then search again for only the unmapped portion of the read to find the next longest sequence that exactly matches the reference genome, or the next MMP, which will be *seed2*. 
24 | 
25 | ![STAR_step2](../img/alignment_STAR_step2.png)
26 | 
27 | This sequential searching of only the unmapped portions of reads underlies the efficiency of the STAR algorithm. STAR uses an uncompressed suffix array (SA) to efficiently search for the MMPs; this allows for quick searching against even the largest reference genomes. Other slower aligners use algorithms that often search for the entire read sequence before splitting reads and performing iterative rounds of mapping.
28 | 
29 | **If STAR does not find an exact matching sequence** for each part of the read due to mismatches or indels, the previous MMPs will be extended.
30 | 
31 | ![STAR_step3](../img/alignment_STAR_step3.png)
32 | 
33 | **If extension does not give a good alignment**, then the poor quality or adapter sequence (or other contaminating sequence) will be soft clipped.
34 | 
35 | ![STAR_step4](../img/alignment_STAR_step4.png)
36 | 
37 | 
38 | #### Clustering, stitching, and scoring
39 | 
40 | The separate seeds are stitched together to create a complete read by first clustering the seeds together based on proximity to a set of 'anchor' seeds, or seeds that are not multi-mapping.
41 | 
42 | Then the seeds are stitched together based on the best alignment for the read (scoring based on mismatches, indels, gaps, etc.). 
43 | 
44 | ![STAR_step5](../img/alignment_STAR_step5.png)
45 | 


--------------------------------------------------------------------------------
/lessons/STAR_alignment.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Alignment with STAR"
  3 | author: "Meeta Mistry, Bob Freeman, Mary Piper"
  4 | date: Wednesday, June 7, 2017
  5 | ---
  6 | 
  7 | Approximate time: 90 minutes
  8 | 
  9 | ## Learning Objectives:
 10 | 
 11 | * Understanding the alignment method STAR utilizes to align sequence reads to the reference genome
 12 | * Identifying the intricacies of alignment tools used in NGS analysis (parameters, usage, etc)
 13 | * Choosing appropriate STAR alignment parameters for our dataset
 14 | 
 15 | ## Read Alignment
 16 | 
 17 | <img src="../img/RNAseqWorkflow.png" width="400">
 18 | 
 19 | Now that we have explored the quality of our raw reads, we can move on to read alignment. We perform read alignment or mapping to determine where in the genome the reads originated from. The alignment process consists of choosing an appropriate reference genome to map our reads against and performing the read alignment using one of several splice-aware alignment tools such as [STAR](http://bioinformatics.oxfordjournals.org/content/early/2012/10/25/bioinformatics.bts635) or [HISAT2](http://ccb.jhu.edu/software/hisat2/index.shtml). The choice of aligner is often a personal preference and also dependent on the computational resources that are available to you.
 20 | 
 21 | ## STAR Aligner
 22 | 
 23 | To determine where on the human genome our reads originated from, we will align our reads to the reference genome using [STAR](http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3530905/) (Spliced Transcripts Alignment to a Reference). STAR is an aligner designed to specifically address many of the challenges of RNA-seq data mapping using a strategy to account for spliced alignments. 
 24 | 
 25 | ### STAR Alignment Strategy
 26 | 
 27 | STAR is shown to have high accuracy and outperforms other aligners by more than a factor of 50 in mapping speed, but it is memory intensive. The algorithm achieves this highly efficient mapping by performing a two-step process:
 28 | 
 29 | 1. Seed searching
 30 | 2. Clustering, stitching, and scoring
 31 | 
 32 | #### Seed searching
 33 | 
 34 | For every read that STAR aligns, STAR will search for the longest sequence that exactly matches one or more locations on the reference genome. These longest matching sequences are called the Maximal Mappable Prefixes (MMPs):
 35 | 
 36 | 
 37 | ![STAR_step1](../img/alignment_STAR_step1.png)
 38 | 	
 39 | The different parts of the read that are mapped separately are called 'seeds'. So the first MMP that is mapped to the genome is called *seed1*.
 40 | 
 41 | STAR will then search again for only the unmapped portion of the read to find the next longest sequence that exactly matches the reference genome, or the next MMP, which will be *seed2*. 
 42 | 
 43 | ![STAR_step2](../img/alignment_STAR_step2.png)
 44 | 
 45 | This sequential searching of only the unmapped portions of reads underlies the efficiency of the STAR algorithm. STAR uses an uncompressed suffix array (SA) to efficiently search for the MMPs; this allows for quick searching against even the largest reference genomes. Other slower aligners use algorithms that often search for the entire read sequence before splitting reads and performing iterative rounds of mapping.
 46 | 
 47 | **If STAR does not find an exact matching sequence** for each part of the read due to mismatches or indels, the previous MMPs will be extended.
 48 | 
 49 | ![STAR_step3](../img/alignment_STAR_step3.png)
 50 | 
 51 | **If extension does not give a good alignment**, then the poor quality or adapter sequence (or other contaminating sequence) will be soft clipped.
 52 | 
 53 | ![STAR_step4](../img/alignment_STAR_step4.png)
 54 | 
 55 | 
 56 | #### Clustering, stitching, and scoring
 57 | 
 58 | The separate seeds are stitched together to create a complete read by first clustering the seeds together based on proximity to a set of 'anchor' seeds, or seeds that are not multi-mapping.
 59 | 
 60 | Then the seeds are stitched together based on the best alignment for the read (scoring based on mismatches, indels, gaps, etc.). 
 61 | 
 62 | ![STAR_step5](../img/alignment_STAR_step5.png)
 63 | 
 64 | ## Running STAR
 65 | 
 66 | ### Set-up
 67 | 
 68 | To get started with this lesson, start an interactive session with 6 cores:
 69 | 
 70 | ```bash
 71 | $ srun --pty -p interactive -t 0-12:00 -c 6 --mem 8G --reservation=HBC1 /bin/bash
 72 | ```
 73 | 
 74 | You should have a directory tree setup similar to that shown below. It is best practice to have all files you intend on using for your workflow present within the same directory. In our case, we have our original FASTQ files generated in the previous section. 
 75 | 
 76 | ```bash
 77 | rnaseq
 78 | 	├── logs
 79 | 	├── meta
 80 | 	├── raw_data
 81 | 	│   ├── Irrel_kd_1.subset.fq
 82 | 	│   ├── Irrel_kd_2.subset.fq
 83 | 	│   ├── Irrel_kd_3.subset.fq
 84 | 	│   ├── Mov10_oe_1.subset.fq
 85 | 	│   ├── Mov10_oe_2.subset.fq
 86 | 	│   └── Mov10_oe_3.subset.fq
 87 | 	├── results
 88 | 	└── scripts
 89 | ```
 90 | 
 91 | To use the STAR aligner, load the module: 
 92 | 
 93 | ```bash
 94 | $ module load gcc/6.2.0 star/2.5.2b
 95 | ```
 96 | 
 97 | Aligning reads using STAR is a two step process:   
 98 | 
 99 | 1. Create a genome index 
100 | 2. Map reads to the genome
101 | 
102 | > A quick note on shared databases for human and other commonly used model organisms. The O2 cluster has a designated directory at `/n/groups/shared_databases/` in which there are files that can be accessed by any user. These files contain, but are not limited to, genome indices for various tools, reference sequences, tool specific data, and data from public databases, such as NCBI and PDB. So when using a tool that requires a reference of sorts, it is worth taking a quick look here because chances are it's already been taken care of for you. 
103 | >
104 | >```bash
105 | > $ ls -l /n/groups/shared_databases/igenome/
106 | >```
107 | 
108 | ### Creating a genome index
109 | 
110 | For this workshop we are using reads that originate from a small subsection of chromosome 1 (~300,000 reads) and so we are using only chr1 as the reference genome. 
111 | 
112 | To store our genome indices, we will use the `/n/scratch2/` space with large temporary storage capacity. We need to create a directory for the indices within this space:
113 | 
114 | ```bash
115 | $ mkdir -p /n/scratch2/username/chr1_hg38_index
116 | ```
117 | 
118 | The basic options to **generate genome indices** using STAR are as follows:
119 | 
120 | * `--runThreadN`: number of threads
121 | * `--runMode`: genomeGenerate mode
122 | * `--genomeDir`: /path/to/store/genome_indices
123 | * `--genomeFastaFiles`: /path/to/FASTA_file 
124 | * `--sjdbGTFfile`: /path/to/GTF_file
125 | * `--sjdbOverhang`: readlength -1
126 | 
127 | > *NOTE:* In case of reads of varying length, the ideal value for `--sjdbOverhang` is max(ReadLength)-1. In most cases, the default value of 100 will work similarly to the ideal value.
128 | 
129 | Now let's create a job submission script to generate the genome index:
130 | 
131 | ```bash
132 | $ vim ~/rnaseq/scripts/genome_index.run
133 | ```
134 | Within `vim` we now add our shebang line, the SLURM directives, and our STAR command. 
135 | 
136 | ```bash
137 | #!/bin/bash
138 | 
139 | #SBATCH -p short 		# partition name
140 | #SBATCH -t 0-2:00 		# days-hours:minutes runlimit after which job will be killed
141 | #SBATCH -n 6 		# number of cores requested -- this needs to be greater than or equal to the number of cores you plan to use to run your job
142 | #SBATCH --mem 16G
143 | #SBATCH --job-name STAR_index 		# Job name
144 | #SBATCH -o %j.out			# File to which standard out will be written
145 | #SBATCH -e %j.err 		# File to which standard err will be written
146 | 
147 | cd /n/scratch2/username/
148 | 
149 | module load gcc/6.2.0 star/2.5.2b
150 | 
151 | STAR --runThreadN 6 \
152 | --runMode genomeGenerate \
153 | --genomeDir chr1_hg38_index \
154 | --genomeFastaFiles /n/groups/hbctraining/intro_rnaseq_hpc/reference_data_ensembl38/Homo_sapiens.GRCh38.dna.chromosome.1.fa \
155 | --sjdbGTFfile /n/groups/hbctraining/intro_rnaseq_hpc/reference_data_ensembl38/Homo_sapiens.GRCh38.92.gtf \
156 | --sjdbOverhang 99
157 | ```
158 | 
159 | ```bash
160 | $ sbatch ~/rnaseq/scripts/genome_index.run
161 | ```
162 | 
163 | ### Aligning reads
164 | 
165 | After you have the genome indices generated, you can perform the read alignment. We previously generated the genome indices for you in the `/n/groups/hbctraining/intro_rnaseq_hpc/reference_data_ensembl38/ensembl38_STAR_index/` directory so that we don't get held up waiting on the generation of the indices.
166 | 
167 | Create an output directory for our alignment files:
168 | 
169 | ```bash
170 | $ cd ~/rnaseq/raw_data
171 | 
172 | $ mkdir ../results/STAR
173 | ```
174 | 
175 | ### STAR command in interactive bash
176 | 
177 | For now, we're going to work on just one sample to set up our workflow. To start we will use the first replicate in the Mov10 over-expression group, `Mov10_oe_1.subset.fq`. Details on STAR and its functionality can be found in the [user manual](https://github.com/alexdobin/STAR/blob/master/doc/STARmanual.pdf); we encourage you to peruse through to get familiar with all available options.
178 | 
179 | The basic options for aligning reads to the genome using STAR are:
180 | 
181 | * `--runThreadN`: number of threads / cores
182 | * `--readFilesIn`: /path/to/FASTQ_file
183 | * `--genomeDir`: /path/to/genome_indices_directory
184 | * `--outFileNamePrefix`: prefix for all output files
185 | 
186 | Listed below are additional parameters that we will use in our command:
187 | 
188 | * `--outSAMtype`: output filetype (SAM default)
189 | * `--outSAMunmapped`: what to do with unmapped reads
190 | 
191 | > **NOTE:** Default filtering is applied in which the maximum number of multiple alignments allowed for a read is set to 10. If a read exceeds this number there is no alignment output. To change the default you can use `--outFilterMultimapNmax`, but for this lesson we will leave it as default. Also, note that "**STAR’s default parameters are optimized for mammalian genomes.** Other species may require significant modifications of some alignment parameters; in particular, the maximum and minimum intron sizes have to be reduced for organisms with smaller introns" [[1](http://bioinformatics.oxfordjournals.org/content/early/2012/10/25/bioinformatics.bts635.full.pdf+html)].
192 | 
193 | We can access the software by simply using the STAR command followed by the basic parameters described above and any additional parameters. The full command is provided below for you to copy and paste into your terminal. If you want to manually enter the command, it is advisable to first type out the full command in a text editor (e.g. [Sublime Text](http://www.sublimetext.com/) or [Notepad++](https://notepad-plus-plus.org/)) on your local machine and then copy and paste it into the terminal. This will make it easier to catch typos and make appropriate changes. 
194 | 
195 | ```bash
196 | 
197 | STAR --genomeDir /n/groups/hbctraining/intro_rnaseq_hpc/reference_data_ensembl38/ensembl38_STAR_index/ \
198 | --runThreadN 6 \
199 | --readFilesIn Mov10_oe_1.subset.fq \
200 | --outFileNamePrefix ../results/STAR/Mov10_oe_1_ \
201 | --outSAMtype BAM SortedByCoordinate \
202 | --outSAMunmapped Within \
203 | --outSAMattributes Standard 
204 | 
205 | ```
206 | 
207 | ---
208 | *This lesson has been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.*
209 | 


--------------------------------------------------------------------------------
/lessons/STAR_alignment_strategy.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "STAR alignment strategy"
 3 | author: "Meeta Mistry, Mary Piper"
 4 | date: Monday September 10, 2018
 5 | ---
 6 | 
 7 | Approximate time: 20 minutes
 8 | 
 9 | ### STAR Alignment Strategy
10 | 
11 | STAR is shown to have high accuracy and outperforms other aligners by more than a factor of 50 in mapping speed, but it is memory intensive. The algorithm achieves this highly efficient mapping by performing a two-step process:
12 | 
13 | 1. Seed searching
14 | 2. Clustering, stitching, and scoring
15 | 
16 | #### Seed searching
17 | 
18 | For every read that STAR aligns, STAR will search for the longest sequence that exactly matches one or more locations on the reference genome. These longest matching sequences are called the Maximal Mappable Prefixes (MMPs):
19 | 
20 | ![STAR_step1](../img/alignment_STAR_step1.png)
21 | 	
22 | The different parts of the read that are mapped separately are called 'seeds'. So the first MMP that is mapped to the genome is called *seed1*.
23 | 
24 | STAR will then search again for only the unmapped portion of the read to find the next longest sequence that exactly matches the reference genome, or the next MMP, which will be *seed2*. 
25 | 
26 | ![STAR_step2](../img/alignment_STAR_step2.png)
27 | 
28 | This sequential searching of only the unmapped portions of reads underlies the efficiency of the STAR algorithm. STAR uses an uncompressed suffix array (SA) to efficiently search for the MMPs; this allows for quick searching against even the largest reference genomes. Other slower aligners use algorithms that often search for the entire read sequence before splitting reads and performing iterative rounds of mapping.
29 | 
30 | **If STAR does not find an exact matching sequence** for each part of the read due to mismatches or indels, the previous MMPs will be extended.
31 | 
32 | ![STAR_step3](../img/alignment_STAR_step3.png)
33 | 
34 | **If extension does not give a good alignment**, then the poor quality or adapter sequence (or other contaminating sequence) will be soft clipped.
35 | 
36 | ![STAR_step4](../img/alignment_STAR_step4.png)
37 | 
38 | 
39 | #### Clustering, stitching, and scoring
40 | 
41 | The separate seeds are stitched together to create a complete read by first clustering the seeds together based on proximity to a set of 'anchor' seeds, or seeds that are not multi-mapping.
42 | 
43 | Then the seeds are stitched together based on the best alignment for the read (scoring based on mismatches, indels, gaps, etc.). 
44 | 
45 | ![STAR_step5](../img/alignment_STAR_step5.png)
46 | 
47 | ---
48 | *This lesson has been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.*
49 | 


--------------------------------------------------------------------------------
/lessons/counting_reads.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: Counting reads
  3 | author: Meeta Mistry, Bob Freeman, Radhika Khetani
  4 | date: 06/06/2017
  5 | ---
  6 | 
  7 | Approximate time: 75 minutes
  8 | 
  9 | ## Learning Objectives:
 10 | 
 11 | * understand how counting tools work
 12 | * generate a count matrix using featureCounts
 13 | 
 14 | 
 15 | ## Counting reads as a measure of gene expression
 16 | <img src="../img/counts-workflow.png" width="400">
 17 | 
 18 | Once we have our reads aligned to the genome, the next step is to count how many reads have mapped to each gene. There are many tools that can use BAM files as input and output the number of reads (counts) associated with each feature of interest (genes, exons, transcripts, etc.). 2 commonly used counting tools are [featureCounts](http://bioinf.wehi.edu.au/featureCounts/) and [htseq-count](http://www-huber.embl.de/users/anders/HTSeq/doc/count.html). 
 19 | 
 20 | * The above tools only report the "raw" counts of reads that **map to a single location** (uniquely mapping) and are best at counting at the **gene level**. Essentially, total read count associated with a gene (*meta-feature*) = the sum of reads associated with each of the exons (*feature*) that "belong" to that gene.
 21 | 
 22 | * There are **other tools** available that are able to account for **multiple transcripts** for a given gene. In this case the counts are not whole numbers, but have fractions. In the simplest example case, if 1 read is associated with 2 transcripts, it can get counted as 0.5 and 0.5 and the resulting count for that transcript is not a whole number.
 23 | 
 24 | * In addition there are **other tools that will count multimapping reads**, but this is a dangerous thing to do since you will be overcounting the total number of reads which can cause issues with normalization and eventually with accuracy of differential gene expression results. 
 25 | 
 26 | **Input for counting = multiple BAM files + 1 GTF file**
 27 | 
 28 | Simply speaking, the genomic coordinates of where the read is mapped (BAM) are cross-referenced with the genomic coordinates of whichever feature you are interested in counting expression of (GTF). The feature can be exons, genes or transcripts.
 29 | 
 30 | <img src="../img/count-fig2.png" width="600">
 31 | 
 32 | **Output of counting = A count matrix, with genes as rows and samples as columns**
 33 | 
 34 | These are the "raw" counts and will be used in statistical programs downstream for differential gene expression.
 35 | 
 36 | <img src="../img/count-matrix.png" width="300">
 37 | 
 38 | ### Counting using featureCounts
 39 | Today, we will be using the [featureCounts](http://bioinf.wehi.edu.au/featureCounts/) tool to get the *gene* counts. We picked this tool because it is accurate, fast and is relatively easy to use. It counts reads that map to a single location (uniquely mapping) and follows the scheme in the figure below for assigning reads to a gene/exon. 
 40 | 
 41 | <img src="../img/union.png" width="300">
 42 | 
 43 | featureCounts can also take into account whether your data are **stranded** or not. If strandedness is specified, then in addition to considering the genomic coordinates it will also take the strand into account for counting. If your data are stranded always specify it.
 44 | 
 45 | #### Setting up to run featureCounts
 46 | First things first, start an interactive session with 4 cores:
 47 | 	
 48 | ``` bash
 49 | $ srun --pty -p interactive -t 0-12:00 -n 4 --mem 8G --reservation=HBC1 /bin/bash
 50 | ```
 51 | 
 52 | Now, change directories to your rnaseq directory and start by creating 2 directories, (1) a directory for the output and (2) a directory for the bam files:
 53 | 
 54 | ``` bash
 55 | $ cd ~/rnaseq/
 56 | $ mkdir results/counts results/STAR/bams
 57 | ```
 58 | 
 59 | Rather than using the BAM file we generated in the last lesson, let's copy over all of the BAM files that we have already generated for you:
 60 | 	
 61 | ``` bash
 62 | 
 63 | $ cp /n/groups/hbctraining/intro_rnaseq_hpc/bam_STAR/*bam ~/rnaseq/results/STAR/bams
 64 | ```
 65 | featureCounts is not available as a module on O2, but we have already added the path for it to our `$PATH` variable last time. 
 66 | 
 67 | ``` bash
 68 | $ echo $PATH  # You should see /n/app/bcbio/tools/bin/ among other paths
 69 | ```
 70 | 
 71 | > **If you don't see `/n/app/bcbio/tools/bin/` in your `$PATH` variable, add the following `export` command to your `~/.bashrc` file using vim: `export PATH=/n/app/bcbio/tools/bin/:$PATH`.**
 72 | 
 73 | 
 74 | #### Running featureCounts
 75 | 
 76 | How do we use this tool, what is the command and what options/parameters are available to us?
 77 | 
 78 | ``` bash
 79 | $ featureCounts
 80 | ```
 81 | 
 82 | So, it looks like the usage is `featureCounts [options] -a <annotation_file> -o <output_file> input_file1 [input_file2] ... `, where `-a`, `-o` and input files are required.
 83 | 
 84 | We are going to use the following options:
 85 | 
 86 | `-T 4 # specify 4 cores`
 87 | 
 88 | `-s 2 # these data are "reverse"ly stranded`
 89 | 
 90 | and the following are the values for the required parameters:
 91 | 
 92 | `-a /n/groups/hbctraining/intro_rnaseq_hpc/reference_data_ensembl38/Homo_sapiens.GRCh38.92.gtf # required option for specifying path to GTF`
 93 | 
 94 | `-o ~/rnaseq/results/counts/Mov10_featurecounts.txt # required option for specifying path to, and name of the text output (count matrix)`
 95 | 
 96 | `~/rnaseq/results/STAR/bams/*bam # the list of all the bam files we want to collect count information for`
 97 | 
 98 | Let's run this now:
 99 | 
100 | ``` bash
101 | $ featureCounts -T 4 -s 2 \
102 |   -a /n/groups/hbctraining/intro_rnaseq_hpc/reference_data_ensembl38/Homo_sapiens.GRCh38.92.gtf \
103 |   -o ~/rnaseq/results/counts/Mov10_featurecounts.txt \
104 |   ~/rnaseq/results/STAR/bams/*.out.bam
105 | ```
106 | 
107 | > If you wanted to collect the information that is on the screen as the job runs, you can modify the command and add the `2>` redirection at the end. This type of redirection will collect all the information from the terminal/screen into a file.
108 | 
109 | ``` bash
110 | # **DO NOT RUN THIS** 
111 | # note the last line of the command below
112 | 	
113 | $ featureCounts -T 4 -s 2 \
114 |   -a /n/groups/hbctraining/intro_rnaseq_hpc/reference_data_ensembl38/Homo_sapiens.GRCh38.92.gtf \
115 |   -o ~/rnaseq/results/counts/Mov10_featurecounts.txt \
116 |   ~/rnaseq/results/STAR/bams/*.out.bam \
117 |   2> ~/rnaseq/results/counts/Mov10_featurecounts.screen-output
118 | ```
119 | #### featureCounts output
120 | 
121 | The output of this tool is 2 files, *a count matrix* and *a summary file* that tabulates how many of the reads were "assigned" or counted and the reasons the remaining reads were "unassigned". Let's take a look at the summary file:
122 | 	
123 | ``` bash
124 | $ less results/counts/Mov10_featurecounts.txt.summary
125 | ```
126 | Now let's look at the count matrix:
127 | 	
128 | ``` bash
129 | $ less results/counts/Mov10_featurecounts.txt
130 | ```	
131 | 
132 | ##### Cleaning up the featureCounts matrix
133 | The count matrix also contains information about the genomic coordinates and the length of each gene. We don't need this for the next step, so we are going to extract only the columns that we are interested in.
134 | 	
135 | ``` bash
136 | $ cut -f1,7,8,9,10,11,12 results/counts/Mov10_featurecounts.txt > results/counts/Mov10_featurecounts.Rmatrix.txt
137 | ```
138 | The next step is to clean it up a little further by modifying the header line (we could also do this in R, or in a GUI text editor):
139 | 	
140 | ``` bash
141 | $ vim results/counts/Mov10_featurecounts.Rmatrix.txt
142 | ```
143 | 
144 | Vim has nice shortcuts for cleaning up the header of our file using the following steps: 
145 | 
146 | 1. Move the cursor to the beginning of the document by typing: `gg` (in command mode). 
147 | 2. Remove the first line by typing: `dd` (in command mode).
148 | 3. Remove the file name following the sample name by typing: `:%s/_Aligned.sortedByCoord.out.bam//g` (in command mode).
149 | 4. Remove the path leading up to the file name by typing: `:%s/\/home\/username\/rnaseq\/results\/STAR\/bams\///g` (in command mode).
150 | 	
151 | 	> Note that we have a `\` preceding each `/`, which tells vim that we are not using the `/` as part of our search and replace command, but instead the `/` is part of the pattern that we are replacing. This is called *escaping* the `/`.
152 | 
153 | ### Note on counting PE data
154 | 
155 | For paired-end (PE) data, the bam file contains information about whether both read1 and read2 mapped and if they were at roughly the correct distance from each other, that is to say if they were "properly" paired. For most counting tools, **only properly paired reads are considered by default, and each read pair is counted only once as a single "fragment"**. 
156 | 
157 | For counting PE fragments associated with genes, the input bam files need to be sorted by read name (i.e. alignment information about both reads of a pair in adjoining rows). The alignment tool might sort them for you, but watch out for how the sorting was done. If they are sorted by coordinates (like with STAR), you will need to use `samtools sort` to re-sort them by read name before using as input in featureCounts. If you do not sort your BAM file by read name before using it as input, featureCounts assumes that almost all the reads are not properly paired.
158 | 
159 | 
160 | ---
161 | *This lesson has been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.*
162 | 


--------------------------------------------------------------------------------
/lessons/fastqc-troubleshooting.md:
--------------------------------------------------------------------------------
 1 | ## Troubleshooting quality issues of raw data
 2 | 
 3 | While the data for this analysis is quite good, it's unfortunate that that's not always the case. So now that we know a bit about the types of quality issues to check for in the raw RNA-seq data, how do we troubleshoot them?
 4 | 
 5 | <img src="../img/qc_troubleshooting.png" width="800">
 6 |    
 7 | To help think through the troubleshooting, we can arrange the data by the main problems encountered:
 8 | 
 9 | - **Poor quality data**
10 |    - Poor quality at 3' end of sequence
11 |       - **Probable cause(s):** Fluorescent signal decay or phasing issues - expected for Illumina data, but take note of the decrease in quality.
12 |    - Poor quality across sequence
13 |       - **Probable cause(s):** Problems at the sequencing facility - contact them
14 |    - Drop in quality in the middle
15 |       - **Probable cause(s):** Problems at the sequencing facility - contact them
16 |    - Large percentage of sequences with low mean quality scores
17 |       - **Probable cause(s):** Problems at the sequencing facility - contact them
18 |    
19 | - **Issues based on read sequence expectations**
20 |    
21 |    - Unexpected %GC for organism and/or % of each nucleotide does not remain similar across the read (except for first 10-12 bases for RNA-Seq)
22 |       - **Probable cause(s):** Contaminating sequences: different species, adapters, vector, mitochondrial/rRNA
23 |    - High level of sequence duplications 
24 |       - **Probable cause(s):** Low complexity library, too many cycles of PCR amplification / too little starting material
25 |    - Over-represented sequences more than 1-2%, unless expected based on experimental design
26 |       - **Probable cause(s):** Contaminating sequences: adapters, vector, mitochondrial/rRNA
27 | 


--------------------------------------------------------------------------------
/lessons/more_bash_cluster.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Bash_extras"
  3 | author: "Radhika Khetani, Meeta Mistry"
  4 | date: 2019-10-29
  5 | duration: 30
  6 | ---
  7 | 
  8 | ## Overview
  9 | 
 10 | * [Creating shortcuts or `alias`](#alias)
 11 | * [Copying files using `scp` and `rsync`](#rsync) 
 12 | * [Symbolic Links or "sym links"](#symlink)
 13 | 
 14 | ***
 15 | 
 16 | ## Setting up some `alias`es <a name="alias"></a>
 17 | 
 18 | In your terminal, do the following:
 19 | 
 20 | ```bash
 21 | $ cd
 22 | 
 23 | $ ls -l
 24 | 
 25 | $ ll
 26 | ```
 27 | 
 28 | `ll` should have output the same thing as `ls -l`. Why does it work this way? This is because the HMS-RC folks have internally setup what is called an **alias**. 
 29 | 
 30 | A **shell alias is a shortcut to reference a command**. It can be used to avoid typing long commands. For common patterns it can reduce keystrokes and improve efficiency. A simple example is setting default options on commands to avoid having to type them each time a command is run.
 31 | 
 32 | For example, suppose that because you are just starting out on the cluster, you would prefer to be asked for confirmation before the `rm` command deletes a file. Remember that the `rm` command supports this with the `-i` option. To avoid forgetting to use the `-i` option each time, an alias can be created so that each time `rm` is run it will use the `-i` option and prompt the user to confirm.
 33 | 
 34 | 
 35 | ```bash
 36 | $ alias rm='rm -i'
 37 | ```
 38 | 
 39 | However, this alias is only going to be available to you while that Terminal window is open. If you wanted to **use that alias all the time, what would you do?** 
 40 | 
 41 | You would add it to `~/.bashrc`! Let's open `~/.bashrc` and add a few commands to it. At the bottom of the file you should see a header titled "User specific aliases". Under that header go ahead and add the alias.
 42 | 
 43 | ```bash
 44 | $ vim ~/.bashrc
 45 | ```
 46 | 
 47 | Add in a line at the end of your `.bashrc` file:
 48 | 
 49 | ```
 50 | alias rm='rm -i'
 51 | ```
 52 | 
 53 | 
 54 | Now, we can source the `.bashrc` file for the alias to take effect and we can try it out. You should see the question
 55 | `remove draft.txt?` and here you can answer `n` for No.
 56 | 
 57 | ```bash
 58 | $ source ~/.bashrc
 59 | 
 60 | $ rm  ~/unix_lesson/other/draft.txt 
 61 | ```
 62 | 
 63 | As we mentioned, aliases are super helpful for long commands that we are repeatedly having to type out. A good example of this is the `srun` command for starting an interactive session. **First exit the interactive session and get on a login node, if you are not there already.**
 64 | 
 65 | ```bash
 66 | $ alias o2i='srun --pty -p interactive -t 0-12:00 --mem 2G --reservation=HBC /bin/bash'
 67 | ```
 68 | 
 69 | Now you can test it out!
 70 | 
 71 | ```bash
 72 | $ o2i
 73 | ```
 74 | 
 75 | Similar to what we did above, you can put this (or a similar) command in the `.bash_profile` file so it is available when you log on next time.
 76 | 
 77 | > ### `.bashrc` versus `.bash_profile`
 78 | > `.bash_profile` is executed for login shells, while `.bashrc` is executed for interactive non-login shells. When you login (type username and password) to O2 the `.bash_profile` is executed. So if you want the alias available **only** when you login, you will want to put it in your `.bash_profile`. 
 79 | 
 80 | ## Copying files to and from the cluster <a name="rsync"></a>
 81 | 
 82 | So far we have used FileZilla to copy files over from O2, but there are other ways to do so using the command line interface. When you obtain your data from the sequencing facility, it will likely be stored on some remote computer and they will give you login credentials which will allow you to access it. There are various commands that can be used to help you copy those files from the remote computer over to 1) your local computer, 2) O2, or 3) whatever cluster environment you plan to work on. We present a few options here.
 83 | 
 84 | ### `scp`
 85 | 
 86 | Similar to the `cp` command to copy there is a command that allows you to **securely copy files between computers**. The command is called `scp` and allows files to be copied to, from, or between different hosts. It uses ssh for data transfer and provides the same authentication and same level of security as ssh. 
 87 | 
 88 | In the example below, the first argument is the **location on the remote server** and the second argument is the **destination on your local machine**. 
 89 | 
 90 | > *You can also do this in the opposite direction by swapping the arguments.*
 91 | 
 92 | ```bash
 93 | $ scp username@transfer.rc.hms.harvard.edu:/path/to/file_on_O2 Path/to/directory/local_machine
 94 | ```
 95 | 
 96 | Let's try copying over the `draft.txt` file from the `other` directory in your `unix_lesson` folder. **First open up a new terminal window.**  Look and see where you currently are:
 97 | 
 98 | ```bash
 99 | $ pwd
100 | ```
101 | 
102 | Then type in:
103 | 
104 | ```bash
105 | $ scp rc_trainingXX@transfer.rc.hms.harvard.edu:~/unix_lesson/other/draft.txt  .
106 | ```
107 | 
108 | Now see that the file has transferred over:
109 | 
110 | ```bash
111 | $ less draft.txt
112 | ```
113 | 
114 | > **NOTE:** Windows users may encounter a permissions error when using `scp` to copy over locally. We are not sure how to troubleshoot this, but will update materials as we obtain more information.
115 | 
116 | ### `rsync` 
117 | 
118 | `rsync` is used to copy or synchronize data between directories. It has many advantages over `cp`, `scp` etc. It works in a specific direction, i.e. from the first directory **to** the second directory, similar to `cp`.
119 | 
120 | **Salient Features of `rsync`**
121 | 
122 | * If the command (or transfer) is interrupted, you can start it again and *it will restart from where it was interrupted*.
123 | * Once a folder has been synced between 2 locations, the next time you run `rsync` it will *only update and not copy everything over again*. 
124 | * It runs a check to ensure that every file it is "syncing" over is the exact same in both locations. This check is run using a version of ["checksum"](https://en.wikipedia.org/wiki/Checksum) which ensures the data integrity during the data transfer process. 
125 | 
126 | > You can run the checksum function yourself when transferring large datasets without `rsync` using one of the following commands (or similar): `md5`, `md5sum`.
127 | 
128 | 
129 | ### Between directories on the same machine
130 | 
131 | ```bash
132 | #DO NOT RUN
133 | $ rsync -av ~/large_dataset/. /n/groups/dir/groupdata/
134 | ```
135 | 
136 | ### Between different machines
137 | 
138 | When copying over large datasets to or from a remote machine, `rsync` works similarly to `scp`.
139 | 
140 | ```bash
141 | #DO NOT RUN
142 | $ rsync -av -e ssh testfile username@transfer.rc.hms.harvard.edu:~/large_files/
143 | ```
144 | 
145 | * `a` is for archive - means it preserves permissions (owners, groups), times, symbolic links, and devices.
146 | * `v` is for verbosity - means that it prints on the screen what is being copied
147 | * `-e ssh` is for encryption - means that we want to use the ssh protocol for encryption of the file transfer
148 | 
149 | *More helpful information and examples using rsync can be found [at this link](https://www.comentum.com/rsync.html)*
150 | 
151 | > Please do not use O2’s login nodes for transferring large datasets (like fastq files) between your computer and O2 with `rsync` or `scp`. Instead, use the transfer nodes `ssh eCommons@transfer.rc.hms.harvard.edu`.
152 | 
153 | 
154 | ## Symbolic Links or "sym links" <a name="symlink"></a>
155 | 
156 | Symbolic links are like shortcuts you may create on your laptop. A sym link makes it appear as if the linked object is actually there. It can be useful to access a file from multiple locations without creating copies and without using much disk space. (Symlinks are only a few bytes in size.)
157 | 
158 | Let's check out an example of a folder with lots of symlinks.
159 | 
160 | 
161 | ```bash
162 | ls -l /n/app/bcbio/tools/bin/
163 | ```
164 | 
165 | Now, let's create a sym link in our home directory for the same `unix_lesson` folder we had originally copied over.
166 | 
167 | ```bash
168 | $ cd
169 | 
170 | $ ln -s /n/groups/hbctraining/unix_lesson/ unix_lesson_sym
171 | 
172 | $ ls -l
173 | ```
174 | 
175 | We recommend that you create something like this for your raw data so it does not accidentally get corrupted or overwritten. 
176 | 
177 | > Note: a “hard” link (just `ln` without the `-s` option) is very different. Always use “ln -s” unless you really know what you’re doing!
178 | 
179 | ## Additional topics
180 | 
181 | If you are interested in learning more about regular expressions (regex) and the tools `awk` and `sed`, you can find more information in the ["extra_bash_tools"](https://hbctraining.github.io/Intro-to-shell-flipped/lessons/extra_bash_tools.html) lesson.
182 | 
183 | 
184 | ***
185 | *This lesson has been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.*
186 | 


--------------------------------------------------------------------------------
/lessons/sam.md:
--------------------------------------------------------------------------------
 1 | ## samtools  extras
 2 | 
 3 | To play around with a few `samtools` commands, first change directories into the directory containing all BAM files.
 4 | 
 5 | `$ cd ~/unix_workshop/rnaseq/results/STAR/bams`
 6 | 
 7 | ### Write only mapped reads to file (filter out unmapped reads)
 8 | 
 9 | `$ samtools view -b -h -F 4 Mov10_oe_1_Aligned.sortedByCoord.out.bam > Mov10_oe_1_Aligned.onlyAligned.bam`
10 | 
11 | ### Create a FASTQ file containing only mapped reads
12 | 
13 | `$ bamtofastq -o Mov10_oe_1_Mapped.fastq --no-unaligned Mov10_oe_1_Aligned.onlyAligned.bam`
14 | 
15 | ### Index BAM file
16 | 
17 | `$ samtools index Mov10_oe_1_Aligned.sortedByCoord.out.bam`
18 | 
19 | ### Extract reads from a specific region of the chromosome
20 | 
 21 | `$ samtools view Mov10_oe_1_Aligned.sortedByCoord.out.bam chr1:200000-500000`
22 | 
23 | ### Randomly subsample half of the reads into a new BAM file
24 | 
25 | `$ samtools view -s 0.5 -b Mov10_oe_1_Aligned.sortedByCoord.out.bam > Mov10_oe_1_subsample.bam`
26 | 
27 | ### Simple stats for alignment file
28 | 
29 | `$ samtools flagstat Mov10_oe_1_Aligned.sortedByCoord.out.bam`
30 | 
31 | ### Visualizing mismatches
32 | 
33 | `$ samtools view -h Mov10_oe_1_Aligned.sortedByCoord.out.bam | head -n 5 | samtools fillmd -e - ~/unix_workshop/rnaseq/reference_data/chr1.fa`
34 | 
35 | 


--------------------------------------------------------------------------------
/lessons/shell_review.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "The Shell"
  3 | author: "Mary Piper, Radhika Khetani, Meeta Mistry, Jihe Liu"
  4 | date: "October 26, 2020"
  5 | ---
  6 | 
  7 | ## Learning Objectives
  8 | - Review shell commands and concepts
  9 | 
 10 | 
 11 | ## Setting up
 12 | 
 13 | This workshop assumes that you have either a) taken our [Introduction to command-line interface workshop](https://hbctraining.github.io/Shell-for-bioinformatics/schedule/) or b) been working on the command-line and are already fluent with shell/bash. **We ask that you complete the exercises below**, to refresh some basic commands that you will be using over the course of the workshop. For each section we have relevant materials linked as a helpful reference. 
 14 | 
 15 | ### Opening up a terminal window
 16 | 
 17 | > *NOTE: This mandatory pre-work does not require you to login to the O2 cluster.*
 18 | 
 19 | On your local laptop, you will need to open up your terminal window. This will be different depending on what kind of operating system (OS) you are working on.
 20 | 
 21 | **With Mac OS**
 22 | 
 23 | Macs have a utility application called "**Terminal**" for performing tasks on the command line (shell), both locally and on remote machines. 
 24 | 
 25 | Please find and open the Terminal utility on your computers using the *Spotlight Search* at the top right hand corner of your screen.
 26 | 
 27 | **With Windows OS**
 28 | 
 29 | By default, there is no built-in Terminal that uses the bash shell on the Windows OS. So, we will be using a downloaded program called "**Git BASH**" which is part of the [Git for Windows](https://git-for-windows.github.io/) tool set. **Git BASH is a shell/bash emulator.** What this means is that it shows you a very similar interface to, and provides you the functionality of, the Terminal utility found on the Mac and Linux Operating systems.
 30 | 
 31 | Please find and open Git BASH.
 32 | 
 33 | > **Tip** - Windows users can use another program called [Putty](http://www.chiark.greenend.org.uk/~sgtatham/putty/download.html) instead of a *bash emulator* to log in to remote machines, but it is a little more involved and has different capabilities. We encourage you to take a look at it, but we will not be covering it in this workshop.
 34 | 
 35 | ### Downloading the example data folder
 36 | 
 37 | We will be exploring the capabilities of the shell by working with some RNA-Seq data. We need to **download the data to our current folder** using the link below. To do so, follow the step-by-step instructions below.
 38 | 
 39 | **1. Find out what folder we are currently inside**. To do this, we can use the 'print working directory' command:
 40 | 
 41 | ```bash
 42 | $ pwd
 43 | ```
 44 | 
 45 | > On a **Mac** your current folder should be something starting with `/Users/`, like `/Users/marypiper/`.
 46 | > 
 47 | > On a **Windows** machine your current folder should be something starting with `/c/Users/marypiper`. To find this in your File explorer try clicking on PC and navigating to that path.
 48 | 
 49 | _Once you have identified which folder you are in, this is where we will be downloading your data._
 50 | 
 51 | **2. Click on the link below, then go to file > download to download the data**. This will automatically download the folder to your downloads folder. If you downloaded the data previously as a part of the Basic Shell workshop, you do not need to download it again unless you have deleted it.
 52 | 
 53 | * Download data by [clicking here](https://www.dropbox.com/s/x66jksdd4jklpdw/unix_lesson.zip?dl=0).
 54 | 
 55 | **3.** Once you have downloaded the file to the correct location, go back to your **terminal window and type the 'list' command**:
 56 | 
 57 | ```bash
 58 | $ ls
 59 | ```
 60 | 
 61 | > `ls` stands for 'list' and it lists the contents of a directory.
 62 | 
 63 | _You should see `unix_lesson.zip` as part of the output to the screen._
 64 | 
 65 | **4.** Finally, to **decompress the folder**:
 66 | 
 67 | * Double click on `unix_lesson.zip` on a Mac. This will automatically inflate the folder.
 68 | * If you are on Windows, press and hold (or right-click) the folder, select Extract All..., and then follow the instructions.
 69 | 
 70 | 
 71 | **5.** Now when you **run the `ls` command** again you should see a folder called `unix_lesson`, which means you are all set with the data download!
 72 | 
 73 | ```bash
 74 | $ ls
 75 | ```
 76 | 
 77 | **6.** Go into the folder for the lesson
 78 | 
 79 | on mac type: 
 80 | ```bash
 81 | $ cd unix_lesson
 82 | ```
 83 | 
 84 | on windows type:
 85 | 
 86 | ```bash
 87 | $ cd unix_lesson/unix_lesson
 88 | ```
 89 | 
 90 | ***
 91 | 
 92 | 
 93 | ## Reviewing shell commands
 94 | 
 95 | ### Shell basics
 96 | We are going to start this review with some basic commands pertaining to navigating around the filesystem. Helpful reference materials are listed below:
 97 | 
 98 | * [Introduction to Shell](https://hbctraining.github.io/Shell-for-bioinformatics/lessons/01_the_filesystem.html)
 99 | * [Wildcards and shortcuts in Shell](https://hbctraining.github.io/Shell-for-bioinformatics/lessons/02_wildcards_shortcuts.html)
100 | * [Examining and creating files](https://hbctraining.github.io/Shell-for-bioinformatics/lessons/03_working_with_files.html)
101 | 
102 | 1. Change directory into the `unix_lesson/` directory.
103 | 2. Take a quick look at the `Mov10_oe_1.subset.fq` file (located in `raw_fastq` directory) using `less` from `unix_lesson/`, without changing directories.
104 | 3. Use a shortcut to move out of the directory to the parent of `unix_lesson/`.
105 | 4. Change directories into the `raw_fastq/` folder with a single command.
106 | 5. What does the `~` in the command prompt mean?
107 | 6. What is the full path to the `unix_lesson` directory?
108 | 7. List all the files in the `raw_fastq` directory.
109 | 8. Modify the above command using the `*` wildcard to only list those files that have "oe" in their names.
110 | 10. How many and which commands have you run so far?
111 | 
112 | ### Searching and redirection
113 | Next, we will search our files for specific patterns and redirect the results to file. Helpful reference materials are listed below:
114 | 
115 | * [Searching and redirection](https://hbctraining.github.io/Shell-for-bioinformatics/lessons/04_searching_files.html)
116 | 
117 | 12. Create a new directory called `shell_review/` within the `unix_lesson/` directory.
118 | 13. Search the file `unix_lesson/reference_data/chr1-hg19_genes.gtf` for lines containing the string "MOV10". Save the output in the `shell_review/` directory with a new name - "Mov10_hg19.gtf".
119 | 14. Use `vim` to open the newly created file `unix_lesson/shell_review/Mov10_hg19.gtf` and add a comment at the top specifying how this file was created and the source of the content. Save the modified file and quit `vim`.
120 | 15. In the new file "Mov10_hg19.gtf", how many lines contain the word "exon"?
121 | 
122 | ### Loops and shell scripts
123 | 
124 | * [Shell scripts and variables in Shell](https://hbctraining.github.io/Shell-for-bioinformatics/lessons/05_shell-scripts_variable.html)
125 | * [Loops and automation](https://hbctraining.github.io/Shell-for-bioinformatics/lessons/06_loops_and_automation.html)
126 | 
127 | 16. Use the `for` loop to iterate over each FASTQ file in `raw_fastq` and do the following:
128 |       * Print the name of the current file
129 |       * Generate a prefix to use for naming our output files, and store it inside a variable called `sample`.
130 |       * Dump out the first 40 lines into a new file that will be saved in `shell_review`
131 | 17. Place the above `for` loop into a shell script using `vim` and run it.
132 | 
133 | ### Permissions
134 | 
135 | * [Interpreting the permissions string](https://hbctraining.github.io/Shell-for-bioinformatics/lessons/07_permissions_and_environment_variables.html#permissions)
136 | 
137 | There is a folder in the HBC training shared space on the O2 cluster called `intro_rnaseq_hpc`. Below we have displayed a long listing of its contents. 
138 | 
139 | ``` bash
140 | total 714
141 | drwxrwsr-x  3 mm573 hbctraining 1111 Aug 22  2017 bam_STAR
142 | drwxrwsr-x  8 mp298 hbctraining 1914 May 21  2018 bam_STAR38
143 | drwxrwsr-x  2 mm573 hbctraining  522 Oct  6  2015 bam_tophat
144 | drwxrwsr-x  2 mm573 hbctraining  240 Oct 19  2015 counts
145 | drwxrwsr-x  2 mm573 hbctraining  260 Oct 19  2015 counts_STAR
146 | -rw-rw-r--  1 mm573 hbctraining 2416 Aug 22  2017 DE_script.R
147 | -rw-rw-r--  1 mm573 hbctraining 2064 Mar 28  2018 DESeq2_script.R
148 | drwxrwsr-x  2 mm573 hbctraining  705 Oct  6  2015 fastqc
149 | drwxrwsr-x  2 mm573 hbctraining  272 Jan 31  2018 full_dataset
150 | -rw-rw-r--  1 mm573 hbctraining  216 Nov 10  2015 install_libraries.R
151 | -rw-rw-r--  1 mm573 hbctraining  117 Oct 19  2015 install_libraries.sh
152 | drwxrwsr-x 78 mm573 hbctraining 1969 Aug 22  2017 R-3.3.1
153 | drwxrwsr-x  3 mp298 hbctraining  234 Feb 27  2019 reference_data_ensembl38
154 | drwxrwsr-x  2 mm573 hbctraining  555 Oct  5  2015 reference_STAR
155 | drwxrwsr-x  2 rsk27 hbctraining  260 Aug 22  2017 salmon.ensembl37.idx
156 | drwxrwsr-x  2 mm573 hbctraining  306 Oct  6  2015 trimmed_fastq
157 | 
158 | ```
159 | 
160 | 18. How many owners have files in this folder?
161 | 19. How many groups?
162 | 20. Are there any executable *files* in this folder?
163 | 21. What kind of access does the user `mm573` have to the `full_dataset/` directory?
164 | 22. You are considered as "other" or everyone else on this system (i.e. you are not part of the group `hbctraining`). What command would allow the user `mm573` to take away your ability to look inside the `full_dataset/` directory?
165 | 
166 | 
167 | ### Environment variables
168 | 
169 | * [Understanding environment variables](https://hbctraining.github.io/Shell-for-bioinformatics/lessons/07_permissions_and_environment_variables.html#environment-variables)
170 | 
171 | 23. Display the contents of the `$HOME` variable on your computer.
172 | 24. Use the `which` command to check where the executable file for the `pwd` command lives in the directory structure.
173 | 25. How does shell know where to find the executable file for the `pwd` command?
174 | 26. Display the contents of the variable that stores the various paths to folders containing executable command files.
175 | 
176 | 
177 | 
178 | ### Review your answers
179 | * [Answer key](shell_review_answer_key.md)
180 | 
181 | ****
182 | 
183 | *This lesson has been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.*
184 | 


--------------------------------------------------------------------------------
/lessons/shell_review_answer_key.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "The Shell Review Answer Key"
  3 | author: "Mary Piper, Radhika Khetani, Meeta Mistry, Jihe Liu"
  4 | date: "October 26, 2020"
  5 | ---
  6 | 
  7 | ## Learning Objectives
  8 | - Review shell commands and concepts
  9 | 
 10 | 
 11 | ## Setting up
 12 | 
 13 | The Introduction to RNA-seq workshop assumes that you have either a) taken our [Introduction to command-line interface workshop](https://hbctraining.github.io/Intro-to-shell-flipped/schedule/) or b) been working on the command-line and are already fluent with shell/bash. **We ask that you complete the exercises below**, to refresh some basic commands that you will be using over the course of the workshop. For each section we have relevant materials linked as a helpful reference. 
 14 | 
 15 | ### Opening up a terminal window
 16 | 
 17 | > *NOTE: This mandatory pre-work does not require you to login to the O2 cluster.*
 18 | 
 19 | On your local laptop, you will need to open up your terminal window. This will be different depending on what kind of operating system (OS) you are working on.
 20 | 
 21 | **With Mac OS**
 22 | 
 23 | Macs have a utility application called "**Terminal**" for performing tasks on the command line (shell), both locally and on remote machines. 
 24 | 
 25 | Please find and open the Terminal utility on your computers using the *Spotlight Search* at the top right hand corner of your screen.
 26 | 
 27 | **With Windows OS**
 28 | 
 29 | By default, there is no built-in Terminal that uses the bash shell on the Windows OS. So, we will be using a downloaded program called "**Git BASH**" which is part of the [Git for Windows](https://git-for-windows.github.io/) tool set. **Git BASH is a shell/bash emulator.** What this means is that it shows you a very similar interface to, and provides you the functionality of, the Terminal utility found on the Mac and Linux Operating systems.
 30 | 
 31 | Please find and open Git BASH.
 32 | 
 33 | > **Tip** - Windows users can use another program called [Putty](http://www.chiark.greenend.org.uk/~sgtatham/putty/download.html) instead of a *bash emulator* to log in to remote machines, but it is a little more involved and has different capabilities. We encourage you to take a look at it, but we will not be covering it in this workshop.
 34 | 
 35 | ### Downloading the example data folder
 36 | 
 37 | The data you will be working with can be downloaded using the link below. Clicking on the link will automatically place a file called `unix_lesson.zip` in the `Downloads` folder on your computer.
 38 | 
 39 | - [Introduction to Shell: Dataset](https://github.com/hbctraining/Training-modules/blob/master/Intro_shell/data/unix_lesson.zip?raw=true)
 40 | 
 41 | Now, in your terminal window, change directories into your `Downloads` folder and check that the file is listed there:
 42 | 
 43 | ```bash
 44 | $ cd ~/Downloads
 45 | $ ls -l unix_lesson.zip
 46 | ```
 47 | 
 48 | To decompress the file into a folder called `unix_lesson` we use the `unzip` command:
 49 | 
 50 | ```bash
 51 | $ unzip unix_lesson.zip
 52 | ```
 53 | 
 54 | Check to see that you have the folder `unix_lesson` before proceeding.
 55 | 
 56 | ```bash
 57 | $ ls -l unix_lesson
 58 | ```
 59 | 
 60 | ## Reviewing shell commands
 61 | 
 62 | ### Shell basics
 63 | We are going to start this review with some basic commands pertaining to navigating around the filesystem. Helpful reference materials are listed below:
 64 | 
 65 | * [Introduction to Shell](https://hbctraining.github.io/Intro-to-shell-flipped/lessons/01_the_filesystem.html)
 66 | * [Wildcards and shortcuts in Shell](https://hbctraining.github.io/Intro-to-shell-flipped/lessons/02_wildcards_shortcuts.html)
 67 | * [Examining and creating files](https://hbctraining.github.io/Intro-to-shell-flipped/lessons/03_working_with_files.html)
 68 | 
 69 | 1. Change directory into the `unix_lesson/` directory.
 70 | ```bash
 71 | $ cd unix_lesson
 72 | ```
 73 | 
 74 | 2. Take a quick look at the `Mov10_oe_1.subset.fq` file (located in `raw_fastq` directory) using `less` from `unix_lesson/`, without changing directories.
 75 | ```bash
 76 | $ less raw_fastq/Mov10_oe_1.subset.fq
 77 | ```
 78 | 
 79 | 3. Use a shortcut to move out of the directory to the parent of `unix_lesson/`.
 80 | ```bash
 81 | $ cd ..
 82 | ```
 83 | 
 84 | 4. Change directories into the `raw_fastq/` folder with a single command.
 85 | ```bash
 86 | $ cd unix_lesson/raw_fastq/
 87 | ```
 88 | 
 89 | 5. What does the `~` in the command prompt mean?
 90 | Answer: `~` means home directory.
 91 | 
 92 | 6. What is the full path to the `unix_lesson` directory?
 93 | Answer: `/Users/your_username/Downloads/unix_lesson` (**the result will vary based on your computer's file system**)
 94 | 
 95 | 7. List all the files in the `raw_fastq` directory.
 96 | ```bash
 97 | # (option 1) You can navigate to the `raw_fastq` directory and say
 98 | $ ls -l 
 99 | 
100 | # (option 2) You can identify your location and give the full or relative path to raw_fastq
101 | ```
102 | 
103 | 8. Modify the above command using the `*` wildcard to only list those files that have "oe" in their names.
104 | ```bash
105 | # (option 1) You can navigate to the `raw_fastq` directory and say
106 | $ ls -l *oe* 
107 | 
108 | # (option 2) You can identify your location and give the full or relative path to raw_fastq
109 | ```
110 | 
111 | 10. How many and which commands have you run so far?
112 | ```bash
113 | $ history
114 | ```
115 | Answer: Result will vary based on your activity.
116 | 
117 | ### Searching and redirection
118 | Next, we will search our files for specific patterns and redirect the results to file. Helpful reference materials are listed below:
119 | 
120 | * [Searching and redirection](https://hbctraining.github.io/Intro-to-shell-flipped/lessons/04_searching_files.html)
121 | 
122 | 12. Create a new directory called `shell_review/` within the `unix_lesson/` directory.
123 | ```bash
124 | # First make sure that you navigate to the `unix_lesson/` directory
125 | $ mkdir shell_review
126 | ```
127 | 13. Search the file `unix_lesson/reference_data/chr1-hg19_genes.gtf` for lines containing the string "MOV10". Save the output in the `shell_review/` directory with a new name - "Mov10_hg19.gtf".
128 | ```bash
129 | # First make sure that you navigate to the `unix_lesson/` directory
130 | $ grep MOV10 reference_data/chr1-hg19_genes.gtf > shell_review/Mov10_hg19.gtf
131 | ```
132 | 14. Use `vim` to open the newly created file `unix_lesson/shell_review/Mov10_hg19.gtf` and add a comment at the top specifying how this file was created and the source of the content. Save the modified file and quit `vim`.
133 | 
134 | Answer: 
135 | * Open file - `vim shell_review/Mov10_hg19.gtf`
136 | * Edit file - `i`- Add text using `#` to indicate comment
137 | * Exit edit mode - `esc`
138 | * Save and quit - `:wq`.
139 | 
140 | 15. In the new file "Mov10_hg19.gtf", how many lines contain the word "exon"?
141 | ```bash
142 | # First make sure that you navigate to the `unix_lesson/` directory
143 | $ grep exon shell_review/Mov10_hg19.gtf | wc -l
144 | ```
145 | Answer: 42
146 | 
147 | ### Loops and shell scripts
148 | 
149 | * [Shell scripts and variables in Shell](https://hbctraining.github.io/Intro-to-shell-flipped/lessons/05_shell-scripts_variable.html)
150 | * [Loops and automation](https://hbctraining.github.io/Intro-to-shell-flipped/lessons/06_loops_and_automation.html)
151 | 
152 | 16. Use the `for` loop to iterate over each FASTQ file in `raw_fastq` and do the following:
153 |       * Print the name of the current file
154 |       * Generate a prefix to use for naming our output files, and store it inside a variable called `sample`.
155 |       * Dump out the first 40 lines into a new file that will be saved in `shell_review`
156 |       
157 | ```bash
158 | # First make sure that you navigate to the `raw_fastq/` directory
159 | $ for file in *fq
160 | > do 
161 | > echo $file
162 | > sample=`basename $file .subset.fq`
163 | > head -n 40 $file > ../shell_review/${sample}_first40.fq
164 | > done
165 | ```
166 | 17. Place the above `for` loop into a shell script using `vim` and run it.
167 | 
168 | Answer: Navigate to the `raw_fastq/` directory, and create a script `vim generate_first40.sh`
169 | 
170 | ```bash
171 | #!/bin/bash 
172 | for file in *fq
173 | do 
174 | echo $file
175 | sample=`basename $file .subset.fq`
176 | head -n 40 $file > ../shell_review/${sample}_first40.fq
177 | done
178 | ```
179 | Run the script with the following command:
180 | 
181 | ```bash
182 | $ sh generate_first40.sh
183 | ```
184 | 
185 | ### Permissions
186 | 
187 | * [Interpreting the permissions string](https://hbctraining.github.io/Intro-to-shell-flipped/lessons/07_permissions_and_environment_variables.html#permissions)
188 | 
189 | There is a folder in the HBC training shared space on the O2 cluster called `intro_rnaseq_hpc`. Below we have displayed a long listing of its contents. 
190 | 
191 | ``` bash
192 | total 714
193 | drwxrwsr-x  3 mm573 hbctraining 1111 Aug 22  2017 bam_STAR
194 | drwxrwsr-x  8 mp298 hbctraining 1914 May 21  2018 bam_STAR38
195 | drwxrwsr-x  2 mm573 hbctraining  522 Oct  6  2015 bam_tophat
196 | drwxrwsr-x  2 mm573 hbctraining  240 Oct 19  2015 counts
197 | drwxrwsr-x  2 mm573 hbctraining  260 Oct 19  2015 counts_STAR
198 | -rw-rw-r--  1 mm573 hbctraining 2416 Aug 22  2017 DE_script.R
199 | -rw-rw-r--  1 mm573 hbctraining 2064 Mar 28  2018 DESeq2_script.R
200 | drwxrwsr-x  2 mm573 hbctraining  705 Oct  6  2015 fastqc
201 | drwxrwsr-x  2 mm573 hbctraining  272 Jan 31  2018 full_dataset
202 | -rw-rw-r--  1 mm573 hbctraining  216 Nov 10  2015 install_libraries.R
203 | -rw-rw-r--  1 mm573 hbctraining  117 Oct 19  2015 install_libraries.sh
204 | drwxrwsr-x 78 mm573 hbctraining 1969 Aug 22  2017 R-3.3.1
205 | drwxrwsr-x  3 mp298 hbctraining  234 Feb 27  2019 reference_data_ensembl38
206 | drwxrwsr-x  2 mm573 hbctraining  555 Oct  5  2015 reference_STAR
207 | drwxrwsr-x  2 rsk27 hbctraining  260 Aug 22  2017 salmon.ensembl37.idx
208 | drwxrwsr-x  2 mm573 hbctraining  306 Oct  6  2015 trimmed_fastq
209 | 
210 | ```
211 | 
212 | 18. How many owners have files in this folder?
213 | 
214 | Answer: 3
215 | 
216 | 19. How many groups?
217 | 
218 | Answer: 1
219 | 
220 | 20. Are there any executable *files* in this folder?
221 | 
222 | Answer: No
223 | 
224 | 21. What kind of access does the user `mm573` have to the `full_dataset/` directory?
225 | 
226 | Answer: r(read), w(write/edit), and x(execute).
227 | 
228 | 22. You are considered as "other" or everyone else on this system (i.e. you are not part of the group `hbctraining`). What command would allow the user `mm573` to take away your ability to look inside the `full_dataset/` directory?
229 | 
230 | Answer: chmod o-r full_dataset/ 
231 | 
232 | ### Environment variables
233 | 
234 | * [Understanding environment variables](https://hbctraining.github.io/Intro-to-shell-flipped/lessons/07_permissions_and_environment_variables.html#environment-variables)
235 | 
236 | 23. Display the contents of the `$HOME` variable on your computer.
237 | ```bash
238 | $ echo $HOME
239 | ```
240 | 24. Use the `which` command to check where the executable file for the `pwd` command lives in the directory structure.
241 | ```bash
242 | $ which pwd
243 | ```
244 | 25. How does shell know where to find the executable file for the `pwd` command?
245 | 
246 | Answer: the shell searches through each path in $PATH until it finds an executable file for the `pwd` command.
247 | 
248 | 26. Display the contents of the variable that stores the various paths to folders containing executable command files.
249 | 
250 | ```bash
251 | $ echo $PATH
252 | ```
253 | 
254 | ****
255 | 
256 | *This lesson has been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.*
257 | 


--------------------------------------------------------------------------------
/schedule/README.md:
--------------------------------------------------------------------------------
  1 | # Introduction to bulk RNA-seq: From reads to count matrix
  2 | 
  3 | > **Pre-requisite for this workshop:** The *Basic Data Skills* [Shell for Bioinformatics](https://hbctraining.github.io/Shell-for-bioinformatics/) workshop or a working knowledge of the command line and cluster computing.
  4 | 
  5 | ## Pre-reading
  6 | 
  7 | * [Shell basics review](../lessons/shell_review.md)
  8 | * [Best Practices in Research Data Management (RDM)](../lessons/04a_data_organization.md)
  9 | * [Introduction to RNA-seq](../lessons/01_intro-to-RNAseq.md)
 10 | 
 11 | ## Day 1
 12 | 
 13 | | Time |  Topic  | Instructor |
 14 | |:-----------:|:----------:|:--------:|
 15 | | 09:30 - 09:45 | [Workshop Introduction](../lectures/workshop_intro_slides.pdf) | Will |
 16 | | 09:45 - 10:25 | [Working in an HPC environment - Review](../lessons/03_working_on_HPC.md) | Upen |
 17 | | 10:25 - 11:05 | [Project Organization (using Data Management best practices)](../lessons/04b_data_organization.md) | Will |
 18 | | 11:05 - 11:45 | [Quality Control of Sequence Data: Running FASTQC](../lessons/05_qc_running_fastqc_interactively.md) | Upen |
 19 | | 11:45 - 12:00 | Overview of self-learning materials and homework submission | Will |
 20 | 
 21 | ### Before the next class:
 22 | 
 23 | 1. Please **study the contents** and **work through all the code** within the following lessons:
 24 | 
 25 |  * [Experimental design considerations](../lessons/02_experimental_planning_considerations.md)
 26 |  * [Quality Control of Sequence Data: Running FASTQC on multiple samples](../lessons/06_qc_running_fastqc_sbatch.md)
 27 |  * [Quality Control of Sequence Data: Evaluating FASTQC reports](../lessons/07_qc_fastqc_assessment.md)
 28 | 
 29 |     > **NOTE:** To run through the code above, you will need to be **logged into O2** and **working on a compute node** (i.e. your command prompt should have the word `compute` in it).
 30 |     > 1. Log in using `ssh rc_trainingXX@o2.hms.harvard.edu` and enter your password (replace the "XX" in the username with the number you were [assigned in class](https://docs.google.com/spreadsheets/d/1kBlYowhjjHJC9ZovmbBULmbqozKpprM17vZ2wPlhNg0/edit#gid=0)). 
 31 |     > 2. Once you are on the login node, use `srun --pty -p interactive -t 0-2:30 --mem 1G /bin/bash` to get on a compute node or as specified in the lesson.
 32 |     > 3. Proceed only once your command prompt has the word `compute` in it.
 33 |     > 4. If you log out between lessons (using the `exit` command twice), please follow points 1. and 2. above to log back in and get on a compute node when you restart with the self learning.
 34 | 
 35 | 2. **Complete the exercises**:
 36 |    * Each lesson above contains exercises; please go through each of them.
 37 |    * Add your answers to the questions to [Google forms](https://docs.google.com/forms/d/e/1FAIpQLSdxdSM4528uYTWT7k5c8gYAuCUaTqRkUSI88eUmKg7qyQZZAQ/viewform?usp=sf_link) the **day before the next class**.
 38 |    
 39 | ### Questions?
 40 | * ***If you get stuck due to an error*** while running code in the lesson, [email us](mailto:hbctraining@hsph.harvard.edu).
 41 | 
 42 | ***
 43 | 
 44 | ## Day 2
 45 | 
 46 | | Time |  Topic  | Instructor |
 47 | |:-----------:|:----------:|:--------:|
 48 | | 09:30 - 10:30 | Self-learning lessons review | All |
 49 | | 10:30 - 11:10 | [Expression quantification: Theory and Tools](../lectures/expression_quantification.pdf) | Will |
 50 | | 11:10 - 11:50 | [Quantifying expression using alignment-free methods (Salmon)](../lessons/08_quasi_alignment_salmon.md) | Upen |
 51 | | 11:50 - 12:00 | [Review of workflow](../lectures/workflow_overview.pdf) | Upen |
 52 | 
 53 | ### Before the next class:
 54 | 
 55 | 1. Please **study the contents** and **work through all the code** within the following lessons:
 56 | 
 57 |  * [Quantifying expression using alignment-free methods (Salmon on multiple samples)](../lessons/09_quasi_alignment_salmon_sbatch.md)
 58 |       <details>
 59 |        <summary><i>Click here for a preview of this lesson</i></summary>
 60 |          <br>Now that we know how to run the quantification of one sample with Salmon, this lesson will guide you to run multiple samples by creating a job submission script<br><br>
 61 |        </details>
 62 |  * [QC with Alignment Data](../lessons/10_QC_Qualimap.md)
 63 |       <details>
 64 |        <summary><i>Click here for a preview of this lesson</i></summary>
 65 |          <br>Besides transcript-level quantification, we also want to understand the quality of the mapping, which is not provided in Salmon output. <br><br>This lesson will cover:<br>
 66 |              - Aligning the reads with an aligner, STAR<br>
 67 |              - Assessing QC metrics among samples<br><br>
 68 |        </details>
 69 |  * [Documenting Steps in the Workflow with MultiQC](../lessons/11_multiQC.md)
 70 |       <details>
 71 |        <summary><i>Click here for a preview of this lesson</i></summary>
 72 |          <br>It would be great to have a summary document of all QC results from the previous analysis. <br><br>This lesson will cover:<br>
 73 |              - Generating such a summary report with multiQC<br>
 74 |              - Generating alignment metric with Qualimap<br><br>
 75 |        </details>
 76 | 
 77 |      > **NOTE:** To run through the code above, you will need to be **logged into O2** and **working on a compute node** (i.e. your command prompt should have the word `compute` in it).
 78 |      > 1. Log in using `ssh rc_trainingXX@o2.hms.harvard.edu` and enter your password (replace the "XX" in the username with the number you were assigned in class). 
 79 |      > 2. Once you are on the login node, use `srun --pty -p interactive -t 0-2:30 --mem 8G /bin/bash` to get on a compute node or as specified in the lesson.
 80 |      > 3. Proceed only once your command prompt has the word `compute` in it.
 81 |      > 4. If you log out between lessons (using the `exit` command twice), please follow points 1. and 2. above to log back in and get on a compute node when you restart with the self learning.
 82 | 
 83 | 2. **Complete the exercises**:
 84 |    * Each lesson above contains exercises; please go through each of them.
 85 |    * Add your answers to the questions to [Google forms](https://docs.google.com/forms/d/e/1FAIpQLScxaj3IIO4Bx7FCRw87cCeuTPQyhD_7WR2QU638y8IZDv5r1A/viewform?usp=sf_link) the **day before the next class**.
 86 |    
 87 | ### Questions?
 88 | * ***If you get stuck due to an error*** while running code in the lesson, [email us](mailto:hbctraining@hsph.harvard.edu).
 89 | 
 90 | ***
 91 | 
 92 | ## Day 3
 93 | 
 94 | | Time |  Topic  | Instructor |
 95 | |:-----------:|:----------:|:--------:|
 96 | | 09:30 - 10:10 | Self-learning lessons review | All |
 97 | | 10:10 - 11:10 | [Automating the RNA-seq workflow](../lessons/12_automating_workflow.md) | Will |
 98 | | 11:10 - 11:45 | [Troubleshooting RNA-seq Data Analysis](../lectures/RNA-seq_troubleshooting.pdf)| Upen |
 99 | | 11:45 - 12:00 | [Wrap up](../lectures/workshop_wrapup_slides.pdf) | Will |
100 | 
101 | ***
102 | 
103 | * Downloadable Answer Keys (Day 2 exercises): 
104 |   * [Experimental design (one possible solution)](https://www.dropbox.com/scl/fi/vk6g9qvvosgmjjonoqint/exp_design_table.xlsx?rlkey=rbxkeln9mm0lxf4kdjbrqsidt&st=6sf562u3&dl=0)
105 |   * [sbatch script](https://www.dropbox.com/scl/fi/3y7oa5i1eub7dzajfpko7/mov10_fastqc.run?rlkey=4eii6tc6nrludbjagcdgs1qxi&st=ipmbrx9y&dl=0)
106 |   * [.out file](https://www.dropbox.com/scl/fi/m0f1ux4522sw2flt73aje/22914006.out?rlkey=sizy0vkm0r5fz14uyswrtdeew&st=sk7gh4i5&dl=0)
107 |   * [.err file](https://www.dropbox.com/scl/fi/iye10ysh780danfo6r6v6/22914006.err?rlkey=maeny1p52dmio5ovli8c5ipss&st=lu0iuvcw&dl=0)
108 | 
109 | * Downloadable Answer Keys (Day 3 exercises): 
110 |   * [sbatch script to run salmon for all samples](../answer_key/salmon_all_samples.sbatch)
111 | 
112 | * [Automation Script](../scripts/rnaseq_analysis_on_input_file.sh)
113 | 
114 | ***
115 | 
116 | ## Resources
117 | * [Getting an O2 account](https://harvardmed.atlassian.net/wiki/spaces/O2/pages/1918304257/How+to+request+an+O2+account)
118 | * [Video about statistics behind salmon quantification](https://www.youtube.com/watch?v=TMLIxwDP7sk)
119 | * Advanced bash for working on O2:
120 |   * [Creating shortcuts or aliases](https://hbctraining.github.io/In-depth-NGS-Data-Analysis-Course/sessionVI/lessons/more_bash.html#alias)
121 |   * [Copying files from other remote locations to O2](https://hbctraining.github.io/In-depth-NGS-Data-Analysis-Course/sessionVI/lessons/more_bash.html#rsync)
122 |   * [Creating symbolic links](https://hbctraining.github.io/In-depth-NGS-Data-Analysis-Course/sessionVI/lessons/more_bash.html#symlink)
123 | * [Obtaining reference genomes or transcriptomes](https://hbctraining.github.io/Accessing_public_genomic_data/lessons/accessing_genome_reference_data.html)
124 | * Youtube videos
125 |     * [Hash tables - Paul Programming](https://www.youtube.com/watch?v=MfhjkfocRR0&ab_channel=PaulProgramming)
126 |     * [Suffix arrays - William Fiset](https://www.youtube.com/watch?v=zqKlL3ZpTqs)
127 | ***
128 | 
129 | ## Building on this workshop
130 | * [Introduction to R workshop materials](https://hbctraining.github.io/Intro-to-R-flipped/#lessons)
131 | * [Introduction to Differential Gene Expression analysis workshop materials](https://hbctraining.github.io/Intro-to-DGE/#lessons)
132 | 
133 | ***
134 | *These materials have been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.*
135 | 


--------------------------------------------------------------------------------
/schedule/links-to-lessons.md:
--------------------------------------------------------------------------------
  1 | # Introduction to bulk RNA-seq: From reads to count matrix
  2 | 
  3 | ## Learning Objectives
  4 | 
  5 | - Understand the necessity for, and use of, the command line interface (bash) and HPC for analyzing high-throughput sequencing data.
  6 | - Understand best practices for designing an RNA-seq experiment and analyzing the resulting data.
  7 | 
  8 | ## Installations
  9 | 
 10 | ***All:***
 11 | 
 12 | * [FileZilla Client](https://filezilla-project.org/download.php?type=client) (make sure you get 'FileZilla Client')
 13 | 
 14 | ***Mac users:***
 15 | 
 16 | * Plain text editor like [Sublime text](http://www.sublimetext.com/) or similar
 17 | 
 18 | ***Windows users:***
 19 | 
 20 | * [GitBash](https://git-scm.com/download/win)
 21 | * Plain text editor like [Notepad++](http://notepad-plus-plus.org/) or similar
 22 | 
 23 | ## Notes
 24 | * These materials focus on the use of local computational resources at Harvard, which are **only accessible to Harvard affiliates**
 25 | * Non-Harvard folks can [download the data](https://www.dropbox.com/s/t3lkyz1pz021222/unix_lesson.tar.gz?dl=0) and set up to work on their local clusters (with the help of local system administrators)
 26 | 
 27 | ### Instructions for Harvard researchers with access to HMS-RC's O2 cluster
 28 | 
 29 | To run through the code in the lessons below, you will need to be **logged into O2** and **working on a compute node** (i.e. your command prompt should have the word `compute` in it).
 30 | 
 31 | 1. Log in using `ssh ecommonsID@o2.hms.harvard.edu` and enter your password.
 32 | 2. Once you are on the login node, use `srun --pty -p interactive -t 0-2:30 --mem 1G /bin/bash` to get on a compute node or as specified in the lesson.
 33 | 3. Proceed only once your command prompt has the word `compute` in it.
 34 | 4. If you log out between lessons (using the `exit` command twice), please follow points 1. and 2. above to log back in and get on a compute node when you restart with the self learning.
 35 | 
 36 | ## Lessons
 37 | 
 38 | ### Part I
 39 | 1. [Introduction to RNA-seq](../lessons/01_intro-to-RNAseq.md)
 40 | 1. [Shell basics review](../lessons/shell_review.md)
 41 | 1. [Working in an HPC environment - Review](../lessons/03_working_on_HPC.md)
 42 | 1. [Best Practices in Research Data Management (RDM)](../lessons/04a_data_organization.md)
 43 | 1. [Project Organization (using Data Management best practices)](../lessons/04b_data_organization.md)
 44 |      
 45 | ***
 46 | 
 47 | ### Part II
 48 | 1. [Quality Control of Sequence Data: Running FASTQC](../lessons/05_qc_running_fastqc_interactively.md)
 49 | 1. [Experimental design considerations](../lessons/02_experimental_planning_considerations.md)
 50 | 1. [Quality Control of Sequence Data: Running FASTQC on multiple samples](../lessons/06_qc_running_fastqc_sbatch.md)
 51 | 1. [Quality Control of Sequence Data: Evaluating FASTQC reports](../lessons/07_qc_fastqc_assessment.md)
 52 | 
 53 | ***
 54 | 
 55 | ### Part III 
 56 | 1. [Sequence Alignment Theory](../lectures/alignment_quantification.pdf)
 57 | 1. [Quantifying expression using alignment-free methods (Salmon on multiple samples)](../lessons/09_quasi_alignment_salmon_sbatch.md)
 58 | 
 59 | ***
 60 | 
 61 | ### Part IV
 62 | 
 63 | 1. [QC with Alignment Data](../lessons/10_QC_Qualimap.md)
 64 | 1. [Documenting Steps in the Workflow with MultiQC](../lessons/11_multiQC.md)
 65 | 1. [Troubleshooting RNA-seq Data Analysis](../lectures/RNA-seq_troubleshooting.pdf)
 66 | 
 67 | ***
 68 | 
 69 | ### Part V
 70 | 
 71 | 1. [Automating the RNA-seq workflow](../lessons/12_automating_workflow.md)
 72 | 
 73 | ***
 74 | 
 75 | ### Answer Keys
 76 | 
 77 | * [Experimental design (one possible solution)](https://www.dropbox.com/s/524mevuyba34l5b/exp_design_table.xlsx?dl=1)
 78 | * [FASTQC sbatch script](https://www.dropbox.com/s/9wdyhfqpic05l6p/mov10_fastqc.run?dl=1)
 79 | * [FASTQC sbatch script .out file](https://www.dropbox.com/s/l7puf8oahtbwmpk/22914006.out?dl=1)
 80 | * [FASTQC sbatch script .err file](https://www.dropbox.com/s/8a1g6o9t2kxit30/22914006.err?dl=1)
 81 | * [sbatch script to run salmon for all samples](../answer_key/salmon_all_samples.sbatch)
 82 | * [Automation Script](../scripts/rnaseq_analysis_on_input_file.sh)
 83 | 
 84 | ***
 85 |    
 86 | ## Building on this workshop
 87 | * [Introduction to R workshop materials](https://hbctraining.github.io/Intro-to-R-flipped/schedule/links-to-lessons.html)
 88 | * [Bulk RNA-seq Part II (differential gene expression analysis) materials](https://hbctraining.github.io/DGE_workshop_salmon_online/schedule/links-to-lessons.html)
 89 | 
 90 | ***
 91 | 
 92 | ## Resources
 93 | * [Video about statistics behind salmon quantification](https://www.youtube.com/watch?v=TMLIxwDP7sk)
 94 | * Advanced bash for working on O2:
 95 |   * [Creating shortcuts or aliases](https://hbctraining.github.io/In-depth-NGS-Data-Analysis-Course/sessionVI/lessons/more_bash.html#alias)
 96 |   * [Copying files from other remote locations to O2](https://hbctraining.github.io/In-depth-NGS-Data-Analysis-Course/sessionVI/lessons/more_bash.html#rsync)
 97 |   * [Creating symbolic links](https://hbctraining.github.io/In-depth-NGS-Data-Analysis-Course/sessionVI/lessons/more_bash.html#symlink)
 98 | * [Obtaining reference genomes or transcriptomes](https://hbctraining.github.io/Accessing_public_genomic_data/lessons/accessing_genome_reference_data.html)
 99 | 
100 | ***
101 | *These materials have been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.*
102 | 


--------------------------------------------------------------------------------
/scripts/PE-rnaseq_analysis_on_allfiles_for-slurm.sh:
--------------------------------------------------------------------------------
 1 | #! /bin/bash
 2 | 
 3 | 
 4 | # This script is for PAIRED-END data which typically have the following naming convention: 
 5 | # - {sample}_R1.fastq and {sample}_R2.fastq 
 6 | # OR
 7 | # - {sample}_1.fastq and {sample}_2.fastq
 8 | 
 9 | # NOTE to change the extension to match the naming convention of your PE files.
10 | 
11 | for fq in ~/unix_lesson/rnaseq/raw_data/*_R1.fq
12 | do
13 | 
14 | sbatch -p short -t 0-2:00 -c 6 --job-name rnaseq-workflow --wrap="sh ~/unix_lesson/rnaseq/scripts/PE-rnaseq_analysis_on_input_file.sh $fq"
15 | sleep 1	# wait 1 second between each job submission
16 |   
17 | done
18 | 


--------------------------------------------------------------------------------
/scripts/PE-rnaseq_analysis_on_input_file.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash/
 2 | 
 3 | # This script is for PAIRED-END data which typically have the following naming convention: 
 4 | # - {sample}_R1.fastq and {sample}_R2.fastq 
 5 | # OR
 6 | # - {sample}_1.fastq and {sample}_2.fastq
 7 | 
 8 | # You will need to identify which format matches your data (i.e .fastq, .fq, .fq.gz) and **modify line 22 and line 26 accordingly!**
 9 | 
10 | 
11 | # USAGE: sh PE-rnaseq_analysis_on_input_file.sh <path/to/sample_file/sample_R1.fastq>
12 | # The script takes as input the R1 fastq file ONLY. It will runs FastQC, STAR, Qualimap and Salmon.
13 | 
14 | 
15 | # initialize a variable with an intuitive name to store the name of the input fastq file for Read1
16 | fq1=$1
17 | 
18 | # grab the path information to use for loading the Read2 fastq file
19 | path=`temp=$( realpath "$fq1" ) && dirname "$temp"`
20 | 
21 | # grab base of filename 
22 | samplename=`basename $fq _R1.fastq`
23 | echo "Starting analysis of sample $samplename"
24 | 
25 | # create a variable to store the read 2 file
26 | fq2=${path}/${samplename}_R2.fastq
27 | 
28 | # change directories to /n/scratch3/ so that all the analysis is stored there.
29 | cd /n/scratch3/users/r/$USER/rnaseq_hbc-workshop/
30 | 
31 | # specify the number of cores to use
32 | cores=6
33 | 
34 | # directory with the genome and transcriptome index files + name of the gene annotation file
35 | genome=/n/groups/hbctraining/intro_rnaseq_hpc/reference_data_ensembl38/ensembl38_STAR_index
36 | transcriptome=/n/groups/hbctraining/rna-seq_2019_02/reference_data/salmon_index
37 | gtf=/n/groups/hbctraining/intro_rnaseq_hpc/reference_data_ensembl38/Homo_sapiens.GRCh38.92.1.gtf
38 | 
39 | # make all of the output directories
40 | # The -p option means mkdir will create the whole path if it 
41 | # does not exist and refrain from complaining if it does exist
42 | mkdir -p results/fastqc/
43 | mkdir -p results/STAR/
44 | mkdir -p results/qualimap/
45 | mkdir -p results/salmon/
46 | 
47 | # set up output filenames and locations
48 | fastqc_out=results/fastqc/
49 | align_out=results/STAR/${samplename}
50 | align_out_bam=results/STAR/${samplename}_Aligned.sortedByCoord.out.bam
51 | qualimap_out=results/qualimap/${samplename}.qualimap
52 | salmon_out=results/salmon/${samplename}.salmon
53 | salmon_mappings=results/salmon/${samplename}_salmon.out
54 | 
55 | # set up the software environment (use version numbers)
56 | module load fastqc/0.11.3
57 | module load gcc/6.2.0  
58 | module load star/2.7.0a
59 | module load samtools/1.3.1
60 | module load java/jdk-1.8u112
61 | module load qualimap/2.2.1
62 | module load salmon/1.4.0
63 | unset DISPLAY
64 | 
65 | echo "Processing file $fq"
66 | 
67 | echo "Starting QC for $samplename"
68 | 
69 | # Run FastQC and move output to the appropriate folder
70 | fastqc -o $fastqc_out $fq1 $fq2
71 | 
72 | 
73 | # Run STAR
74 | STAR --runThreadN $cores --genomeDir $genome --readFilesIn $fq1 $fq2 --outFileNamePrefix $align_out --outSAMtype BAM SortedByCoordinate --outSAMunmapped Within --outSAMattributes Standard
75 | 
76 | # Run Qualimap
77 | qualimap rnaseq \
78 | -outdir $qualimap_out \
79 | -a proportional \
80 | -bam $align_out_bam \
81 | -p strand-specific-reverse \
82 | -gtf $gtf \
83 | --java-mem-size=8G
84 | 
85 | # Run salmon
86 | 
87 | echo "Starting Salmon run for $samplename"
88 | 
89 | salmon quant -i $transcriptome \
90 | -p $cores \
91 | -l A \
92 | -1 $fq1 -2 $fq2 \
93 | -o $salmon_out \
94 | --seqBias \
95 | --useVBOpt
96 | 


--------------------------------------------------------------------------------
/scripts/mov10_fastqc.run:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | #SBATCH -p short 		# partition name
 4 | #SBATCH -t 0-2:00 		# hours:minutes runlimit after which job will be killed
 5 | #SBATCH -c 6 		# number of cores requested -- this needs to be greater than or equal to the number of cores you plan to use to run your job
 6 | #SBATCH --job-name rnaseq_mov10_fastqc 		# Job name
 7 | #SBATCH -o %j.out			# File to which standard out will be written
 8 | #SBATCH -e %j.err 		# File to which standard err will be written
 9 | 
10 | ## Changing directories to where the fastq files are located
11 | cd ~/unix_workshop/rnaseq/raw_data
12 | 
13 | ## Loading modules required for script commands
14 | module load seq/fastqc/0.11.3
15 | 
16 | ## Running FASTQC
17 | fastqc -t 6 *.fq
18 | 
19 | ## Moving files to our results directory
20 | mv *fastqc* ../results/fastqc/
21 | 


--------------------------------------------------------------------------------
/scripts/rnaseq_analysis_on_allfiles_for-slurm.sh:
--------------------------------------------------------------------------------
 1 | #! /bin/bash
 2 | 
 3 | for fq in ~/unix_lesson/rnaseq/raw_data/*.fq
 4 | do
 5 | 
 6 | sbatch -p short -t 0-2:00 -c 6 --job-name rnaseq-workflow --wrap="sh ~/unix_lesson/rnaseq/scripts/rnaseq_analysis_on_input_file.sh $fq"
 7 | sleep 1	# wait 1 second between each job submission
 8 |   
 9 | done
10 | 


--------------------------------------------------------------------------------
/scripts/rnaseq_analysis_on_input_file.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash/
 2 | 
 3 | # This script takes a fastq file of RNA-seq data, runs FastQC, STAR, Qualimap and Salmon.
 4 | # USAGE: sh rnaseq_analysis_on_input_file.sh <name of fastq file>
 5 | 
 6 | # change directories to /n/scratch3/ so that all the analysis is stored there.
 7 | cd /n/scratch3/users/r/$USER/rnaseq_hbc-workshop/
 8 | 
 9 | # initialize a variable with an intuitive name to store the name of the input fastq file
10 | fq=$1
11 | 
12 | # grab base of filename for naming outputs
13 | samplename=`basename $fq .subset.fq`
14 | echo "Sample name is $samplename"
15 | 
16 | # specify the number of cores to use
17 | cores=6
18 | 
19 | # directory with the genome and transcriptome index files + name of the gene annotation file
20 | genome=/n/groups/hbctraining/intro_rnaseq_hpc/reference_data_ensembl38/ensembl38_STAR_index
21 | transcriptome=/n/groups/hbctraining/rna-seq_2019_02/reference_data/salmon_index
22 | gtf=/n/groups/hbctraining/intro_rnaseq_hpc/reference_data_ensembl38/Homo_sapiens.GRCh38.92.1.gtf
23 | 
24 | # make all of the output directories
25 | # The -p option means mkdir will create the whole path if it 
26 | # does not exist and refrain from complaining if it does exist
27 | mkdir -p results/fastqc/
28 | mkdir -p results/STAR/
29 | mkdir -p results/qualimap/
30 | mkdir -p results/salmon/
31 | 
32 | # set up output filenames and locations
33 | fastqc_out=results/fastqc/
34 | align_out=results/STAR/${samplename}
35 | align_out_bam=results/STAR/${samplename}_Aligned.sortedByCoord.out.bam
36 | qualimap_out=results/qualimap/${samplename}.qualimap
37 | salmon_out=results/salmon/${samplename}.salmon
38 | salmon_mappings=results/salmon/${samplename}_salmon.out
39 | 
40 | # set up the software environment (use version numbers)
41 | module load fastqc/0.11.3
42 | module load gcc/6.2.0  
43 | module load star/2.7.0a
44 | module load samtools/1.3.1
45 | module load java/jdk-1.8u112
46 | module load qualimap/2.2.1
47 | module load salmon/1.4.0
48 | unset DISPLAY
49 | 
50 | echo "Processing file $fq"
51 | 
52 | echo "Starting QC for $samplename"
53 | 
54 | # Run FastQC and move output to the appropriate folder
55 | fastqc -o $fastqc_out $fq
56 | 
57 | 
58 | # Run STAR
59 | STAR --runThreadN $cores --genomeDir $genome --readFilesIn $fq --outFileNamePrefix $align_out --outSAMtype BAM SortedByCoordinate --outSAMunmapped Within --outSAMattributes Standard
60 | 
61 | # Run Qualimap
62 | qualimap rnaseq \
63 | -outdir $qualimap_out \
64 | -a proportional \
65 | -bam $align_out_bam \
66 | -p strand-specific-reverse \
67 | -gtf $gtf \
68 | --java-mem-size=8G
69 | 
70 | # Run salmon
71 | 
72 | echo "Starting Salmon run for $samplename"
73 | 
74 | salmon quant -i $transcriptome \
75 | -p $cores \
76 | -l A \
77 | -r $fq \
78 | -o $salmon_out \
79 | --seqBias \
80 | --useVBOpt
81 | 


--------------------------------------------------------------------------------
/scripts/salmon_all_files_PE.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | #SBATCH -p priority             # partition name
 4 | #SBATCH -t 0-12:00              # hours:minutes runlimit after which job will be killed
 5 | #SBATCH -c 6            # number of cores requested - what you plan to use to run your job
 6 | #SBATCH --mem 8G
 7 | #SBATCH --job-name salmon_mapping_PE           # Job name
 8 | #SBATCH -o salmon-mapping.out                       # File to which standard out will be written
 9 | #SBATCH -e salmon_mapping.err               # File to which standard err will be written
10 | 
11 | 
12 | # Change directories
13 | cd cd ~/rnaseq/raw_data
14 | 
15 | # Get all sample names from a file that contains the prefix
16 | files=`cut -f 1 samples.csv`
17 | 
18 | for sample in $files
19 | 
20 |   do
21 | 
22 |   salmon quant -i /n/groups/hbctraining/rna-seq_2019_02/reference_data/salmon.ensembl38.idx \
23 |      -l A \
24 |      -r ${sample}_R1.fastq ${sample}_R2.fastq \
25 |      -o ../results/salmon/${sample} \
26 |      -p 6 \
27 |      --seqBias \
28 |      --useVBOpt \
29 |      --numBootstraps 30
30 | 
31 | done
32 | 


--------------------------------------------------------------------------------
/scripts/star_genome_index.run:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | #SBATCH -p short 		# partition name
 4 | #SBATCH -t 0-2:00 		# hours:minutes runlimit after which job will be killed
 5 | #SBATCH -c 6 		# number of cores requested -- this needs to be greater than or equal to the number of cores you plan to use to run your job
 6 | #SBATCH --mem 16G
 7 | #SBATCH --job-name STAR_index 		# Job name
 8 | #SBATCH -o %j.out			# File to which standard out will be written
 9 | #SBATCH -e %j.err 		# File to which standard err will be written
10 | 
11 | cd /n/scratch2/username/
12 | 
13 | module load gcc/6.2.0 star/2.5.4a
14 | 
15 | STAR --runThreadN 6 \
16 | --runMode genomeGenerate \
17 | --genomeDir chr1_hg38_index \
18 | --genomeFastaFiles /n/groups/hbctraining/intro_rnaseq_hpc/reference_data_ensembl38/Homo_sapiens.GRCh38.dna.chromosome.1.fa \
19 | --sjdbGTFfile /n/groups/hbctraining/intro_rnaseq_hpc/reference_data_ensembl38/Homo_sapiens.GRCh38.92.gtf \
20 | --sjdbOverhang 99
21 | 


--------------------------------------------------------------------------------