├── .DS_Store
├── Nanog-idr-merged-dreme.fasta
├── README.md
├── _config.yml
├── assets
    ├── css
    │   └── style.scss
    └── images
    │   └── dna-sequence-1600x800.jpg
├── chipseq_DREME_report.pdf
├── chipseq_MEME ChIP_report.pdf
├── chipseq_TOMTOM_report.pdf
├── dreme_output.png
├── img
    ├── .DS_Store
    ├── Allsamples_NanogSites_profile.png
    ├── Allsamples_Pou5f1Sites_profile.png
    ├── Allsamples_Pou5f1Sites_profile2.png
    ├── CCPlot.png
    ├── CoverageHistogramPlot.png
    ├── FastQC_contam.png
    ├── FastQC_seq_qual.png
    ├── Filezilla_step1.png
    ├── Filezilla_step2.png
    ├── GenomicFeatureEnrichment.png
    ├── H1hesc_Nanog_Rep1_chr12_aln.pdf
    ├── H1hesc_Nanog_Rep1_chr12_aln.png
    ├── Nanog-idr.png
    ├── PCA_deeptools.png
    ├── PeakCorHeatmap.png
    ├── PeakPCA.png
    ├── PeakProfile.png
    ├── Pol2.png
    ├── Pou5f1-idr.png
    ├── QC_bamCorrelate_deeptools.png
    ├── QCsummary.png
    ├── QCsummary_cc.png
    ├── QCsummary_enrich.png
    ├── QCsummary_reads.png
    ├── README.md
    ├── RNAseqWorkflow.png
    ├── R_screenshot.png
    ├── Rap.png
    ├── Ribl.png
    ├── Rip.png
    ├── Rplot.png
    ├── SAM_file.png
    ├── Slide1.jpg
    ├── TSS_Nanog_heatmap.png
    ├── TSS_Nanog_profile.png
    ├── TSS_Pou5f1_heatmap.png
    ├── TSS_Pou5f1_heatmap_and_profile.png
    ├── TSS_Pou5f1_profile.png
    ├── annobar_nanog.png
    ├── annotate-genes.png
    ├── background-subtract.png
    ├── bad_quality.png
    ├── bam_to_bigwig.png
    ├── bedtools-basic.png
    ├── bedtools.png
    ├── bedtools_intersect.png
    ├── bedtools_merge.png
    ├── beta.jpg
    ├── beta.png
    ├── blacklist.png
    ├── boxplot-db.png
    ├── boxplot_diffbind.png
    ├── cc-example.png
    ├── cc1-new.png
    ├── cc1.png
    ├── cc2-new.png
    ├── cc2.png
    ├── cc3-new.png
    ├── cc3.png
    ├── chip-fragments.png
    ├── chip_workflow_combined.png
    ├── chip_workflow_june2017.png
    ├── chip_workflow_june2017_full.png
    ├── chip_workflow_june2017_step1.png
    ├── chip_workflow_june2017_step1_QC.png
    ├── chip_workflow_june2017_step1_align.png
    ├── chip_workflow_june2017_step2.png
    ├── chip_workflow_june2017_step3.png
    ├── chip_workflow_june2017_step4.png
    ├── chip_workflow_june2017_step5.png
    ├── chip_workflow_march2018_step1.png
    ├── chip_workflow_march2018_step5.png
    ├── chip_workflow_sept2018_diffbind.png
    ├── chipseq_analysis_workflow_formats.png
    ├── chipseq_analysis_workflow_gen.png
    ├── chipseq_analysis_workflow_samples.png
    ├── chipseq_analysis_workflow_tools.png
    ├── chipseq_exp_controls.png
    ├── chipseq_exp_design.png
    ├── chipseq_exp_peaks.png
    ├── chipseq_experimental_workflow.png
    ├── chipseq_overall.png
    ├── chipseq_sample_workflow.png
    ├── chipseq_trimmed_fastqc.png
    ├── chipseq_workflow_QC.png
    ├── chipseq_workflow_QC_partial.png
    ├── chipseq_workflow_align_partial.png
    ├── chipseq_workflow_general.png
    ├── combine-for-merge.png
    ├── compareCluster.png
    ├── compareCluster_2018.png
    ├── computeMatrix_modes.png
    ├── computeMatrix_overview.png
    ├── corr_curve.png
    ├── count_matrix.png
    ├── covplot.png
    ├── cross-corr-1.png
    ├── cross-corr-2.png
    ├── cross-corr-3.png
    ├── cross-correlation-legend.png
    ├── cross-correlation.png
    ├── ctcf.png
    ├── data_life_cycle_gouldv2.png
    ├── db-heatmap.png
    ├── de_norm_counts_var.png
    ├── de_variation.png
    ├── decision_tree-2.png
    ├── deepTools_coverageplots.png
    ├── deepTools_fingerprints.png
    ├── deepTools_pcaplot.png
    ├── deepTools_scatterplot.png
    ├── deeptools_heatmap.png
    ├── deeptools_heatmap_nolabels.png
    ├── density_profileplots.png
    ├── deseq2-pca.png
    ├── diff-peaks.png
    ├── diffpeak-software.pdf
    ├── diffpeaks-software.png
    ├── dotplot.png
    ├── dotplot_2018.png
    ├── dreme_input.png
    ├── dreme_output.png
    ├── dreme_processing.png
    ├── exp_design.png
    ├── fastqc_input_rep1.png
    ├── feature-distribution.png
    ├── filezilla_diffbind.png
    ├── filezilla_login.png
    ├── filezilla_setup.png
    ├── gProfiler.png
    ├── genemania.png
    ├── getfasta.png
    ├── good_quality.png
    ├── great_annot.png
    ├── great_job_desc.png
    ├── great_region_assoc.png
    ├── great_selection_go.png
    ├── gvng.jpg
    ├── heatmap_diffbind.png
    ├── heatmap_profileplots.png
    ├── idr-idr.png
    ├── idr-pool.png
    ├── idr-rep1-rep2.png
    ├── idr_figure.png
    ├── idr_pipeline.png
    ├── idr_samples.png
    ├── igv-1.png
    ├── igv_encode.png
    ├── igv_encode_nanog.png
    ├── igv_screenshot.png
    ├── input.png
    ├── kegg-dotplot.png
    ├── kegg-dotplot_2018.png
    ├── lambda.png
    ├── macs_workflow.png
    ├── map_table.png
    ├── maplot.png
    ├── maplotXY.png
    ├── maplot_diffbind.png
    ├── maplot_xy_diffbind.png
    ├── mappable.png
    ├── meme_chip_output.png
    ├── meme_suite.png
    ├── merge-glyph.png
    ├── model-macs.png
    ├── model.png
    ├── model_shift.png
    ├── nano-awesome.png
    ├── nano1-old.png
    ├── nano1.png
    ├── nano2.png
    ├── nanog_binding.png
    ├── narrowPeak.png
    ├── oct_sox_nanog.jpg
    ├── orchestra-outline.png
    ├── pca_deseq2_diffbind.png
    ├── pca_diffbind.png
    ├── pcaplot.png
    ├── pcaplotDeseq2.png
    ├── pcaplotEdgeR.png
    ├── peak_detection.png
    ├── peak_shift.png
    ├── peak_shift2.png
    ├── peak_shift3.png
    ├── permission-directory.png
    ├── pie.png
    ├── plos_chipseq.png
    ├── plos_chipseq_arrow.png
    ├── plotCoverage_deeptools.png
    ├── plotFingerprint_deeptools.png
    ├── pseudo_count_comparison-cufflinks.png
    ├── pseudo_count_comparison-sailfish.png
    ├── pseudo_count_comparison-sailfish_sm.png
    ├── pseudo_count_comparison-star.png
    ├── pseudo_count_comparison-star_sm.png
    ├── pseudo_count_comparison.gif
    ├── pseudo_count_comparison.png
    ├── pseudorep-workflow.png
    ├── putty-1.PNG
    ├── putty-2.PNG
    ├── putty-5.PNG
    ├── puttyssh.png
    ├── read-density.png
    ├── read-density2.png
    ├── rna-chip.png
    ├── rnaseq_workflow.png
    ├── rnaseq_workflow_FASTQC.png
    ├── rnaseq_workflow_trimming.png
    ├── rstudio-screenshot.png
    ├── salmon_quasialignment.png
    ├── salmon_rstudio.png
    ├── sam_bam.png
    ├── sam_bam3.png
    ├── sbs_illumina.png
    ├── selfrep-workflow.png
    ├── selfrep-workflow2.png
    ├── spp-fig1.png
    ├── spp-fig2.png
    ├── star.png
    ├── tomtom_output.png
    ├── tss-dist.png
    ├── tss_distance.png
    ├── union.png
    ├── upsetR.png
    ├── upsetRhighes.png
    ├── upsetplot.png
    ├── venn-db.png
    ├── venn-deseq-edger.png
    ├── venn_methods.png
    ├── vennpie.png
    ├── vim_insert.png
    ├── vim_postsave.png
    ├── vim_quit.png
    ├── vim_save.png
    ├── vim_spider.png
    ├── vim_spider_number.png
    ├── workflow-peakcalling.png
    ├── workflow_alignment.png
    └── xkcd.png
├── lectures
    ├── ChIP-seq_troubleshooting.pdf
    ├── ChIP-seq_troubleshooting_2019.pdf
    ├── ChIP-seq_workflow_scope.pdf
    ├── Fileformats.pdf
    ├── Intro_to_workshop.pdf
    ├── Introduction to ChIP-seq 2019.pdf
    ├── Introduction_to_ChIP-seq.pdf
    ├── Wrap-up_new.pdf
    └── alignment_theory.pdf
├── lessons
    ├── 01_Intro_chipseq_data_organization.md
    ├── 02_QC_FASTQC.md
    ├── 03_align_and_filtering.md
    ├── 04_automation.md
    ├── 05_peak_calling_macs.md
    ├── 06_combine_chipQC_and_metrics.md
    ├── 07_handling-replicates-idr.md
    ├── 08_diffbind_differential_peaks.md
    ├── 10_data_visualization.md
    ├── 11_qualitative_assessment_IGV.md
    ├── 12_functional_analysis.md
    ├── CC_metrics_extra.md
    ├── README.md
    ├── chipseeker_visualization.md
    ├── compare_callers_IGV.md
    ├── data_visualization_with_bedtools.md
    ├── extra_intro_to chipseq.md
    ├── handling-replicates-bedtools.md
    ├── integrating_rna-seq_and_chip-seq.md
    ├── motif_analysis_prep.md
    ├── orchestra_mounting.md
    ├── peak_calling_spp.md
    ├── qc_deeptools.md
    ├── shell_review.md
    └── web_based_functional_analysis.md
├── samplesheet_chr12.csv
└── schedule
    ├── 2-day.md
    └── 3-day.md


/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/.DS_Store


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | > **NOTE**: The materials in this repository are **no longer actively maintained.**  More recent content can be found at: https://hbctraining.github.io/Intro-to-ChIPseq-flipped/
 2 | 
 3 | ## OLD - Introduction to ChIP-seq using high performance computing
 4 | 
 5 | 
 6 | | Audience | Computational Skills | Prerequisites | Duration |
 7 | :----------|:----------|:----------|:----------|
 8 | | Biologists | Beginner/Intermediate | None | 3-day workshop (~19.5 hours of trainer-led time)|
 9 | 
10 | ### Description
11 | 
12 | This repository has teaching materials for a 3-day Introduction to ChIP-sequencing data analysis workshop. This workshop focuses on teaching basic computational skills to enable the effective use of a high-performance computing environment to implement a ChIP-seq data analysis workflow. It includes an introduction to shell (bash) and shell scripting. In addition to running the ChIP-seq workflow from FASTQ files to peak calls and nearest gene annotations, the workshop covers best-practice guidelines for ChIP-seq experimental design, data organization/management, and quality control.
13 | 
14 | > These materials were developed for a trainer-led workshop, but are also amenable to self-guided learning.
15 | 
16 | ### Learning Objectives
17 | 
18 | 1.	Understand the necessity for, and use of, the command line interface (bash) and HPC for analyzing high-throughput sequencing data.
19 | 2.	Understand best practices for designing a ChIP-seq experiment and analyzing the resulting data.
20 | 
21 | ### Lessons
22 | **[Click here](schedule/2-day.md) for links to lessons and the suggested schedule**
23 | 
24 | ### Dataset
25 | 
26 | ### Installation Requirements
27 | 
28 | Download the most recent versions of R and RStudio for your laptop:
29 | 
30 |  - [R](http://lib.stat.cmu.edu/R/CRAN/) (version 3.5.0 or above)
31 |  - [RStudio](https://www.rstudio.com/products/rstudio/download/#download)
32 |  
33 | > **NOTE**: When installing the following packages, if you are asked to select (a/s/n) or (y/n), please select "a" or "y" as applicable.
34 | 
35 | (1) Install the below packages on your laptop from CRAN. You DO NOT have to go to the CRAN webpage; you can use the following function to install them:
36 | 
37 | 
38 | ```r
39 | install.packages("BiocManager")
40 | install.packages("tidyverse")
41 | ```
42 | 
43 | **Note that these package names are case sensitive!**
44 | 
45 | 
46 | (2) Install the below packages from Bioconductor. Load BiocManager, then run BiocManager's `install()` function 8 times, once for each of the 8 packages listed below:
47 | 
48 | ```r
49 | library(BiocManager)
50 | install("insert_first_package_name_in_quotations")
51 | install("insert_second_package_name_in_quotations")
52 | # ... and so on for the remaining packages
53 | ```
54 | 
55 | Note that these package names are case sensitive!
56 | 
57 | ```r
58 | ChIPQC
59 | ChIPseeker
60 | DiffBind
61 | clusterProfiler
62 | AnnotationDbi
63 | TxDb.Hsapiens.UCSC.hg19.knownGene
64 | EnsDb.Hsapiens.v75
65 | org.Hs.eg.db
66 | ```
67 | 
68 | > **NOTE:** The library used for the annotations associated with genes (here we are using `TxDb.Hsapiens.UCSC.hg19.knownGene` and `EnsDb.Hsapiens.v75`) will change based on organism (e.g. if studying mouse, you would need to install and load `TxDb.Mmusculus.UCSC.mm10.knownGene`). The list of available organism packages is given [here](https://github.com/hbctraining/Training-modules/raw/master/DGE-functional-analysis/img/available_annotations.png).
69 | 
70 | (3) Finally, please check that all the packages were installed successfully by **loading them one at a time** using the `library()` function.  
71 | 
72 | ```r
73 | library(tidyverse)
74 | library(ChIPQC)
75 | library(ChIPseeker)
76 | library(DiffBind)
77 | library(clusterProfiler)
78 | library(AnnotationDbi)
79 | library(TxDb.Hsapiens.UCSC.hg19.knownGene)
80 | library(EnsDb.Hsapiens.v75)
81 | ```
82 | 
83 | (4) Once all packages have been loaded, run sessionInfo().  
84 | 
85 | ```r
86 | sessionInfo()
87 | ```
88 | 
89 | ***
90 | *These materials have been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.*
91 | 
92 | * *Some materials used in these lessons were derived from work that is Copyright © Data Carpentry (http://datacarpentry.org/). 
93 | All Data Carpentry instructional material is made available under the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0).*
94 | 
95 | 


--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-cayman
2 | title: Introduction to ChIP-Seq using high-performance computing
3 | google_analytics: UA-150953419-1
4 | 


--------------------------------------------------------------------------------
/assets/css/style.scss:
--------------------------------------------------------------------------------
1 | ---
2 | ---
3 | 
4 | @import "{{ site.theme }}";
5 | 
6 | .page-header { color: #fff; text-align: center; background-image: url("../images/dna-sequence-1600x800.jpg"); }
7 | 
8 | .main-content h1, .main-content h2, .main-content h3, .main-content h4, .main-content h5, .main-content h6 { margin-top: 2rem; margin-bottom: 1rem; font-weight: normal; color: #000000; }
9 | 


--------------------------------------------------------------------------------
/assets/images/dna-sequence-1600x800.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/assets/images/dna-sequence-1600x800.jpg


--------------------------------------------------------------------------------
/chipseq_DREME_report.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/chipseq_DREME_report.pdf


--------------------------------------------------------------------------------
/chipseq_MEME ChIP_report.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/chipseq_MEME ChIP_report.pdf


--------------------------------------------------------------------------------
/chipseq_TOMTOM_report.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/chipseq_TOMTOM_report.pdf


--------------------------------------------------------------------------------
/dreme_output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/dreme_output.png


--------------------------------------------------------------------------------
/img/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/.DS_Store


--------------------------------------------------------------------------------
/img/Allsamples_NanogSites_profile.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/Allsamples_NanogSites_profile.png


--------------------------------------------------------------------------------
/img/Allsamples_Pou5f1Sites_profile.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/Allsamples_Pou5f1Sites_profile.png


--------------------------------------------------------------------------------
/img/Allsamples_Pou5f1Sites_profile2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/Allsamples_Pou5f1Sites_profile2.png


--------------------------------------------------------------------------------
/img/CCPlot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/CCPlot.png


--------------------------------------------------------------------------------
/img/CoverageHistogramPlot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/CoverageHistogramPlot.png


--------------------------------------------------------------------------------
/img/FastQC_contam.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/FastQC_contam.png


--------------------------------------------------------------------------------
/img/FastQC_seq_qual.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/FastQC_seq_qual.png


--------------------------------------------------------------------------------
/img/Filezilla_step1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/Filezilla_step1.png


--------------------------------------------------------------------------------
/img/Filezilla_step2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/Filezilla_step2.png


--------------------------------------------------------------------------------
/img/GenomicFeatureEnrichment.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/GenomicFeatureEnrichment.png


--------------------------------------------------------------------------------
/img/H1hesc_Nanog_Rep1_chr12_aln.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/H1hesc_Nanog_Rep1_chr12_aln.pdf


--------------------------------------------------------------------------------
/img/H1hesc_Nanog_Rep1_chr12_aln.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/H1hesc_Nanog_Rep1_chr12_aln.png


--------------------------------------------------------------------------------
/img/Nanog-idr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/Nanog-idr.png


--------------------------------------------------------------------------------
/img/PCA_deeptools.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/PCA_deeptools.png


--------------------------------------------------------------------------------
/img/PeakCorHeatmap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/PeakCorHeatmap.png


--------------------------------------------------------------------------------
/img/PeakPCA.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/PeakPCA.png


--------------------------------------------------------------------------------
/img/PeakProfile.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/PeakProfile.png


--------------------------------------------------------------------------------
/img/Pol2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/Pol2.png


--------------------------------------------------------------------------------
/img/Pou5f1-idr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/Pou5f1-idr.png


--------------------------------------------------------------------------------
/img/QC_bamCorrelate_deeptools.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/QC_bamCorrelate_deeptools.png


--------------------------------------------------------------------------------
/img/QCsummary.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/QCsummary.png


--------------------------------------------------------------------------------
/img/QCsummary_cc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/QCsummary_cc.png


--------------------------------------------------------------------------------
/img/QCsummary_enrich.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/QCsummary_enrich.png


--------------------------------------------------------------------------------
/img/QCsummary_reads.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/QCsummary_reads.png


--------------------------------------------------------------------------------
/img/README.md:
--------------------------------------------------------------------------------
1 | ### All images for Session V of NGS Data Analysis Course
2 | 


--------------------------------------------------------------------------------
/img/RNAseqWorkflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/RNAseqWorkflow.png


--------------------------------------------------------------------------------
/img/R_screenshot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/R_screenshot.png


--------------------------------------------------------------------------------
/img/Rap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/Rap.png


--------------------------------------------------------------------------------
/img/Ribl.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/Ribl.png


--------------------------------------------------------------------------------
/img/Rip.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/Rip.png


--------------------------------------------------------------------------------
/img/Rplot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/Rplot.png


--------------------------------------------------------------------------------
/img/SAM_file.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/SAM_file.png


--------------------------------------------------------------------------------
/img/Slide1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/Slide1.jpg


--------------------------------------------------------------------------------
/img/TSS_Nanog_heatmap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/TSS_Nanog_heatmap.png


--------------------------------------------------------------------------------
/img/TSS_Nanog_profile.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/TSS_Nanog_profile.png


--------------------------------------------------------------------------------
/img/TSS_Pou5f1_heatmap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/TSS_Pou5f1_heatmap.png


--------------------------------------------------------------------------------
/img/TSS_Pou5f1_heatmap_and_profile.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/TSS_Pou5f1_heatmap_and_profile.png


--------------------------------------------------------------------------------
/img/TSS_Pou5f1_profile.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/TSS_Pou5f1_profile.png


--------------------------------------------------------------------------------
/img/annobar_nanog.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/annobar_nanog.png


--------------------------------------------------------------------------------
/img/annotate-genes.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/annotate-genes.png


--------------------------------------------------------------------------------
/img/background-subtract.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/background-subtract.png


--------------------------------------------------------------------------------
/img/bad_quality.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/bad_quality.png


--------------------------------------------------------------------------------
/img/bam_to_bigwig.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/bam_to_bigwig.png


--------------------------------------------------------------------------------
/img/bedtools-basic.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/bedtools-basic.png


--------------------------------------------------------------------------------
/img/bedtools.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/bedtools.png


--------------------------------------------------------------------------------
/img/bedtools_intersect.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/bedtools_intersect.png


--------------------------------------------------------------------------------
/img/bedtools_merge.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/bedtools_merge.png


--------------------------------------------------------------------------------
/img/beta.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/beta.jpg


--------------------------------------------------------------------------------
/img/beta.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/beta.png


--------------------------------------------------------------------------------
/img/blacklist.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/blacklist.png


--------------------------------------------------------------------------------
/img/boxplot-db.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/boxplot-db.png


--------------------------------------------------------------------------------
/img/boxplot_diffbind.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/boxplot_diffbind.png


--------------------------------------------------------------------------------
/img/cc-example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/cc-example.png


--------------------------------------------------------------------------------
/img/cc1-new.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/cc1-new.png


--------------------------------------------------------------------------------
/img/cc1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/cc1.png


--------------------------------------------------------------------------------
/img/cc2-new.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/cc2-new.png


--------------------------------------------------------------------------------
/img/cc2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/cc2.png


--------------------------------------------------------------------------------
/img/cc3-new.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/cc3-new.png


--------------------------------------------------------------------------------
/img/cc3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/cc3.png


--------------------------------------------------------------------------------
/img/chip-fragments.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/chip-fragments.png


--------------------------------------------------------------------------------
/img/chip_workflow_combined.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/chip_workflow_combined.png


--------------------------------------------------------------------------------
/img/chip_workflow_june2017.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/chip_workflow_june2017.png


--------------------------------------------------------------------------------
/img/chip_workflow_june2017_full.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/chip_workflow_june2017_full.png


--------------------------------------------------------------------------------
/img/chip_workflow_june2017_step1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/chip_workflow_june2017_step1.png


--------------------------------------------------------------------------------
/img/chip_workflow_june2017_step1_QC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/chip_workflow_june2017_step1_QC.png


--------------------------------------------------------------------------------
/img/chip_workflow_june2017_step1_align.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/chip_workflow_june2017_step1_align.png


--------------------------------------------------------------------------------
/img/chip_workflow_june2017_step2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/chip_workflow_june2017_step2.png


--------------------------------------------------------------------------------
/img/chip_workflow_june2017_step3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/chip_workflow_june2017_step3.png


--------------------------------------------------------------------------------
/img/chip_workflow_june2017_step4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/chip_workflow_june2017_step4.png


--------------------------------------------------------------------------------
/img/chip_workflow_june2017_step5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/chip_workflow_june2017_step5.png


--------------------------------------------------------------------------------
/img/chip_workflow_march2018_step1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/chip_workflow_march2018_step1.png


--------------------------------------------------------------------------------
/img/chip_workflow_march2018_step5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/chip_workflow_march2018_step5.png


--------------------------------------------------------------------------------
/img/chip_workflow_sept2018_diffbind.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/chip_workflow_sept2018_diffbind.png


--------------------------------------------------------------------------------
/img/chipseq_analysis_workflow_formats.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/chipseq_analysis_workflow_formats.png


--------------------------------------------------------------------------------
/img/chipseq_analysis_workflow_gen.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/chipseq_analysis_workflow_gen.png


--------------------------------------------------------------------------------
/img/chipseq_analysis_workflow_samples.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/chipseq_analysis_workflow_samples.png


--------------------------------------------------------------------------------
/img/chipseq_analysis_workflow_tools.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/chipseq_analysis_workflow_tools.png


--------------------------------------------------------------------------------
/img/chipseq_exp_controls.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/chipseq_exp_controls.png


--------------------------------------------------------------------------------
/img/chipseq_exp_design.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/chipseq_exp_design.png


--------------------------------------------------------------------------------
/img/chipseq_exp_peaks.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/chipseq_exp_peaks.png


--------------------------------------------------------------------------------
/img/chipseq_experimental_workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/chipseq_experimental_workflow.png


--------------------------------------------------------------------------------
/img/chipseq_overall.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/chipseq_overall.png


--------------------------------------------------------------------------------
/img/chipseq_sample_workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/chipseq_sample_workflow.png


--------------------------------------------------------------------------------
/img/chipseq_trimmed_fastqc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/chipseq_trimmed_fastqc.png


--------------------------------------------------------------------------------
/img/chipseq_workflow_QC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/chipseq_workflow_QC.png


--------------------------------------------------------------------------------
/img/chipseq_workflow_QC_partial.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/chipseq_workflow_QC_partial.png


--------------------------------------------------------------------------------
/img/chipseq_workflow_align_partial.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/chipseq_workflow_align_partial.png


--------------------------------------------------------------------------------
/img/chipseq_workflow_general.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/chipseq_workflow_general.png


--------------------------------------------------------------------------------
/img/combine-for-merge.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/combine-for-merge.png


--------------------------------------------------------------------------------
/img/compareCluster.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/compareCluster.png


--------------------------------------------------------------------------------
/img/compareCluster_2018.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/compareCluster_2018.png


--------------------------------------------------------------------------------
/img/computeMatrix_modes.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/computeMatrix_modes.png


--------------------------------------------------------------------------------
/img/computeMatrix_overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/computeMatrix_overview.png


--------------------------------------------------------------------------------
/img/corr_curve.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/corr_curve.png


--------------------------------------------------------------------------------
/img/count_matrix.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/count_matrix.png


--------------------------------------------------------------------------------
/img/covplot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/covplot.png


--------------------------------------------------------------------------------
/img/cross-corr-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/cross-corr-1.png


--------------------------------------------------------------------------------
/img/cross-corr-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/cross-corr-2.png


--------------------------------------------------------------------------------
/img/cross-corr-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/cross-corr-3.png


--------------------------------------------------------------------------------
/img/cross-correlation-legend.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/cross-correlation-legend.png


--------------------------------------------------------------------------------
/img/cross-correlation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/cross-correlation.png


--------------------------------------------------------------------------------
/img/ctcf.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/ctcf.png


--------------------------------------------------------------------------------
/img/data_life_cycle_gouldv2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/data_life_cycle_gouldv2.png


--------------------------------------------------------------------------------
/img/db-heatmap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/db-heatmap.png


--------------------------------------------------------------------------------
/img/de_norm_counts_var.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/de_norm_counts_var.png


--------------------------------------------------------------------------------
/img/de_variation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/de_variation.png


--------------------------------------------------------------------------------
/img/decision_tree-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/decision_tree-2.png


--------------------------------------------------------------------------------
/img/deepTools_coverageplots.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/deepTools_coverageplots.png


--------------------------------------------------------------------------------
/img/deepTools_fingerprints.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/deepTools_fingerprints.png


--------------------------------------------------------------------------------
/img/deepTools_pcaplot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/deepTools_pcaplot.png


--------------------------------------------------------------------------------
/img/deepTools_scatterplot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/deepTools_scatterplot.png


--------------------------------------------------------------------------------
/img/deeptools_heatmap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/deeptools_heatmap.png


--------------------------------------------------------------------------------
/img/deeptools_heatmap_nolabels.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/deeptools_heatmap_nolabels.png


--------------------------------------------------------------------------------
/img/density_profileplots.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/density_profileplots.png


--------------------------------------------------------------------------------
/img/deseq2-pca.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/deseq2-pca.png


--------------------------------------------------------------------------------
/img/diff-peaks.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/diff-peaks.png


--------------------------------------------------------------------------------
/img/diffpeak-software.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/diffpeak-software.pdf


--------------------------------------------------------------------------------
/img/diffpeaks-software.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/diffpeaks-software.png


--------------------------------------------------------------------------------
/img/dotplot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/dotplot.png


--------------------------------------------------------------------------------
/img/dotplot_2018.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/dotplot_2018.png


--------------------------------------------------------------------------------
/img/dreme_input.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/dreme_input.png


--------------------------------------------------------------------------------
/img/dreme_output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/dreme_output.png


--------------------------------------------------------------------------------
/img/dreme_processing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/dreme_processing.png


--------------------------------------------------------------------------------
/img/exp_design.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/exp_design.png


--------------------------------------------------------------------------------
/img/fastqc_input_rep1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/fastqc_input_rep1.png


--------------------------------------------------------------------------------
/img/feature-distribution.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/feature-distribution.png


--------------------------------------------------------------------------------
/img/filezilla_diffbind.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/filezilla_diffbind.png


--------------------------------------------------------------------------------
/img/filezilla_login.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/filezilla_login.png


--------------------------------------------------------------------------------
/img/filezilla_setup.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/filezilla_setup.png


--------------------------------------------------------------------------------
/img/gProfiler.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/gProfiler.png


--------------------------------------------------------------------------------
/img/genemania.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/genemania.png


--------------------------------------------------------------------------------
/img/getfasta.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/getfasta.png


--------------------------------------------------------------------------------
/img/good_quality.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/good_quality.png


--------------------------------------------------------------------------------
/img/great_annot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/great_annot.png


--------------------------------------------------------------------------------
/img/great_job_desc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/great_job_desc.png


--------------------------------------------------------------------------------
/img/great_region_assoc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/great_region_assoc.png


--------------------------------------------------------------------------------
/img/great_selection_go.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/great_selection_go.png


--------------------------------------------------------------------------------
/img/gvng.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/gvng.jpg


--------------------------------------------------------------------------------
/img/heatmap_diffbind.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/heatmap_diffbind.png


--------------------------------------------------------------------------------
/img/heatmap_profileplots.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/heatmap_profileplots.png


--------------------------------------------------------------------------------
/img/idr-idr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/idr-idr.png


--------------------------------------------------------------------------------
/img/idr-pool.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/idr-pool.png


--------------------------------------------------------------------------------
/img/idr-rep1-rep2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/idr-rep1-rep2.png


--------------------------------------------------------------------------------
/img/idr_figure.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/idr_figure.png


--------------------------------------------------------------------------------
/img/idr_pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/idr_pipeline.png


--------------------------------------------------------------------------------
/img/idr_samples.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/idr_samples.png


--------------------------------------------------------------------------------
/img/igv-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/igv-1.png


--------------------------------------------------------------------------------
/img/igv_encode.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/igv_encode.png


--------------------------------------------------------------------------------
/img/igv_encode_nanog.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/igv_encode_nanog.png


--------------------------------------------------------------------------------
/img/igv_screenshot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/igv_screenshot.png


--------------------------------------------------------------------------------
/img/input.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/input.png


--------------------------------------------------------------------------------
/img/kegg-dotplot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/kegg-dotplot.png


--------------------------------------------------------------------------------
/img/kegg-dotplot_2018.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/kegg-dotplot_2018.png


--------------------------------------------------------------------------------
/img/lambda.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/lambda.png


--------------------------------------------------------------------------------
/img/macs_workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/macs_workflow.png


--------------------------------------------------------------------------------
/img/map_table.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/map_table.png


--------------------------------------------------------------------------------
/img/maplot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/maplot.png


--------------------------------------------------------------------------------
/img/maplotXY.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/maplotXY.png


--------------------------------------------------------------------------------
/img/maplot_diffbind.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/maplot_diffbind.png


--------------------------------------------------------------------------------
/img/maplot_xy_diffbind.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/maplot_xy_diffbind.png


--------------------------------------------------------------------------------
/img/mappable.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/mappable.png


--------------------------------------------------------------------------------
/img/meme_chip_output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/meme_chip_output.png


--------------------------------------------------------------------------------
/img/meme_suite.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/meme_suite.png


--------------------------------------------------------------------------------
/img/merge-glyph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/merge-glyph.png


--------------------------------------------------------------------------------
/img/model-macs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/model-macs.png


--------------------------------------------------------------------------------
/img/model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/model.png


--------------------------------------------------------------------------------
/img/model_shift.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/model_shift.png


--------------------------------------------------------------------------------
/img/nano-awesome.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/nano-awesome.png


--------------------------------------------------------------------------------
/img/nano1-old.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/nano1-old.png


--------------------------------------------------------------------------------
/img/nano1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/nano1.png


--------------------------------------------------------------------------------
/img/nano2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/nano2.png


--------------------------------------------------------------------------------
/img/nanog_binding.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/nanog_binding.png


--------------------------------------------------------------------------------
/img/narrowPeak.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/narrowPeak.png


--------------------------------------------------------------------------------
/img/oct_sox_nanog.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/oct_sox_nanog.jpg


--------------------------------------------------------------------------------
/img/orchestra-outline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/orchestra-outline.png


--------------------------------------------------------------------------------
/img/pca_deseq2_diffbind.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/pca_deseq2_diffbind.png


--------------------------------------------------------------------------------
/img/pca_diffbind.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/pca_diffbind.png


--------------------------------------------------------------------------------
/img/pcaplot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/pcaplot.png


--------------------------------------------------------------------------------
/img/pcaplotDeseq2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/pcaplotDeseq2.png


--------------------------------------------------------------------------------
/img/pcaplotEdgeR.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/pcaplotEdgeR.png


--------------------------------------------------------------------------------
/img/peak_detection.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/peak_detection.png


--------------------------------------------------------------------------------
/img/peak_shift.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/peak_shift.png


--------------------------------------------------------------------------------
/img/peak_shift2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/peak_shift2.png


--------------------------------------------------------------------------------
/img/peak_shift3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/peak_shift3.png


--------------------------------------------------------------------------------
/img/permission-directory.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/permission-directory.png


--------------------------------------------------------------------------------
/img/pie.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/pie.png


--------------------------------------------------------------------------------
/img/plos_chipseq.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/plos_chipseq.png


--------------------------------------------------------------------------------
/img/plos_chipseq_arrow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/plos_chipseq_arrow.png


--------------------------------------------------------------------------------
/img/plotCoverage_deeptools.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/plotCoverage_deeptools.png


--------------------------------------------------------------------------------
/img/plotFingerprint_deeptools.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/plotFingerprint_deeptools.png


--------------------------------------------------------------------------------
/img/pseudo_count_comparison-cufflinks.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/pseudo_count_comparison-cufflinks.png


--------------------------------------------------------------------------------
/img/pseudo_count_comparison-sailfish.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/pseudo_count_comparison-sailfish.png


--------------------------------------------------------------------------------
/img/pseudo_count_comparison-sailfish_sm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/pseudo_count_comparison-sailfish_sm.png


--------------------------------------------------------------------------------
/img/pseudo_count_comparison-star.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/pseudo_count_comparison-star.png


--------------------------------------------------------------------------------
/img/pseudo_count_comparison-star_sm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/pseudo_count_comparison-star_sm.png


--------------------------------------------------------------------------------
/img/pseudo_count_comparison.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/pseudo_count_comparison.gif


--------------------------------------------------------------------------------
/img/pseudo_count_comparison.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/pseudo_count_comparison.png


--------------------------------------------------------------------------------
/img/pseudorep-workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/pseudorep-workflow.png


--------------------------------------------------------------------------------
/img/putty-1.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/putty-1.PNG


--------------------------------------------------------------------------------
/img/putty-2.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/putty-2.PNG


--------------------------------------------------------------------------------
/img/putty-5.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/putty-5.PNG


--------------------------------------------------------------------------------
/img/puttyssh.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/puttyssh.png


--------------------------------------------------------------------------------
/img/read-density.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/read-density.png


--------------------------------------------------------------------------------
/img/read-density2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/read-density2.png


--------------------------------------------------------------------------------
/img/rna-chip.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/rna-chip.png


--------------------------------------------------------------------------------
/img/rnaseq_workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/rnaseq_workflow.png


--------------------------------------------------------------------------------
/img/rnaseq_workflow_FASTQC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/rnaseq_workflow_FASTQC.png


--------------------------------------------------------------------------------
/img/rnaseq_workflow_trimming.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/rnaseq_workflow_trimming.png


--------------------------------------------------------------------------------
/img/rstudio-screenshot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/rstudio-screenshot.png


--------------------------------------------------------------------------------
/img/salmon_quasialignment.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/salmon_quasialignment.png


--------------------------------------------------------------------------------
/img/salmon_rstudio.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/salmon_rstudio.png


--------------------------------------------------------------------------------
/img/sam_bam.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/sam_bam.png


--------------------------------------------------------------------------------
/img/sam_bam3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/sam_bam3.png


--------------------------------------------------------------------------------
/img/sbs_illumina.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/sbs_illumina.png


--------------------------------------------------------------------------------
/img/selfrep-workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/selfrep-workflow.png


--------------------------------------------------------------------------------
/img/selfrep-workflow2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/selfrep-workflow2.png


--------------------------------------------------------------------------------
/img/spp-fig1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/spp-fig1.png


--------------------------------------------------------------------------------
/img/spp-fig2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/spp-fig2.png


--------------------------------------------------------------------------------
/img/star.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/star.png


--------------------------------------------------------------------------------
/img/tomtom_output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/tomtom_output.png


--------------------------------------------------------------------------------
/img/tss-dist.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/tss-dist.png


--------------------------------------------------------------------------------
/img/tss_distance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/tss_distance.png


--------------------------------------------------------------------------------
/img/union.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/union.png


--------------------------------------------------------------------------------
/img/upsetR.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/upsetR.png


--------------------------------------------------------------------------------
/img/upsetRhighes.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/upsetRhighes.png


--------------------------------------------------------------------------------
/img/upsetplot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/upsetplot.png


--------------------------------------------------------------------------------
/img/venn-db.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/venn-db.png


--------------------------------------------------------------------------------
/img/venn-deseq-edger.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/venn-deseq-edger.png


--------------------------------------------------------------------------------
/img/venn_methods.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/venn_methods.png


--------------------------------------------------------------------------------
/img/vennpie.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/vennpie.png


--------------------------------------------------------------------------------
/img/vim_insert.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/vim_insert.png


--------------------------------------------------------------------------------
/img/vim_postsave.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/vim_postsave.png


--------------------------------------------------------------------------------
/img/vim_quit.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/vim_quit.png


--------------------------------------------------------------------------------
/img/vim_save.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/vim_save.png


--------------------------------------------------------------------------------
/img/vim_spider.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/vim_spider.png


--------------------------------------------------------------------------------
/img/vim_spider_number.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/vim_spider_number.png


--------------------------------------------------------------------------------
/img/workflow-peakcalling.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/workflow-peakcalling.png


--------------------------------------------------------------------------------
/img/workflow_alignment.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/workflow_alignment.png


--------------------------------------------------------------------------------
/img/xkcd.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/img/xkcd.png


--------------------------------------------------------------------------------
/lectures/ChIP-seq_troubleshooting.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/lectures/ChIP-seq_troubleshooting.pdf


--------------------------------------------------------------------------------
/lectures/ChIP-seq_troubleshooting_2019.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/lectures/ChIP-seq_troubleshooting_2019.pdf


--------------------------------------------------------------------------------
/lectures/ChIP-seq_workflow_scope.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/lectures/ChIP-seq_workflow_scope.pdf


--------------------------------------------------------------------------------
/lectures/Fileformats.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/lectures/Fileformats.pdf


--------------------------------------------------------------------------------
/lectures/Intro_to_workshop.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/lectures/Intro_to_workshop.pdf


--------------------------------------------------------------------------------
/lectures/Introduction to ChIP-seq 2019.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/lectures/Introduction to ChIP-seq 2019.pdf


--------------------------------------------------------------------------------
/lectures/Introduction_to_ChIP-seq.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/lectures/Introduction_to_ChIP-seq.pdf


--------------------------------------------------------------------------------
/lectures/Wrap-up_new.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/lectures/Wrap-up_new.pdf


--------------------------------------------------------------------------------
/lectures/alignment_theory.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hbctraining/Intro-to-ChIPseq/684775b8bb727cb25ca031966eb92b8123be29e6/lectures/alignment_theory.pdf


--------------------------------------------------------------------------------
/lessons/01_Intro_chipseq_data_organization.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Introduction to ChIP-seq and directory setup"
  3 | author: "Mary Piper, Radhika Khetani, Meeta Mistry"
  4 | date: "March 14th, 2018"
  5 | ---
  6 | 
  7 | Approximate time: 45 minutes
  8 | 
  9 | ## Learning Objectives
 10 | 
 11 | - Describe the best practices for designing a ChIP-seq experiment
 12 | - Recognize the need for data management and project organization
 13 | 
 14 | ## Introduction to ChIP-seq
 15 | Chromatin immunoprecipitation (ChIP) experiments are performed to identify DNA bound to specific (chromatin) proteins of interest. The first step involves isolating the chromatin and immunoprecipitating (IP) fragments with an antibody against the protein of interest. In ChIP-seq, the immunoprecipitated DNA fragments are then sequenced, followed by identification of enriched regions of DNA or peaks. These peak calls can then be used to make biological inferences by determining the associated genomic features and/or over-represented sequence motifs. 
 16 | 
 17 | ![chipseq_overview](../img/chipseq_overall.png)
 18 | 
 19 | 
 20 | ## Setting up
 21 | 
 22 | Since we are going to be working with this data on our remote server, **O2**, we first need to log onto the server. 
 23 | 
 24 | Type in the following command with your username to login:
 25 | 
 26 | ```bash
 27 | ssh username@o2.hms.harvard.edu
 28 | ```
 29 | 
 30 | Next we will start an interactive session on O2 with 1 core (specified with `-n 1`):
 31 | 
 32 | ```bash
 33 | $ srun --pty -p interactive -t 0-12:00 --mem 1G -n 1 --reservation=HBC2 /bin/bash
 34 | ```
 35 | 
 36 | **Make sure that your command prompt is now preceded by a character string that contains the word "compute".**
 37 | 
 38 | >**NOTE:** We are using the `--reservation` argument during class since we have a dedicated set of computers reserved so that commands run quickly. When starting an interactive session outside of class you will need to leave out this argument. You may also want to increase other resources (i.e. memory and number of cores) depending on what you plan on doing.
 39 | >
 40 | >```bash
 41 | >$ srun --pty -p interactive -t 0-12:00 --mem 8G -n 2 /bin/bash
 42 | >```
 43 | 
 44 | 
 45 | ## Data Management
 46 | 
 47 | One of the most important parts of research that involves large amounts of data is how best to manage it. We tend to prioritize the analysis, but there are many other important aspects that are often overlooked in the excitement to get a first look at new data. 
 48 | 
 49 | The data management lifecycle displayed below, courtesy of the [HMS Data Management Working Group](https://datamanagement.hms.harvard.edu/hms-data-management-working-group), illustrates some things to consider beyond the data creation and analysis components:
 50 | 
 51 | <img src="../img/data_life_cycle_gouldv2.png" width="350">
 52 | 
 53 | _Image acquired from the [Harvard Biomedical Data Management Website](https://datamanagement.hms.harvard.edu/hms-data-lifecycle)_
 54 | 
 55 | We will cover some parts of this lifecycle by talking about best practices for the **Research** half of the above lifecycle. Later in this workshop we will talk a little more about the data storage. For more information about the full lifecycle and more guidelines for data management, please look at the resources linked below.
 56 | 
 57 | **Resources**
 58 | 
 59 | * The [HMS Data Management Working Group's website](https://datamanagement.hms.harvard.edu/)
 60 | * A guide from the [Harvard library](http://guides.library.harvard.edu/dmp).
 61 | * Sign-up for the [DMWG quarterly newsletter](https://harvard.us13.list-manage.com/subscribe?u=d3fee19ad91470512cdd564dd&id=13642f2d02) for helpful tips, classes and events related to data management
 62 | 
 63 | 
 64 | ### Planning
 65 | 
 66 | You should approach your sequencing project in a very similar way to how you do a biological experiment, and ideally, begin with **experimental design**. We're going to assume that you've already designed a beautiful sequencing experiment to address your biological question, collected appropriate samples, and that you have enough statistical power.
 67 | 
 68 | During this stage it is important to keep track of how the experiment was performed and to clearly track the source of starting materials and kits used. It is also best practice to include information about any small variations within the experiment or variation relative to standard experiments. 
 69 | 
 70 | ### Organization
 71 | 
 72 | Every computational analysis you do is going to spawn many files, and inevitably you'll want to run some of those analyses again. For each experiment you work on and analyze data for, it is considered best practice to get organized by creating a planned storage space (directory structure).
 73 | 
 74 | We will start by creating a directory that we can use for the rest of the ChIP-seq session.
 75 | 
 76 | First, make sure that you are in your home directory.
 77 | 
 78 | ```bash
 79 | $ cd
 80 | $ pwd
 81 | ```
 82 | This should return `/home/username`.
 83 | 
 84 | Create a `chipseq` directory and change directories into it:
 85 | 
 86 | ```bash
 87 | $ mkdir chipseq
 88 | 
 89 | $ cd chipseq
 90 | ```
 91 | 
 92 | Now that we have a project directory, we can set up the following structure within it to keep files organized.
 93 | 
 94 | ```bash
 95 | chipseq/
 96 | ├── logs/
 97 | ├── meta/
 98 | ├── raw_data/
 99 | ├── reference_data/
100 | ├── results/
101 | │   ├── bowtie2/
102 | │   └── fastqc/
103 | └── scripts/
104 | ```
105 | 
106 | ```bash
107 | $ mkdir raw_data reference_data scripts logs meta
108 | 
109 | $ mkdir -p results/fastqc results/bowtie2
110 | 
111 | $ tree     # this will show you the directory structure you just created
112 | ```
113 | 
114 | > **NOTE:** We are using the parents flag (`-p` or `--parents`) with `mkdir` to complete the file path by creating any parent directories that do not exist. In our case, we have not yet created the `results` directory and so since it does not exist it will be created. This flag can be very useful when scripting workflows. 
115 | 
116 | **This is a generic directory structure and can be tweaked based on personal preference and analysis workflow.**
117 | 
118 | - `logs`: to keep track of the commands run and the specific parameters used, but also to have a record of any standard output that is generated while running the command. 
119 | - `meta`: for any information that describes the samples you are using, which we refer to as [metadata](https://datamanagement.hms.harvard.edu/metadata-overview). We will discuss this in more detail as it pertains to our example dataset, later in this lesson.
120 | - `raw_data`: for any **unmodified** (raw) data obtained prior to computational analysis here, e.g. FASTQ files from the sequencing center. We strongly recommend leaving this directory unmodified through the analysis.
121 | - `reference_data`: for known information related to the reference genome that will be used in the analysis, e.g. genome sequence (FASTA), gene annotation file (GTF) associated with the genome.
122 | - `results`: for output from the different tools you implement in your workflow. Create sub-folders specific to each tool/step of the workflow within this folder. 
123 | - `scripts`: for scripts that you write and use to run analyses/workflow.
124 | 
125 | 
126 | Now that we have the directory structure created, let's copy over the data to perform our quality control and alignment, including our FASTQ files and reference data files:
127 | 
128 | ```bash
129 | $ cp /n/groups/hbctraining/chip-seq/raw_fastq/*fastq raw_data/
130 | 
131 | $ cp /n/groups/hbctraining/chip-seq/reference_data/chr12* reference_data/
132 | ```
133 | 
134 | Now we are all set up for our analysis!
135 | 
136 | > #### File naming conventions
137 | > 
138 | > Another aspect of staying organized is making sure that all the filenames in an analysis are as consistent as possible, and are not things like `alignment1.bam`, but more like `20170823_kd_rep1_gmap-1.4.bam`. [This link](https://datamanagement.hms.harvard.edu/file-naming-conventions) and [this slideshow](http://www2.stat.duke.edu/~rcs46/lectures_2015/01-markdown-git/slides/naming-slides/naming-slides.pdf) have some good guidelines for file naming dos and don'ts.
139 | 
140 | 
141 | ### Documentation
142 | 
143 | **Documentation doesn't stop at the sequencer!** Keeping notes on what happened in what order, and what was done, is essential for reproducible research.
144 | 
145 | #### Log files
146 | 
147 | In your lab notebook, you likely keep track of the different reagents and kits used for a specific protocol. Similarly, recording information about the tools and parameters is important for documenting your computational experiments. 
148 | 
149 | - **Make note of the software you use.** Do your research and find out what tools are best for the data you are working with. Don't just work with tools that you are able to easily install.
150 | - **Keep track of software versions.** Keep up with the literature and make sure you are using the most up-to-date versions.
151 | - **Record information on parameters used and summary statistics** at every step (e.g., how many adapters were removed, how many reads did not align)
152 |     - A general rule of thumb is to test on a single sample or a subset of the data before running your entire dataset through. This will allow you to debug quicker and give you a chance to also get a feel for the tool and the different parameters.
153 |     - Different tools have different ways of reporting log messages and you might have to experiment a bit to figure out what output to capture. You can redirect standard output with the `>` symbol which is equivalent to `1> (standard out)`; other tools might require you to use `2>` to re-direct the `standard error` instead.
154 |     
155 | #### README files
156 | 
157 | After setting up the directory structure and when the analysis is running it is useful to have a **[README file](https://datamanagement.hms.harvard.edu/readme-files) within your project directory**. This file will usually contain a quick one line summary about the project and any other lines that follow will describe the files/directories found within it. An example README is shown below. Within each sub-directory you can also include README files to describe the analysis and the files that were generated.
158 | 
159 | ```
160 | ## README ##
161 | ## This directory contains data generated during the Intro to ChIP-seq course
162 | ## Date: 
163 | 
164 | There are six subdirectories in this directory:
165 | 
166 | raw_data : contains raw data
167 | meta:  contains...
168 | logs:
169 | reference_data:
170 | results:
171 | scripts:
172 | ```
173 | 
174 | *** 
175 | 
176 | ### Homework Exercise
177 | 
178 | - Create a README for the `chipseq/` folder (hint: use `vim` to create the file). Give a short description of the project and as homework add brief descriptions of the types of files you will be storing within each of the sub-directories. 
179 | 
180 | ***
181 | 
182 | 
183 | ## Exploring the example dataset
184 | 
185 | Our goal for this session is to compare the binding profiles of [Nanog](https://www.nature.com/stemcells/2009/0909/090910/full/stemcells.2009.118.html) and [Pou5f1](https://www.nature.com/articles/7290134) (Oct4). The ChIP was performed on H1 human embryonic stem cell line (h1-ESC) cells, and sequenced using Illumina. The datasets were obtained from the [HAIB TFBS ENCODE collection](http://hgdownload.cse.ucsc.edu/goldenpath/hg19/encodeDCC/wgEncodeHaibTfbs/). These 2 transcription factors are involved in **stem cell pluripotency** and one of the goals is to understand their roles, individually and together, in transcriptional regulation. 
186 | 
187 | Two replicates were collected and each was divided into 3 aliquots for the following:
188 | 
189 | - Nanog IP
190 | - Pou5f1 IP
191 | - Control input DNA
192 | 
193 | <img src="../img/chipseq_exp_design.png" width="500">
194 | 
195 | For these 6 samples, we will be using reads from only a 32.8 Mb region of chromosome 12 (chr12:1,000,000-33,800,000), so we can get through the workflow in a reasonable amount of time. 
196 | 	
197 | 
198 | ***
199 | *This lesson has been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.*
200 | 


--------------------------------------------------------------------------------
/lessons/02_QC_FASTQC.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "FastQC for quality assessment"
  3 | author: "Mary Piper, Radhika Khetani"
  4 | date: "April 17th, 2019"
  5 | ---
  6 | 
  7 | Contributors: Mary Piper, Radhika Khetani
  8 | 
  9 | Approximate time: 55 minutes
 10 | 
 11 | ## Learning Objectives
 12 | 
 13 | * Become familiar with the Illumina sequencing technology
 14 | * Understanding how to use modules in the cluster environment
 15 | * Evaluate the quality of your sequencing data using FastQC
 16 | 
 17 | ## Quality control of sequence reads
 18 | 
 19 | <img src="../img/chip_workflow_june2017_step1_QC.png" width="400">
 20 | 
 21 | Now that we have our files and directory structure, we are ready to begin our ChIP-seq analysis. For any NGS analysis method, our first step in the workflow is to explore the quality of our reads prior to aligning them to the reference genome and proceeding with downstream analyses. 
 22 | 
 23 | ### Understanding the Illumina sequencing technology
 24 | 
 25 | Before we can assess the quality of our reads, it would be helpful to know a little bit about how these reads were generated. Since our data was sequenced on an Illumina sequencer we will introduce you to their Sequencing by Synthesis methodology, however keep in mind there are other technologies and the way reads are generated will vary (as will the associated biases observed in your data). 
 26 | 
 27 | <img src="../img/sbs_illumina.png" width="700">
 28 | 
 29 | An **animation of the Sequencing by Synthesis is most helpful** (rather than reading through lines of text), and so we would like you to take five minutes and watch [this YouTube video](https://www.youtube.com/watch?v=fCd6B5HRaZ8&t=3s) from Illumina.
 30 | 
 31 | ### Unmapped read data (FASTQ)
 32 | 
 33 | The [FASTQ](https://en.wikipedia.org/wiki/FASTQ_format) file format is the de facto file format for sequence reads generated from next-generation sequencing technologies. This file format evolved from FASTA in that it contains sequence data, but also contains quality information. Similar to FASTA, the FASTQ file begins with a header line. The difference is that the FASTQ header is denoted by a `@` character. For a single record (sequence read) there are four lines, each of which is described below:
 34 | 
 35 | |Line|Description|
 36 | |----|-----------|
 37 | |1|Always begins with '@' and then information about the read|
 38 | |2|The actual DNA sequence|
 39 | |3|Always begins with a '+' and sometimes the same info as in line 1|
 40 | |4|Has a string of characters which represent the quality scores; must have same number of characters as line 2|
 41 | 
 42 | Let's use the following read as an example:
 43 | 
 44 | ```
 45 | @HWI-ST330:304:H045HADXX:1:1101:1111:61397
 46 | CACTTGTAAGGGCAGGCCCCCTTCACCCTCCCGCTCCTGGGGGANNNNNNNNNNANNNCGAGGCCCTGGGGTAGAGGGNNNNNNNNNNNNNNGATCTTGG
 47 | +
 48 | @?@DDDDDDHHH?GH:?FCBGGB@C?DBEGIIIIAEF;FCGGI#########################################################
 49 | ```
 50 | 
 51 | As mentioned previously, line 4 has characters encoding the quality of each nucleotide in the read. The legend below provides the mapping of quality scores (Phred-33) to the quality encoding characters. *Different quality encoding scales exist (differing by offset in the ASCII table), but note the most commonly used one is fastqsanger.*
 52 | 
 53 |  ```
 54 |  Quality encoding: !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHI
 55 |                    |         |         |         |         |
 56 |     Quality score: 0........10........20........30........40                                
 57 | ```
 58 |  
 59 | Using the quality encoding character legend, the first nucleotide in the read (C) is called with a quality score of 31 and our Ns are called with a score of 2. **As you can tell by now, this is a bad read.** 
 60 | 
 61 | Each quality score represents the probability that the corresponding nucleotide call is incorrect. This quality score is logarithmically based and is calculated as:
 62 | 
 63 | 	Q = -10 x log10(P), where P is the probability that a base call is erroneous
 64 | 
 65 | These probability values are the results from the base calling algorithm and are dependent on how much signal was captured for the base incorporation. The score values can be interpreted as follows:
 66 | 
 67 | |Phred Quality Score |Probability of incorrect base call |Base call accuracy|
 68 | |:-------------------|:---------------------------------:|-----------------:|
 69 | |10	|1 in 10 |	90%|
 70 | |20	|1 in 100|	99%|
 71 | |30	|1 in 1000|	99.9%|
 72 | |40	|1 in 10,000|	99.99%|
 73 | |50	|1 in 100,000|	99.999%|
 74 | |60	|1 in 1,000,000|	99.9999%|
 75 | 
 76 | Therefore, for the first nucleotide in the read (C), there is less than a 1 in 1000 chance that the base was called incorrectly. Whereas, for the end of the read there is greater than 50% probability that the base is called incorrectly.
 77 | 
 78 | ## Assessing quality with FastQC
 79 | 
 80 | Now that we understand what information is stored in a FASTQ file, the next step is to examine quality metrics for our data.
 81 | 
 82 | [FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) provides a simple way to do some quality control checks on raw sequence data coming from high throughput sequencing pipelines. It provides a modular set of analyses which you can use to give a quick impression of whether your data has any problems of which you should be aware before doing any further analysis.
 83 | 
 84 | The main functions of FastQC are:
 85 | 
 86 | * Import of data from BAM, SAM or FastQ files (any variant)
 87 | * Providing a quick overview to tell you in which areas there may be problems
 88 | * Summary graphs and tables to quickly assess your data
 89 | * Export of results to an HTML based permanent report
 90 | * Offline operation to allow automated generation of reports without running the interactive application
 91 | 
 92 | ### Run FastQC  
 93 | 
 94 | Let's run FastQC on all of our files. 
 95 | 
 96 | Change directories to the `raw_data` folder and check the contents
 97 | 
 98 | ```bash
 99 | $ cd ~/chipseq/raw_data 
100 | 
101 | $ ls -l
102 | ```
103 | 
104 | Before we start using any software, we have to check whether it's available on the cluster and, if it is, load it into our environment (or `$PATH`). On the O2 cluster, we can check for and load tools using modules. 
105 | 
106 | If we check which modules we currently have loaded, we should not see FastQC.
107 | 
108 | ```bash
109 | $ module list
110 | ```
111 | 
112 | > **NOTE:** If we check our $PATH variable, we will see that the FastQC program is not in our $PATH (i.e. it's not in a directory that unix will automatically check to run commands/programs).
113 | >
114 | > ```bash
115 | > $ echo $PATH
116 | > ```
117 | 
118 | To find the FastQC module to load we need to search the versions available:
119 | 
120 | ```bash
121 | $ module spider
122 | ```
123 | 
124 | Then we can load the FastQC module:
125 | 
126 | ```bash
127 | $ module load fastqc/0.11.5
128 | ```
129 | 
130 | Once a module for a tool is loaded, you have essentially made it directly available to you like any other basic UNIX command.
131 | 
132 | ```bash
133 | $ module list
134 | 
135 | $ echo $PATH
136 | ```
137 | 
138 | FastQC will accept multiple file names as input, so we can use the `*.fastq` wildcard.
139 | 
140 | ```bash
141 | $ fastqc *.fastq
142 | ```
143 | 
144 | *Did you notice how each file was processed serially? How do we speed this up?*
145 | 
146 | Exit the interactive session and once you are on a "login node," start a new interactive session with 6 cores. Now we can use the multi-threading functionality of FastQC to speed this up by running 6 jobs at once, one job for one file.
147 | 
148 | ```bash
149 | $ exit  #exit the current interactive session
150 | 
151 | $ srun --pty -c 6 -p interactive -t 0-12:00 --mem 1G --reservation=HBC2 /bin/bash  #start a new one with 6 cpus (-c 6) and 1G RAM (--mem 1G)
152 | 
153 | $ module load fastqc/0.11.5  #reload the module for the new session
154 | 
155 | $ cd ~/chipseq/raw_data
156 | 
157 | $ fastqc -t 6 *.fastq  #note the extra parameter we specified for 6 threads
158 | ```
159 | 
160 | How did I know about the -t argument for FastQC?
161 | 
162 | ```bash
163 | $ fastqc --help
164 | ```
165 | 
166 | Now, move all of the `fastqc` files to the `results/fastqc` directory:
167 | 
168 | ```bash
169 | $ mv *fastqc* ../results/fastqc/
170 | ```
171 | 
172 | ### FastQC Results
173 |    
174 | Let's take a closer look at the files generated by FastQC:
175 |    
176 | `$ ls -lh ../results/fastqc/`
177 | 
178 | #### HTML reports
179 | The .html files contain the final reports generated by fastqc, let's take a closer look at them. Transfer the file for `H1hesc_Input_Rep1_chr12.fastq` over to your laptop via *FileZilla*.
180 | 
181 | ##### Filezilla - Step 1
182 | 
183 | Open *FileZilla*, and click on the File tab. Choose 'Site Manager'.
184 |  
185 | <img src="../img/filezilla_setup.png" width="500">	
186 | 
187 | ##### Filezilla - Step 2
188 | 
189 | Within the 'Site Manager' window, do the following: 
190 | 
191 | 1. Click on 'New Site', and name it something intuitive (e.g. O2)
192 | 2. Host: transfer.rc.hms.harvard.edu 
193 | 3. Protocol: SFTP - SSH File Transfer Protocol
194 | 4. Logon Type: Normal
195 | 5. User: training_account
196 | 6. Password: password for training_account
197 | 7. Click 'Connect'
198 | 
199 | <img src="../img/filezilla_login.png" width="500">	
200 | 	
201 | The **"Per base sequence quality"** plot is the most important analysis module in FastQC for ChIP-seq; it provides the distribution of quality scores across all bases at each position in the reads. This information can help determine whether there were any problems at the sequencing facility during the sequencing of your data. Generally, we expect a decrease in quality towards the ends of the reads, but we shouldn't see any quality drops at the beginning or in the middle of the reads.
202 | 
203 | ![FastQC_seq_qual](../img/FastQC_seq_qual.png)
204 | 
205 | Based on the sequence quality plot, we see the majority of the reads have high quality, but the whiskers drop into the poor quality regions, indicating that a significant number of reads have low quality bases across the reads. The poor quality reads in the middle of the sequence would be concerning if this was our dataset, and we would probably want to contact the sequencing facility. However, this dataset was created artificially, so does not indicate a problem at the sequencing facility. Trimming could be performed from both ends of the sequences, or we can use an alignment tool that can ignore these poor quality bases at the ends of reads (soft clip). 
206 | 
207 | This is the main plot explored for ChIP-seq, but if you would like to go through the remaining plots/metrics, FastQC has a really well documented [manual page](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) with [more details](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/) about all the plots in the report. For ChIP-seq data, we recommend checking the following metrics:
208 | 
209 | - **Per base sequence composition:** You can expect to encounter read start sequence biases in this plot if fragmenting with transposases. 
210 | - **Sequence duplication levels:** This plot can help you get an idea of what proportion of your library corresponds to duplicates. Duplicates are typically removed (even if there is a chance that they are biological duplicates); therefore, if there is a large amount of duplication, the number of reads available for mapping and/or for peak calling will be reduced.
211 | - **Over-represented sequences:**  Over-represented sequences are either highly biologically significant or represent biases. With ChIP-seq you expect to see over-represented sequences in the IP sample because that's exactly what you're doing - enriching for particular sequences based on binding affinity. However, lack of over-represented sequences doesn’t mean you have a bad experiment. If you see over-represented sequences in the input, that usually reflects some bias in the protocol to specific regions.
212 | 
213 | We recommend looking at [this post](http://bioinfo-core.org/index.php/9th_Discussion-28_October_2010) for more information on what bad plots look like and what they mean for your data. Also, FastQC is just an indicator of what's going on with your data, don't take the "PASS"es and "FAIL"s too seriously.
214 | 
215 | > **We also have a [slidedeck](https://github.com/hbctraining/Intro-to-rnaseq-hpc-O2/raw/master/lectures/error_profiles_mm.pdf) of error profiles for Illumina sequencing, where we discuss specific FASTQC plots and possible sources of these types of errors.**
216 | 
217 | ***
218 | *This lesson has been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.*
219 | 
220 | 


--------------------------------------------------------------------------------
/lessons/03_align_and_filtering.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Alignment and filtering"
  3 | author: "Mary Piper, Radhika Khetani"
  4 | date: "April 17th, 2019"
  5 | ---
  6 | 
  7 | Contributors: Mary Piper, Radhika Khetani, Meeta Mistry
  8 | 
  9 | Approximate time: 45 minutes
 10 | 
 11 | ## Learning Objectives
 12 | 
 13 | * Perform alignment of reads to the genome using Bowtie2
 14 | * Examining a SAM file and understanding the information stored in it
 15 | * Filtering aligned reads to keep only uniquely mapped ones
 16 | 
 17 | 
 18 | ## Alignment to Genome
 19 | 
 20 | Now that we have assessed the quality of our sequence data, we are ready to align the reads to the reference genome. [Bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml) is a fast and accurate alignment tool that indexes the genome with an FM Index based on the Burrows-Wheeler Transform method to keep memory requirements low for the alignment process. *Bowtie2* supports gapped, local and paired-end alignment modes and works best for reads that are at least 50 bp (shorter read lengths should use Bowtie1). By default, Bowtie2 will perform a global end-to-end read alignment, which is best for quality-trimmed reads. However, it also has a local alignment mode, which will perform soft-clipping for the removal of poor quality bases or adapters from untrimmed reads. We will use this option since we did not trim our reads.
 21 | 
 22 | > _**NOTE:** Our reads are only 36 bp, so technically we should explore alignment with [bwa](http://bio-bwa.sourceforge.net/) or Bowtie1 to see if it is better. However, since it is rare that you will have sequencing reads with less than 50 bp, we will show you how to perform alignment using Bowtie2._
 23 | 
 24 | <img src="../img/chip_workflow_june2017_step1_align.png" width="400">
 25 | 
 26 | > #### How do other aligners compare?
 27 | > In this workshop we are using Bowtie2 to align our reads, but there are a number of other options. We have explored the use of [bwa](http://bio-bwa.sourceforge.net/) for ChIP-seq analysis and found some differences. For **bwa**, the mapping rates are higher (~ 2%), with a similar increase in the number of duplicate mappings identified. Post-filtering this translates to a significantly higher number of mapped reads and results in a much larger number of peaks being called (30% increase). When we compare the peak calls generated from the different aligners, the **bwa** peak calls are a superset of those called from the Bowtie2 alignments. Whether or not these additional peaks are true positives is something that is yet to be determined. 
 28 | 
 29 | ### Creating a Bowtie2 index
 30 | 
 31 | To perform the Bowtie2 alignment, a genome index is required. The index is analogous to the index in the back of a book. By indexing the genome, we have organized it in a manner that now allows for efficient search and retrieval of matches of the query (sequence read) to the genome. **We previously generated the genome indices for you**, and they exist in the `reference_data` directory.
 32 | 
 33 | However, if you needed to create a genome index yourself, you would use the following command:
 34 | 
 35 | ```bash
 36 | # DO NOT RUN
 37 | 
 38 | bowtie2-build <path_to_reference_genome.fa> <prefix_to_name_indexes>
 39 | ```
 40 | 
 41 | > A quick note on shared databases for human and other commonly used model organisms. The O2 cluster has a designated directory at `/n/groups/shared_databases/` in which there are files that can be accessed by any user. These files contain, but are not limited to, genome indices for various tools, reference sequences, tool specific data, and data from public databases, such as NCBI and PDB. So when using a tool that requires a reference of sorts, it is worth taking a quick look here because chances are it's already been taken care of for you. 
 42 | 
 43 | >```bash
 44 | >$ ls -l /n/groups/shared_databases/igenome/
 45 | >```
 46 | 
 47 | ### Aligning reads to the genome with Bowtie2
 48 | 
 49 | Since we have our indices already created, we can get started with read alignment. Change directories to the `bowtie2` folder:
 50 | 
 51 | ```bash
 52 | $ cd ~/chipseq/results/bowtie2
 53 | ```
 54 | 
 55 | Now let's load the module. We can find out more on the module on O2:
 56 | 
 57 | ```bash
 58 | $ module spider bowtie2
 59 | ```
 60 | You will notice that before we load this module we also need to load the gcc compiler (as will be the case for many of the NGS analysis tools on O2. Always check `module spider` first.)
 61 | 
 62 | ```bash
 63 | $ module load gcc/6.2.0 bowtie2/2.2.9
 64 | ```
 65 | 
 66 | We will perform alignment on our single raw FASTQ file, `H1hesc_Input_Rep1_chr12.fastq`. Details on Bowtie2 and its functionality can be found in the [user manual](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml); we encourage you to peruse through to get familiar with all available options.
 67 | 
 68 | The basic options for aligning reads to the genome using Bowtie2 are:
 69 | 
 70 | * `-p`: number of processors / cores
 71 | * `-q`: reads are in FASTQ format
 72 | * `--local`: local alignment feature to perform soft-clipping
 73 | * `-x`: /path/to/genome_indices_directory/index_prefix (the basename shared by the index files, not just the directory)
 74 | * `-U`: /path/to/FASTQ_file
 75 | * `-S`: /path/to/output/SAM_file
 76 | 
 77 | ```bash
 78 | $ bowtie2 -p 2 -q --local \
 79 | -x ~/chipseq/reference_data/chr12 \
 80 | -U ~/chipseq/raw_data/H1hesc_Input_Rep1_chr12.fastq \
 81 | -S ~/chipseq/results/bowtie2/H1hesc_Input_Rep1_chr12_aln_unsorted.sam
 82 | ```
 83 | 
 84 | ## Alignment file format: SAM/BAM
 85 | 
 86 | The output we requested from the Bowtie2 aligner is an unsorted SAM file, also known as **Sequence Alignment Map format**. The SAM file is a **tab-delimited text file** that contains information for each individual read and its alignment to the genome. While we will go into some features of the SAM format, the paper by [Heng Li et al](http://bioinformatics.oxfordjournals.org/content/25/16/2078.full) provides a lot more detail on the specification.
 87 | 
 88 | The file begins with a **header**, which is optional. The header is used to describe source of data, reference sequence, method of alignment, etc., this will change depending on the aligner being used. Each section begins with character ‘@’ followed by **a two-letter record type code**.  These are followed by two-letter tags and values. Example of some common sections are provided below:
 89 | 
 90 | ```
 91 | @HD  The header line
 92 | VN: format version
 93 | SO: Sorting order of alignments
 94 | 
 95 | @SQ  Reference sequence dictionary
 96 | SN: reference sequence name
 97 | LN: reference sequence length
 98 | SP: species
 99 | 
100 | @PG  Program
101 | PN: program name
102 | VN: program version
103 | ```
104 | 
105 | Following the header is the **alignment section**. Each line that follows corresponds to alignment information for a single read. Each alignment line has **11 mandatory fields for essential mapping information** and a variable number of other fields for aligner specific information. 
106 | 
107 | ![SAM1](../img/sam_bam.png)
108 | 
109 | An example read mapping is displayed above. *Note that the example above spans two lines, but in the file it is a single line.* Let's go through the fields one at a time. 
110 | 
111 | - **`QNAME`:** Query name or read name - this is the same read name present in the header of the FASTQ file
112 | - **`FLAG`:** numerical value providing information about read mapping and whether the read is part of a pair.
113 |  
114 |   > **NOTE:** The information stored inside the FLAG is additive based on the following information being TRUE or FALSE:
115 |   > 
116 |   > | Flag | Description |
117 |   > | ------:|:----------------------:|
118 |   > | 1 | read is paired |
119 |   > | 2 | read is mapped in a proper pair |
120 |   > | 4 | read is unmapped |
121 |   > | 8 | mate is unmapped |
122 |   > | 16| read reverse strand|
123 |   > | 32 | mate reverse strand |
124 |   > | 64 | first in pair |
125 |   > | 128 | second in pair |
126 |   > | 256 | not primary alignment |
127 |   > | 512 | read fails platform/vendor quality checks |
128 |   > | 1024| read is PCR or optical duplicate |
129 |   > 
130 |   > * For a given alignment, each of these flags is either **on or off**, indicating the condition is **true or false**. 
131 |   > * The `FLAG` is a combination of all of the individual flags (from the table above) that are true for the alignment 
132 |   > * The beauty of the flag values is that **any combination of flags can only result in one sum**.
133 |   > 
134 |   > **There are tools that help you translate the bitwise flag, for example [this one from Picard](https://broadinstitute.github.io/picard/explain-flags.html)**
135 | 
136 | - **`RNAME`:** is the reference sequence name, giving the chromosome to which the read mapped. The example read is from chromosome 1 which explains why we see 'chr1'. 
137 | - **`POS`:** refers to the 1-based leftmost position of the alignment. 
138 | - **`MAPQ`:** is giving us the alignment quality, the scale of which will depend on the aligner being used. 
139 | - **`CIGAR`:** is a sequence of letters and numbers that represent the *edits or operations* required to match the read to the reference. The letters are operations that are used to indicate which bases align to the reference (i.e. match, mismatch, deletion, insertion), and the numbers indicate the associated base lengths for each 'operation'.
140 | 
141 | Now to the remaining fields in our SAM file:
142 | 
143 | ![SAM1](../img/sam_bam3.png)
144 | 
145 | The next three fields are more pertinent to paired-end data. 
146 | 
147 | - **`MRNM`:** is the mate reference name. 
148 | - **`MPOS`:** is the mate position (1-based, leftmost). 
149 | - **`ISIZE`:** is the inferred insert size.
150 | 
151 | Finally, you have the raw sequence data from the original FASTQ file stored for each read:
152 | 
153 | - **`SEQ`:** is the raw sequence
154 | - **`QUAL`:** is the associated quality values for each position in the read.
155 | 
156 | 
157 | Let's take a quick peek at our SAM file that we just generated. Since it is just a text file, we can browse through it using `less`:
158 | 
159 | ``` bash
160 | $ less H1hesc_Input_Rep1_chr12_aln_unsorted.sam
161 | ```
162 | 
163 | **Does the information you see line up with the fields we described above?**
164 | 
165 | ## Filtering reads
166 | 
167 | An important issue with ChIP-seq data concerns the inclusion of multiple mapped reads (reads mapped to multiple loci on the reference genome). **Allowing for multiple mapped reads increases the number of usable reads and the sensitivity of peak detection; however, the number of false positives may also increase** [[1]](https://www.ncbi.nlm.nih.gov/pubmed/21779159/). Therefore we need to filter our alignment files to **contain only uniquely mapping reads** in order to increase confidence in site discovery and improve reproducibility. Since there is no parameter in Bowtie2 to keep only uniquely mapping reads, we will need to perform the following steps to generate alignment files containing only the uniquely mapping reads:
168 | 
169 | 1. Change alignment file format from SAM to BAM
170 | 2. Sort BAM file by read coordinate locations
171 | 3. Filter to keep only uniquely mapping reads (this will also remove any unmapped reads)
172 | 
173 | ### 1. Changing file format from SAM to BAM
174 | 
175 | While the SAM alignment file output by Bowtie2 is human readable, we need a BAM alignment file for downstream tools. Therefore, we will use [Samtools](http://samtools.github.io) to convert the file formats.
176 | 
177 | To use `samtools` we will need to load the module:
178 | 
179 | ```bash
180 | $ module load gcc/6.2.0 # you may not need to load this if you are working in the same session from Bowtie2
181 | $ module load samtools/1.9
182 | ```
183 | 
184 | The command we will use is `samtools view` with the following parameters:
185 | 
186 | * `-h`: include header in output
187 | * `-S`: input is in SAM format
188 | * `-b`: output BAM format
189 | * `-o`: /path/to/output/file
190 | 
191 | ```bash
192 | $ samtools view -h -S -b \
193 | -o H1hesc_Input_Rep1_chr12_aln_unsorted.bam \
194 | H1hesc_Input_Rep1_chr12_aln_unsorted.sam
195 | ```
196 | 
197 | You can find additional parameters for the samtools functions in the [manual](http://www.htslib.org/doc/samtools-1.2.html).
198 | 
199 | ### 2. Sorting BAM files by genomic coordinates
200 | 
201 | Before we can filter to keep the uniquely mapping reads, we need to sort our BAM alignment files by genomic coordinates (instead of by name). To perform this sort, we will use [Sambamba](http://lomereiter.github.io/sambamba/index.html), which is a tool that quickly processes BAM and SAM files.
202 | 
203 | The command we will use is `sambamba sort` with the following parameters:
204 | 
205 | * `-t`: number of threads / cores
206 | * `-o`: /path/to/output/file
207 | 
208 | ```bash
209 | $ sambamba sort -t 2 \
210 | -o H1hesc_Input_Rep1_chr12_aln_sorted.bam \
211 | H1hesc_Input_Rep1_chr12_aln_unsorted.bam 
212 | ```
213 | 
214 | > **NOTE: This tool is not available as a module on O2.** You will only be able to use this as part of the tools available in the `bcbio` pipeline. In a previous lesson, you had added this to your $PATH by modifying your `.bashrc` file. **If the command above does not work for you, run this line below:**
215 | > 
216 | > `export PATH=/n/app/bcbio/tools/bin:$PATH`
217 | 
218 | 
219 | We could have also used `samtools` to perform the above sort, however using `sambamba` gives us dual functionality. List the contents of the directory -- what do you see? The advantage to using `sambamba` is that along with the newly sorted file, an index file is generated. If we used `samtools` this would have been a two-step process.
220 | 
221 | ### 3. Filtering uniquely mapping reads
222 | 
223 | Finally, we can filter the BAM to keep only uniquely mapping reads. We will use the `sambamba view` command with the following parameters:
224 | 
225 | * `-t`: number of threads / cores
226 | * `-h`: print SAM header before reads
227 | * `-f`: format of output file (default is SAM)
228 | * `-F`: set [custom filter](https://github.com/lomereiter/sambamba/wiki/%5Bsambamba-view%5D-Filter-expression-syntax) - we will be using the filter to remove duplicates, multimappers and unmapped reads.
229 | 
230 | ```bash
231 | $ sambamba view -h -t 2 -f bam \
232 | -F "[XS] == null and not unmapped  and not duplicate" \
233 | H1hesc_Input_Rep1_chr12_aln_sorted.bam > H1hesc_Input_Rep1_chr12_aln.bam
234 | ```
235 | We filtered out unmapped reads by specifying in the filter `not unmapped`, and duplicates with `not duplicate`. Also, among the reads that were aligned, we filtered out multimappers by specifying `[XS] == null`. 'XS' is a tag generated by Bowtie2 that gives an alignment score for the second-best alignment, and it is only present if the read is aligned and more than one alignment was found for the read.
236 | 
237 | Now that the alignment files contain only uniquely mapping reads, we are ready to perform peak calling.
238 | 
239 | > _**NOTE:** After performing read alignment, it's useful to generate QC metrics for the alignment using tools such as [MultiQC](http://multiqc.info) prior to moving on to the next steps of the analysis._
240 | 
241 | > ### Filtering out Blacklisted Regions
242 | > Although we will not perform this step, it is common practice to apply an additional level of filtering to our BAM files. That is, we remove alignments that occur within defined Blacklisted Regions.
243 | > 
244 | > Blacklisted regions represent artifact regions that tend to show artificially high signal (excessive unstructured anomalous reads mapping). These regions are often found at specific types of repeats such as centromeres, telomeres and satellite repeats and typically appear uniquely mappable so simple mappability filters applied above do not remove them. The ENCODE and modENCODE consortia have compiled blacklists for various species and genome versions including human, mouse, worm and fly. These blacklisted regions (coordinate files) can be filtered out from our alignment files before proceeding to peak calling.
245 | > 
246 | > We will revisit this in more detail when we [discuss QC metrics](https://hbctraining.github.io/Intro-to-ChIPseq/lessons/07_QC_quality_metrics.html) in a later lesson.
247 | 
248 | ***
249 | *This lesson has been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.*
250 | 
251 | 


--------------------------------------------------------------------------------
/lessons/10_data_visualization.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Visualization of peaks"
  3 | author: "Meeta Mistry"
  4 | date: "Thursday July 29th, 2017"
  5 | ---
  6 | 
  7 | Approximate time: 80 minutes
  8 | 
  9 | ## Learning Objectives
 10 | * Generate bigWig files
 11 | * Visualizing enrichment patterns at particular locations in the genome
 12 | 
 13 | ## Visualization of ChIP-seq data
 14 | 
 15 | The first part of ChIP-sequencing analysis uses common processing pipelines, which involves the alignment of raw reads to the genome, data filtering, and identification of enriched signal regions (peak calling). In the second stage, individual software programs allow detailed analysis of those peaks, biological interpretation, and visualization of ChIP-seq results.
 16 | 
 17 | There are various strategies for visualizing enrichment patterns and we will explore a few of them. To start, we will create bigWig files for our samples, a standard file format commonly used for ChIP-seq data visualization.
 18 | 
 19 | ## Creating bigWig files
 20 | 
 21 | The first thing we want to do is take our alignment files (BAM) and convert them into bigWig files. The bigWig format is an indexed binary format useful for dense, continuous data that will be displayed in a genome browser as a graph/track, but also is used as input for some of the visualization commands we will be running in `deepTools`. 
 22 | 
 23 | [`deepTools`](http://deeptools.readthedocs.org/en/latest/content/list_of_tools.html) is a suite of Python tools developed for the efficient analysis of high-throughput sequencing data, such as ChIP-seq, RNA-seq or MNase-seq. `deepTools` has a wide variety of commands that go beyond those that are covered in this lesson. We encourage you to look through the documentation and explore on your own time. 
 24 | 
 25 | 
 26 | <img src="../img/bam_to_bigwig.png" width="700">
 27 | 
 28 | *Image acquired from [deepTools documentation](http://deeptools.readthedocs.io/en/latest/content/tools/bamCoverage.html?highlight=bigwig) pages*
 29 | 
 30 | Start an interactive session with 6 cores. *If you are already logged on to a compute node you will want to exit and start a new session*.
 31 | 
 32 | ```bash
 33 | $ srun --pty -p interactive -t 0-12:00 --mem 8G -c 6 --reservation=HBC2 bash
 34 | ```
 35 | 
 36 | We will begin by creating a directory for the visualization output and loading the required modules to run `deepTools`.
 37 | 
 38 | ```bash
 39 | $ cd ~/chipseq/results/
 40 | $ mkdir -p visualization/bigWig visualization/figures
 41 | ```
 42 | 
 43 | ```bash
 44 | $ module load gcc/6.2.0  python/2.7.12
 45 | $ module load deeptools/3.0.2 
 46 | ```
 47 | 
 48 | One last thing we need to do is **create an index file for each one of our BAM files**. To perform some functions on the BAM file, many tools require an index. Think of an index located at the back of a textbook. When you are interested in a particular subject area you look for the keyword in the index and identify the pages that contain the relevant information. Similarly, indexing the BAM file aims to achieve fast retrieval of alignments overlapping a specified region without going through the whole alignment file. 
 49 | 
 50 | In order to index a BAM file, we will use [SAMtools](http://samtools.sourceforge.net/), a tool suite that provides a lot of functionality in dealing with alignment files. There is a command called **`samtools index`**, which is what we will use. Since we need an index for each of our BAM files, we will put this in a `for` loop to avoid having to run the same command multiple times.
 51 | 
 52 | First, let's load the module:
 53 | 
 54 | ```bash
 55 | $ module load samtools/1.9
 56 | ```
 57 | 
 58 | Now, at the command prompt start the **`for` loop**:
 59 | 
 60 | ```bash
 61 | 
 62 | for file in ~/chipseq/results/bowtie2/*aln.bam
 63 | do
 64 | samtools index $file
 65 | done
 66 | ```
 67 | 
 68 | > **NOTE:** The above is assuming that you are pressing return after each line of code. If you wanted you could also run this command as a single line:
 69 | >
 70 | > `$ for file in ~/chipseq/results/bowtie2/*aln.bam; do samtools index $file; done`
 71 | >
 72 | 
 73 | Now, to create our bigWig files there are two tools that can be useful: `bamCoverage` and `bamCompare`. The former will take in a single BAM file and return to you a bigWig file. The latter allows you to normalize two files to each other (i.e. ChIP sample relative to input) and will return a single bigWig file.
 74 | 
 75 | Let's **create a bigWig file for Nanog replicate 2** using the `bamCoverage` command. In addition to the input and output files, there are a few additional parameters we have added. 
 76 | 
 77 | * `normalizeUsing`: Possible choices: RPKM, CPM, BPM, RPGC. We will use BPM (Bins Per Million), which is similar to TPM in RNA-seq. BPM (per bin) = number of reads per bin / sum of all reads per bin (in millions).
 78 | * `binSize`: size of bins in bases
 79 | * `smoothLength`: defines a window, larger than the `binSize`, to average the number of reads over. This helps produce a more continuous plot.
 80 | * `centerReads`: reads are centered with respect to the fragment length as specified by `extendReads`. This option is useful to get a sharper signal around enriched regions.
 81 | 
 82 | ```bash
 83 | $ bamCoverage -b bowtie2/H1hesc_Nanog_Rep2_aln.bam \
 84 | -o visualization/bigWig/H1hesc_Nanog_Rep2.bw \
 85 | --binSize 20 \
 86 | --normalizeUsing BPM \
 87 | --smoothLength 60 \
 88 | --extendReads 150 \
 89 | --centerReads \
 90 | -p 6 2> ../logs/Nanog_rep2_bamCoverage.log
 91 | ```
 92 | We can do the same for the **Pou5f1 replicate 1**:
 93 | 
 94 | ```bash
 95 | $ bamCoverage -b bowtie2/H1hesc_Pou5f1_Rep1_aln.bam \
 96 | -o visualization/bigWig/H1hesc_Pou5f1_Rep1.bw \
 97 | --binSize 20 \
 98 | --normalizeUsing BPM \
 99 | --smoothLength 60 \
100 | --extendReads 150 \
101 | --centerReads \
102 | -p 6 2> ../logs/Pou5f1_rep1_bamCoverage.log
103 | ```
104 | >**NOTE:** There is a reason we chose the specific replicates for the above commands, and it will become more obvious as we get to the end of this lesson!
105 | 
106 | Now, if we wanted to **create a bigWig file in which we normalize the ChIP against the input** we would use `bamCompare`. The command is quite similar to `bamCoverage`, the only difference being you require two files as input (`b1` and `b2`).
107 | 
108 | ```bash
109 | ## DO NOT RUN THIS
110 | 
111 | $ bamCompare -b1 bowtie2/H1hesc_Pou5f1_Rep1_aln.bam \
112 | -b2 bowtie2/H1hesc_Input_Rep1_chr12_aln.bam \
113 | -o visualization/bigWig/H1hesc_Pou5f1_Rep1_bgNorm.bw \
114 | --binSize 20 \
115 | --normalizeUsing BPM \
116 | --smoothLength 60 \
117 | --extendReads 150 \
118 | --centerReads \
119 | -p 6 2> ../logs/Pou5f1_rep1_bamCompare.log
120 | ```
121 | 
122 | > **NOTE:** When you are creating bigWig files for your full dataset, this will take considerably longer and you will not want to run this interactively (except for testing purposes). Instead, you will want to write a job submission script with a loop that runs this command over all of your BAM files.
123 | 
124 | Since we are using a toy dataset which contains only a subset of the data, using these bigWigs for visualization would not give us meaningful results. As such, **we have created bigWig files from the full dataset that you can use for the rest of this lesson.**
125 | 
126 | 
127 | ## Profile plots and heatmaps
128 | 
129 | Because many cis-regulatory elements are close to TSSs of their targets, a common visualization technique is to use bigWig files to obtain a global evaluation of enrichment around the TSS. In our example, we will assess enrichment around the TSS and plot this separately for the Nanog and Pou5f1 samples (two replicates in each plot). 
130 | 
131 | Rather than looking at the TSS for all known genes, we will only be looking at genes on chromosome 12 in the interest of time. Copy over the BED file which contains the coordinates for all genes on chromosome 12 to the visualization folder.
132 | 
133 | ```bash
134 | $ cp /n/groups/hbctraining/chip-seq/deepTools/chr12_genes.bed ~/chipseq/results/visualization/
135 | ```
136 | 
137 | Before we start plotting our data, we first need to prepare an intermediate file that can be used with the `plotHeatmap` and `plotProfile` commands.
138 | 
139 | <img src="../img/computeMatrix_overview.png" width="700">
140 | 
141 | 
142 | The `computeMatrix` command accepts multiple bigWig files and multiple region files (BED format) to create a count matrix which is the intermediate file. It can also be used to filter and sort regions according to their score. Our region file will be the BED file we just copied over and our bigWig files will be those generated from the full dataset that we have provided for you. Additionally, we will specify a window of +/- 1000bp around the TSS of genes (`-b` and `-a`). For each window, `computeMatrix` will calculate scores based on the read density values in the bigWig files.
143 | 
144 | First, let's create a matrix for the Nanog replicates:
145 | 
146 | ```bash
147 | 
148 | $ computeMatrix reference-point --referencePoint TSS \
149 | -b 1000 -a 1000 \
150 | -R ~/chipseq/results/visualization/chr12_genes.bed \
151 | -S /n/groups/hbctraining/chip-seq/full-dataset/bigWig/Encode_Nanog*.bw \
152 | --skipZeros \
153 | -o ~/chipseq/results/visualization/matrixNanog_TSS_chr12.gz \
154 | -p 6 \
155 | --outFileSortedRegions ~/chipseq/results/visualization/regions_TSS_chr12.bed
156 | 
157 | ```
158 | 
159 | > **NOTE:** Typically, the genome regions are genes, and can be obtained from the [UCSC table browser](http://rohsdb.cmb.usc.edu/GBshape/cgi-bin/hgTables). Alternatively, you could look at other regions of interest that are not genomic feature related (i.e. binding regions from another protein of interest).
160 | 
161 | Now, let's create another matrix for the Pou5f1 replicates:
162 | 
163 | ```bash
164 | 
165 | $ computeMatrix reference-point --referencePoint TSS \
166 | -b 1000 -a 1000 \
167 | -R ~/chipseq/results/visualization/chr12_genes.bed \
168 | -S /n/groups/hbctraining/chip-seq/full-dataset/bigWig/Encode_Pou5f1*.bw \
169 | --skipZeros \
170 | -p 6 \
171 | -o ~/chipseq/results/visualization/matrixPou5f1_TSS_chr12.gz \
172 | --outFileSortedRegions ~/chipseq/results/visualization/regionsPou5f1_TSS_chr12.bed
173 | 
174 | ```
175 | 
176 | Using that matrix we can create a **profile plot** which is essentially a density plot that evaluates read density across all transcription start sites. For Nanog, we can see that **Replicate 2 has a noticeably higher amount of signal at the TSS compared to Replicate 1**. 
177 | 
178 | ```bash
179 | $ plotProfile -m visualization/matrixNanog_TSS_chr12.gz \
180 | -out visualization/figures/TSS_Nanog_profile.png \
181 | --perGroup \
182 | --colors green purple \
183 | --plotTitle "" --samplesLabel "Rep1" "Rep2" \
184 | --refPointLabel "TSS" \
185 | -T "Nanog read density" \
186 | -z ""
187 | 
188 | ```
189 | 
190 | <img src="../img/TSS_Nanog_profile.png" width="500">
191 | 
192 | Alternatively, we could use a **heatmap** to evaluate the same matrix of information:
193 | 
194 | ```bash
195 | $ plotHeatmap -m visualization/matrixNanog_TSS_chr12.gz \
196 | -out visualization/figures/TSS_Nanog_heatmap.png \
197 | --colorMap RdBu \
198 | --whatToShow 'heatmap and colorbar' \
199 | --zMin -4 --zMax 4  
200 | ```
201 | <img src="../img/TSS_Nanog_heatmap.png" width="400">
202 | 
203 | 
204 | Similarly we can do the same for **Pou5f1. Here, we find that Replicate 1 exhibits stronger signal**.
205 | 
206 | ```bash
207 | $ plotProfile -m visualization/matrixPou5f1_TSS_chr12.gz \
208 | -out visualization/figures/TSS_Pou5f1_profile.png \
209 | --perGroup --colors green purple \
210 | --plotTitle "" --samplesLabel "Rep1" "Rep2" \
211 | --refPointLabel "TSS" -T "Pou5f1 read density" -z ""
212 | ```
213 | 
214 | <img src="../img/TSS_Pou5f1_profile.png" width="400">
215 | 
216 | ```bash
217 | $ plotHeatmap -m visualization/matrixPou5f1_TSS_chr12.gz \
218 | -out visualization/figures/TSS_Pou5f1_heatmap.png \
219 | --colorMap RdBu \
220 | --whatToShow 'heatmap and colorbar' \
221 | --zMin -2 --zMax 2  
222 | ```
223 | 
224 | <img src="../img/TSS_Pou5f1_heatmap.png" width="400">
225 | 
226 | If we wanted **both images in one single plot**, we can do that by using `plotHeatmap` and simply removing the `--whatToShow` parameter.
227 | 
228 | ```bash
229 | $ plotHeatmap -m visualization/matrixPou5f1_TSS_chr12.gz \
230 | -out visualization/figures/TSS_Pou5f1_profile-heatmap.png \
231 | --colorMap RdBu \
232 | --zMin -2 --zMax 2  
233 | ```
234 | 
235 | <img src="../img/TSS_Pou5f1_heatmap_and_profile.png" width="400">
236 | 
237 | > **NOTE:** Both `plotProfile` and `plotHeatmap` have many options, including the ability to change the type of lines plotted and to plot by group rather than sample. We encourage you to explore the documentation to find out more detail.
238 | 
239 | ## Visualizing enrichment in differentially enriched regions
240 | 
241 | Previously, we had evaluated differential enrichment between the two factors in our study. We had found that **almost all of the peaks that were identified were specific to Nanog, with only one region that had significantly higher enrichment in Pou5f1**. We can use the BED files we generated with DiffBind as input to `deepTools` and visualize enrichment in those regions to evaluate the differences in read density.
242 | 
243 | * Open up `FileZilla` and **copy over the BED files to O2** in `~/chipseq/results/visualization`:
244 | 
245 | <img src="../img/filezilla_diffbind.png">
246 | 
247 | Now we can use some of the `deepTools` commands we had explored previously. **Note that we have changed the command from `reference-point` to `scale-regions`.** In the `scale-regions` mode, all regions in the BED file are stretched or shrunken to the length in bases indicated by the user (`--regionBodyLength`).
248 | 
249 | <img src="../img/computeMatrix_modes.png" width="600">
250 | 
251 | Let's **start with the Nanog file, which contains 33 regions** that were identified as increased in enrichment compared to Pou5f1. The plot confirms what we had expected, that is, the Pou5f1 samples don't have much read depth in these regions. 
252 | 
253 | ```bash
254 | 
255 |  $ computeMatrix scale-regions \
256 | -R ~/chipseq/results/visualization/Nanog_enriched.bed \
257 | -S /n/groups/hbctraining/chip-seq/full-dataset/bigWig/Encode_Pou5f1*.bw /n/groups/hbctraining/chip-seq/full-dataset/bigWig/Encode_Nanog*.bw \
258 | --skipZeros \
259 | -p 6 \
260 | --regionBodyLength 2000 \
261 | -a 500 -b 500 \
262 | -o ~/chipseq/results/visualization/matrixAll_Nanog_binding_sites.gz
263 | 
264 | 
265 | $ plotProfile -m visualization/matrixAll_Nanog_binding_sites.gz \
266 | -out visualization/figures/Allsamples_NanogSites_profile.png \
267 | --perGroup  --plotTitle "" \
268 | --samplesLabel "Pou5f1-Rep1" "Pou5f1-Rep2" "Nanog-Rep1" "Nanog-Rep2" \
269 | -T "Nanog only binding sites"  -z "" \
270 | --startLabel "" \
271 | --endLabel "" \
272 | --colors red red darkblue darkblue
273 | ```
274 | 
275 | <img src="../img/Allsamples_NanogSites_profile.png" width="500">
276 | 
277 | With **Pou5f1, remember we only had one region**. We are still able to plot this data but you will notice that it is a bit more boxy in nature. This is because values are not being averaged over multiple regions.
278 | 
279 | ```bash
280 | 
281 |  $ computeMatrix scale-regions \
282 | -R ~/chipseq/results/visualization/Pou5f1_enriched.bed \
283 | -S /n/groups/hbctraining/chip-seq/full-dataset/bigWig/Encode_Pou5f1*.bw /n/groups/hbctraining/chip-seq/full-dataset/bigWig/Encode_Nanog*.bw \
284 | --skipZeros \
285 | -p 6 \
286 | --regionBodyLength 2000 \
287 | -a 500 -b 500 \
288 | -o ~/chipseq/results/visualization/matrixAll_Pou5f1_binding_sites.gz 
289 | 
290 | 
291 | $ plotProfile -m visualization/matrixAll_Pou5f1_binding_sites.gz \
292 | -out visualization/figures/Allsamples_Pou5f1Sites_profile.png \
293 | --perGroup  --plotTitle "" \
294 | --samplesLabel "Pou5f1-Rep1" "Pou5f1-Rep2" "Nanog-Rep1" "Nanog-Rep2" \
295 | -T "Pou5f1 only binding sites"  -z "" \
296 | --startLabel "" --endLabel "" \
297 | --colors red red darkblue darkblue
298 | ```
299 | 
300 | <img src="../img/Allsamples_Pou5f1Sites_profile2.png" width="500">
301 | 
302 | 
303 | 
304 | 
305 | 
306 | ***
307 | *This lesson has been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.*
308 | 


--------------------------------------------------------------------------------
/lessons/11_qualitative_assessment_IGV.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "Qualitative assessment of peaks"
 3 | author: "Meeta Mistry"
 4 | date: "Thursday July 29th, 2017"
 5 | ---
 6 | 
 7 | Approximate time: 25 minutes
 8 | 
 9 | ## Learning Objectives
10 | 
11 | * Use IGV to visualize BigWig, BED and data from ENCODE
12 | 
13 | ## Qualitative assessment using IGV
14 | 
15 | Another method for assessing the quality of your alignment is to visualize the alignment using a genome browser. For this workshop we will be using the [Integrative Genomics Viewer (IGV)](https://www.broadinstitute.org/igv/) from the Broad Institute. *You should already have this downloaded on your laptop.* IGV is an interactive tool which allows exploration of large, integrated genomic datasets. It supports a wide variety of data types, including array-based and next-generation sequence data, and genomic annotations, which facilitates invaluable comparisons.
16 | 
17 | 
18 | ### Transfer files
19 | 
20 | In order to visualize our ChIP-seq enrichment we will first need to move over the bigWig files. We previously used `FileZilla` to transfer files from O2 to your laptop and so we will do the same for these files.
21 | 
22 | 
23 | > **NOTE:**  There is another way to do so using the command line interface. Similar to the `cp` command to copy there is a command that allows you to securely copy files between computers. **The command is called `scp` and allows files to be copied to, from, or between different hosts.** It uses ssh for data transfer and provides the same authentication and same level of security as ssh. The first argument is the location on the remote server and the second argument is the destination on your local machine. 
24 | >
25 | > `$ scp username@transfer.rc.hms.harvard.edu:/path/to/file_on_O2 Path/to/directory/local_machine`
26 | 
27 | Open up `FileZilla` and connect to the transfer node on O2. Navigate to the correct directory on the cluster panel and copy over the following files:
28 | 
29 | * Nanog-Rep2-bigWig: `~/chipseq/results/visualization/bigWig/H1hesc_Nanog_Rep2.bw`
30 | * Pou5f1-Rep1-bigWig: `~/chipseq/results/visualization/bigWig/H1hesc_Pou5f1_Rep1.bw`
31 | * Nanog-Rep2-narrowPeak: `~/chipseq/results/macs2/Nanog-rep2_peaks.narrowPeak`
32 | * Pou5f1-Rep1-narrowPeak: `~/chipseq/results/macs2/Pou5f1-rep1_peaks.narrowPeak`
33 | 
34 | You will also want to **locate the BED files we generated from the differential enrichment analysis** using DiffBind. These files will be in your `chipseq` project in your `results` folder.
35 | 
36 | > **NOTE:** We are copying over only a single sample bigWig from each group. Since we observed that in each case there was a stronger replicate (high read density) that is what we used to make our selection.
37 | 
38 | * Start [IGV](https://www.broadinstitute.org/software/igv/download).
39 | * Load the Human genome (hg19) into IGV using the dropdown menu at the top left of your screen. _Note: there is also an option to "Load Genomes from File..." under the "Genomes" pull-down menu - this is useful when working with non-model organisms_.
40 | * Load the 2 bigWig files and 4 BED files using the **"Load from File..."** option under the **"File"** pull-down menu. 
41 | 
42 | Your IGV interface should now look something like the screenshot below. By default, you will be in a zoomed out view. You will notice that for both bigWig tracks there appears to be a dense blue chunk at the beginning of chromosome 12, which makes sense considering the subsetted toy dataset we are working with. **Use the pulldown menu to zoom into chromosome 12.**
43 | 
44 | <img src="../img/igv-1.png">
45 | 
46 | > Before we start looking at specific genes you will want to **Autoscale** each track.
47 | > 
48 | > * Right click on the left-hand side panel. You should see many options available to you. If "Autoscale" is not checked go ahead and do this. You can do the same for both bigWig tracks.
49 | 
50 | The interaction between Pou5f1 and Nanog is supported by immunoprecipitation, functional analysis, and co-localization of binding sites, and so it is not surprising that they share the **same target genes**. While the 2 BED files output by `DiffBind` contain peaks that are non-overlapping, the narrowPeak files output by macs2 (raw peaks), along with the bigWig tracks, will display the areas where both proteins bind. 
51 | 
52 | * For example take a look at **Sox5** (use the search box to zoom into the gene). How many peaks are associated with this gene for Nanog? For Pou5f1?
53 | * How convincing is the difference in enrichment?
54 | * For the **Erc1** gene, there is only one peak associated with this gene. Which factor binds here? Is the differential enrichment obvious?
55 | 
56 | * Finally, we are going to visually **compare our data** to the output from the **full dataset from ENCODE**, by loading that data from the IGV server using the **"Load from ENCODE (2012)..."** option under the **"File"** pull-down menu
57 | 
58 | ***
59 | *This lesson has been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.*
60 | 


--------------------------------------------------------------------------------
/lessons/CC_metrics_extra.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Cross-correlation Metrics"
  3 | authors: "Mary Piper and Meeta Mistry"
  4 | date: "October 2, 2018"
  5 | ---
  6 | 
  7 | ## Quality Metrics Based on Cross-Correlation
  8 | 
  9 | 
 10 | ### Cross-correlation plot
 11 | 
 12 | Strand cross-correlation is computed as the Pearson's linear correlation between the minus strand and the plus strand, after shifting minus strand by k base pairs. In the end we will have a cross-correlation value for each shift value, and they can be plotted against each other to generate a cross-correlation plot as shown below. The **cross-correlation plot typically produces two peaks**: a peak of enrichment corresponding to the predominant fragment length (highest correlation value) and a peak corresponding to the read length (“phantom” peak).
 13 | 
 14 | <img src="../img/cc-example.png">
 15 | 
 16 | ### Metrics based on the cross-correlation plot
 17 | 
 18 | There are two metrics that are computed using the cross-correlation described below. If you are interested in computing these and drawing cross-correlation plots outside of the ChIPQC package you can use [phantompeakqualtools](https://github.com/kundajelab/phantompeakqualtools). 
 19 | 
 20 | #### Normalized strand cross-correlation coefficient (NSC):
 21 | 
 22 | The ratio of the maximal cross-correlation value divided by the background cross-correlation (minimum cross-correlation value over all possible strand shifts). 
 23 | 
 24 | <img src="https://latex.codecogs.com/gif.latex?\frac{max(CC&space;values)}{min(CCvalues)}" title="\frac{max(CC values)}{min(CCvalues)}" />
 25 | 
 26 | - higher NSC values indicate more enrichment (better signal:noise)
 27 | - low signal-to-noise: NSC values < 1.1
 28 | - minimum possible NSC value: 1 (no enrichment) 
 29 | 
 30 | #### Relative strand cross-correlation coefficient (RSC):
 31 | 
 32 | The ratio of the fragment-length cross-correlation value minus the background cross-correlation value, divided by the phantom-peak cross-correlation value minus the background cross-correlation value. 
 33 | 
 34 | <img src="https://latex.codecogs.com/gif.latex?\frac{max(CCvalues)&space;-&space;background}{phantomCCvalue&space;-&space;background}" title="\frac{max(CCvalues) - background}{phantomCCvalue - background}" />
 35 | 
 36 | - high enrichment: RSC values > 1
 37 | - low signal-to-noise: RSC values < 0.8
 38 | - minimum possible RSC value: 0 (no enrichment)
 39 | 
 40 | > **NOTE:** Low NSC and RSC values can be due to failed and poor quality ChIP, low read sequence quality and hence lots of mismappings, shallow sequencing depth or a combination of these. Also, datasets with few binding sites (< 200) which could be due to biological reasons (i.e. a factor that truly binds only a few sites in a particular tissue type) would output low NSC and RSC scores.
 41 | 
 42 | 
 43 | ### Fragment length 
 44 | 
 45 | The shift value at which we observe the highest correlation value is considered to be the estimated fragment length. Some tools will report to you the top three fragment length values if the peak is not entirely clear. You will want the fragment length estimate to roughly resemble the actual fragment length you had decided on when size selecting during your library preparation.
 46 | 
 47 | 
 48 | ### Why is it that we observe this 'phantom peak' at read length?
 49 | 
 50 | The concept is that if you have a read that maps uniquely on strand `x` at position `i` (where `i` is the starting position); it follows that you will have a read mapping to strand `y` at position `i+r`. Because of the way the counts are stored, with the number of reads starting at each coordinate, you will get a bunch of reads at `x[i]` and a bunch of reads at `y[i+r]` that are `r` distance away from each other.
 51 | 
 52 | 
 53 | ### `phantompeakqualtools` 
 54 | 
 55 | The [`phantompeakqualtools`](https://code.google.com/archive/p/phantompeakqualtools/) package is a tool used to compute cross-correlation plots and the quality measures described above for ChIP-seq data [[1](http://www.g3journal.org/content/4/2/209.full)]. We have some instructions below if you are interested in trying it.
 56 | 
 57 | #### **Set up**
 58 | 
 59 | The `phantompeakqualtools` package is written as an R script, that uses `samtools` as a dependency. The package has various options that need to be specified when running from the command line. To get set up, we will need to start an interactive session, load the necessary modules and set up the directory structure:
 60 | 
 61 | ```
 62 | $ srun --pty -p interactive -t 0-12:00 --mem 8G --reservation=HBC bash	
 63 | 
 64 | $ module load gcc/6.2.0 R/3.4.1 samtools/1.3.1
 65 | 
 66 | $ cd ~/chipseq/results
 67 | 
 68 | $ mkdir chip_qc
 69 | 
 70 | $ cd chip_qc
 71 | ```
 72 | 
 73 | **We have downloaded the software for you, and have a copy you can use.**  The directory contains several files.
 74 | 
 75 | ```
 76 | $ ls -l /n/groups/hbctraining/chip-seq/phantompeakqualtools/
 77 | ```
 78 | 
 79 | > **NOTE:**  You can download the `phantompeakqualtools` package, directly from [GitHub](https://github.com/kundajelab/phantompeakqualtools), if you wanted your own local version. This repo is maintained by the developer Anshul Kundaje.
 80 | 
 81 | 
 82 | In this folder there should be a `README.md` which contains all the commands, options, and output descriptions. Let's check out the `README.md`:
 83 | 
 84 | ```
 85 | $ less /n/groups/hbctraining/chip-seq/phantompeakqualtools/README.md
 86 | ```
 87 | 
 88 | #### **Using R libraries**
 89 | 
 90 | In the README you will have noticed an *INSTALLATION* section. We will need to install the R packages `spp` and `caTools` into our personal R library to run the script. Since this is a bit more involved, in the interest of time we have created the libraries and shared them for you to use. To use our libraries, you will need to set up an environment variable called `R_LIBS_USER` and point it to the location on O2 where our libraries reside:
 91 | 
 92 | ```
 93 | $  export R_LIBS_USER="/n/groups/hbctraining/R/library/"
 94 | ```
 95 | 
 96 | > **NOTE: Testing libraries**
 97 | >
 98 | > If you want to check and see that this is working, you can open up R by typing R and pressing Enter:
 99 | > 
100 | ```
101 | $ R
102 | ```
103 | > 
104 | > And then once in R, try loading the libraries:
105 | 
106 | ```
107 | R version 3.4.1 (2017-06-30) -- "Single Candle"
108 | Copyright (C) 2017 The R Foundation for Statistical Computing
109 | Platform: x86_64-pc-linux-gnu (64-bit)
110 | 
111 | R is free software and comes with ABSOLUTELY NO WARRANTY.
112 | You are welcome to redistribute it under certain conditions.
113 | Type 'license()' or 'licence()' for distribution details.
114 | 
115 |   Natural language support but running in an English locale
116 | 
117 | R is a collaborative project with many contributors.
118 | Type 'contributors()' for more information and
119 | 'citation()' on how to cite R or R packages in publications.
120 | 
121 | Type 'demo()' for some demos, 'help()' for on-line help, or
122 | 'help.start()' for an HTML browser interface to help.
123 | Type 'q()' to quit R.
124 | 
125 | > library(spp)
126 | > library(caTools)
127 | ```
128 | >
129 | > To exit, type: `q()` in the console.
130 | 
131 | #### **Running `phantompeakqualtools`**
132 | 
133 | To obtain quality measures based on cross-correlation plots, we will be running the `run_spp.R` script from the command line which is a package built on SPP. This modified SPP package allows for determination of the cross-correlation peak and predominant fragment length in addition to peak calling. We will be using this package solely for obtaining these quality measures (no peak calling). 
134 | 
135 | The options that we will be using include:
136 | 
137 | * `-c`: full path and name (or URL) of tagAlign/BAM file
138 | * `-savp`: save cross-correlation plot
139 | * `-out`: will create and/or append to a file several important characteristics of the dataset described in more detail below.
140 | 
141 | ```
142 | ## DO NOT RUN THIS
143 | ## THIS SCRIPT IS FOR COMPUTING METRICS ON A SINGLE FILE
144 | $ Rscript /n/groups/hbctraining/chip-seq/phantompeakqualtools/run_spp.R -c=<tagAlign/BAMfile> -savp -out=<outFile>
145 | ```
146 | >_**NOTE:** Even though the script is called `run_spp.R`, we aren't actually performing peak calling with SPP._
147 | 
148 | From within the `chip_qc` directory, we will create output directories and use a 'for loop' to **run the script on every Nanog and Pou5f1 BAM file**:
149 | 
150 | ```
151 | $ mkdir -p logs qual
152 | 
153 | $ for bam in ../bowtie2/*Nanog*aln.bam ../bowtie2/*Pou5f1*aln.bam
154 | do 
155 | bam2=`basename $bam _aln.bam`
156 | Rscript /n/groups/hbctraining/chip-seq/phantompeakqualtools/run_spp.R -c=$bam -savp -out=qual/${bam2}.qual > logs/${bam2}.Rout
157 | done
158 | ```
159 | 
160 | The for loop generates **three output files**. The **quality metrics** are written in a tab-delimited text file, and the **log files** contain the standard output text. A third file is created in the same directory as the BAM files. These are PDF files that contain the **cross-correlation** plot for each sample. Let's move those files into the appropriate output directory:
161 | 
162 | ```
163 | $ mv ../bowtie2/*pdf qual  
164 | 
165 | ```
166 | 
167 | To visualize the quality metrics (.qual) files more easily, we will concatenate the files together to create a single summary file that you can move over locally and open up with Excel.
168 | 
169 | ```
170 | $ cat qual/*qual > qual/phantompeaks_summary.xls
171 | ```
172 | Let's use Filezilla or `scp` to move the summary file over to our local machine for viewing. Open up the file in Excel and take a look at our NSC and RSC values. 
173 | 
174 | #### **`phantompeakqualtools`: quality metrics output**
175 | 
176 | The qual files are tab-delimited with the columns containing the following information:
177 | 
178 | - COL1: Filename: tagAlign/BAM filename 
179 | - COL2: numReads: effective sequencing depth (i.e. total number of mapped reads in input file)
180 | - COL3: estFragLen: comma separated strand cross-correlation peak(s) in decreasing order of correlation. (**NOTE:** The top 3 local maxima locations that are within 90% of the maximum cross-correlation value are output. In almost all cases, the top (first) value in the list represents the predominant fragment length.) 
181 | - COL4: corr_estFragLen: comma separated strand cross-correlation value(s) in decreasing order (COL3 follows the same order) 
182 | - COL5: phantomPeak: Read length/phantom peak strand shift 
183 | - COL6: corr_phantomPeak: Correlation value at phantom peak 
184 | - COL7: argmin_corr: strand shift at which cross-correlation is lowest 
185 | - COL8: min_corr: minimum value of cross-correlation 
186 | - COL9: Normalized strand cross-correlation coefficient (NSC) = COL4 / COL8 
187 | - COL10: Relative strand cross-correlation coefficient (RSC) = (COL4 - COL8) / (COL6 - COL8) 
188 | - COL11: QualityTag: Quality tag based on thresholded RSC (codes: -2:veryLow,-1:Low,0:Medium,1:High,2:veryHigh)
189 | 
190 | > **NOTE:** The most important metrics we are interested in are the values in columns 9 through 11, however these numbers are computed from values in the other columns.
191 | 
192 | **How do the values compare to the thresholds mentioned above?** All samples have quite high NSC values indicating more enrichment, a good signal to noise and a fair number of peaks. Nanog-rep2 has a comparatively higher NSC value which might explain the increased number of peaks for that sample compared to the others. The RSC and quality tags further indicate good ChIP signal and a quality IP, yielding a very high quality tag. Based on these metrics, the samples look good for further analysis.
193 | 
194 | 
195 | 
196 | 


--------------------------------------------------------------------------------
/lessons/README.md:
--------------------------------------------------------------------------------
1 | ### All lessons for ChIP-seq analysis workshop
2 | 


--------------------------------------------------------------------------------
/lessons/chipseeker_visualization.md:
--------------------------------------------------------------------------------
 1 | ### Visualization with ChIPseeker
 2 | 
 3 | First, let's take a look at peak locations across the genome. The `covplot()` function calculates **coverage of peak regions** across the genome and generates a figure to visualize this across chromosomes. We do this for the Nanog peaks and find a considerable number of peaks on all chromosomes. 
 4 | 
 5 | ```
 6 | # Assign peak data to variables
 7 | nanog <- readPeakFile(samplefiles[[1]])
 8 | pou5f1 <- readPeakFile(samplefiles[[2]])
 9 | 
10 | # Plot covplot
11 | covplot(nanog, weightCol="V5")
12 | 
13 | ```
14 | 
15 | > **NOTE:** In the `covplot()` function we provide the column which represents the amount of enrichment (`weightCol="V5"`), and that is the value plotted on the y-axis. This is usually some score value; in our case this is the IDR score.
16 | 
17 | <img src="../img/covplot.png">
18 | 
19 | 
20 | Using a window of +/- 1000bp around the TSS of genes we can plot the **density of read count frequency to see where binding is relative to the TSS** for each sample.
21 | 
22 | ```
23 | # Prepare the promoter regions
24 | promoter <- getPromoters(TxDb=txdb, upstream=1000, downstream=1000)
25 | 
26 | # Calculate the tag matrix
27 | tagMatrixList <- lapply(as.list(samplefiles), getTagMatrix, windows=promoter)
28 | 
29 | ## Profile plots
30 | plotAvgProf(tagMatrixList, xlim=c(-1000, 1000), conf=0.95,resample=500, facet="row")
31 | ```
32 | <img src="../img/density_profileplots.png">
33 | 
34 | With these plots the confidence interval is estimated by bootstrap method (500 iterations) and is shown in the grey shading that follows each curve. The Nanog peaks exhibit a nice narrow peak at the TSS with small confidence intervals, whereas the Pou5f1 peaks display a bit wider and less smoothed peak around the TSS with larger confidence intervals.
35 | 
36 | The **heatmap is another method of visualizing the read count frequency** relative to the TSS.
37 | 
38 | 	# Plot heatmap
39 | 	tagHeatmap(tagMatrixList, xlim=c(-1000, 1000), color=NULL)
40 | 
41 | <img src="../img/Rplot.png" width="500">
42 | 
43 | > **NOTE:**  The profile plots and heatmaps are similar to what we did using `deepTools` in the visualization lesson, however here the amplitude of the peak is based on the number of peaks and not on the number of reads aligning (since BAM files are not involved). ChIPseeker is useful for getting a quick look at your data, but for increased accuracy and flexibility in customizing your figure we recommend the `deepTools` methods.
44 | 
45 | ***
46 | *This lesson has been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.*
47 | 
48 | 
49 | 
50 | 


--------------------------------------------------------------------------------
/lessons/compare_callers_IGV.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Discover the overlap between peak callers and visualize peaks in IGV"
  3 | author: "Radhika Khetani"
  4 | date: "Tuesday, July 19th, 2016"
  5 | ---
  6 | 
  7 | Contributors: Radhika Khetani, 
  8 | 
  9 | Approximate time: 60 minutes
 10 | 
 11 | ## Learning Objectives
 12 | 
 13 | * Learn how to perform coordinate-based analysis using *bedtools*
 14 | * Explore ChIP-seq data in IGV
 15 | 
 16 | 
 17 | ## How much overlap exists between SPP and MACS2?
 18 | 
 19 | In this section, our goal is to determine what peaks are in common between the calls made by both SPP and MACS2. To perform this task we are going to use a suite of tools called bedtools.
 20 | 
 21 | ### bedtools
 22 | 
 23 | The idea is that genome coordinate information can be used to perform relatively simple arithmetic, like combining, subsetting, intersecting, etc., to obtain all sorts of information. [bedtools](http://bedtools.readthedocs.org/en/latest/index.html) from [Aaron Quinlan's group](http://quinlanlab.org/) at University of Utah is an easy to use, extremely versatile tool that performs tasks of this nature. 
 24 | <img src="../img/bedtools.png" width="700">
 25 | 
 26 | As the name implies, this suite of tools works with bed files; in addition it works with other file formats that have genome coordinate information. 
 27 | 
 28 | <img src="../img/bedtools-basic.png" width="600">
 29 | 
 30 | > Note: When working with multiple files to perform arithmetic on genomic coordinates, it is essential that all files have coordinate information for the same exact version of the genome!
 31 | 
 32 | ### Setting up
 33 | 
 34 | 	$ bsub -Is -q interactive bash
 35 | 	
 36 | 	$ cd ~/ngs_course/chipseq/results/
 37 | 	
 38 | 	$ module load seq/BEDtools/2.23.0
 39 | 	
 40 | 	$ module load seq/samtools/1.3
 41 | 
 42 | 	$ mkdir -p overlap_spp_macs2/
 43 | 	
 44 | 	$ cd overlap_spp_macs2/
 45 | 
 46 | ### Finding the Nanog peaks that overlap between SPP and MACS2
 47 | 
 48 | Before using bedtools to obtain the overlap, we need to combine the information from both replicates.
 49 | 
 50 | #### Combining the replicates
 51 | 
 52 | <img src="../img/combine-for-merge.png" width="600">
 53 | 
 54 | ```
 55 | # Combine the peaks called for both replicates by SPP using `cat`
 56 | 
 57 | $ cat ../spp/Nanog_Rep1.narrowPeak ../spp/Nanog_Rep2.narrowPeak > spp_Nanog.narrowPeak
 58 | 	
 59 | # Sort/re-order the combined files by coordinates using `sort`
 60 | 
 61 | $ sort -k1,1 -k2,2n spp_Nanog.narrowPeak > spp_Nanog_sorted.narrowPeak
 62 | ```	
 63 | 
 64 | #### Merge peaks to generate a single file from the 2 replicates using `bedtools merge`
 65 | 
 66 | <img src="../img/merge-glyph.png" width="600">
 67 | ```
 68 | 	$ bedtools merge -h
 69 | 	
 70 | 	$ bedtools merge -i spp_Nanog_sorted.narrowPeak > spp_Nanog_merged.bed 
 71 | ```	
 72 | 
 73 | > Note: this command modifies your `narrowPeak` file into a simple, 3-column `bed` file.
 74 | 
 75 | ****
 76 | **Exercise**
 77 | 
 78 | 1. Generate a merged bed file for MACS2 calls for the Nanog sample, and call it `macs2_Nanog_merged.bed`.
 79 | 
 80 | ****
 81 | 
 82 | How many peaks did we start with?
 83 | 
 84 | 	$ wc -l ../[sm]*/*Nanog*narrowPeak
 85 | 	
 86 | > Note: In the above command, we are using **2 wildcards**, one is the `*`, and the other is `[ ]`. This wildcard tells the shell that you want to consider all the characters within the brackets as independent, sort of like an "or" case. In the above case the shell will perform a `wc -l` on all filenames that fit the pattern `*Nanog*narrowPeak`, within directories that either start with `s` or with `m`. In our case this would be the `spp` and `macs2` directories. 
 87 | 
 88 | How many peaks did we get after merging? Is this what you expected?
 89 | 
 90 | 	$ wc -l *merged.bed
 91 | 
 92 | #### Identify overlap by getting the intersect between peaks from each caller using `bedtools intersect`
 93 | 	
 94 | `bedtools merge` combines overlapping peaks, but `bedtools intersect` only reports back the peaks that are overlapping with respect to the file defined as `a` in the command.
 95 | 
 96 | <img src="../img/bedtools_intersect.png" width="600">
 97 | 
 98 | 	$ bedtools intersect -h
 99 | 	
100 | 	$ bedtools intersect -a spp_Nanog_merged.bed -b macs2_Nanog_merged.bed -wo > Nanog_spp-macs_overlap.bed
101 | 
102 | How many overlapping peaks did we get?
103 | 
104 | 	$ wc -l Nanog_spp-macs_overlap.bed
105 | 
106 | ### Getting set up for IGV 
107 | 
108 | Once you generate your overlapping peaks from ChIP-seq data, you might want to visualize it in a genome browser like IGV. In addition to peak calls, it is useful to look at the BAM files (IP and input) that were used to generate the peaks.
109 | 
110 | Use FileZilla to transfer the following files to a new directory on your computer called `ChIP-seq_visualization`:
111 | 
112 | * `bowtie2/H1hesc_Input_Rep1_chr12_aln.bam` and `bowtie2/H1hesc_Input_Rep1_chr12_aln.bam.bai`
113 | 
114 | * `bowtie2/H1hesc_Nanog_Rep1_chr12_aln.bam` and `bowtie2/H1hesc_Nanog_Rep1_chr12_aln.bam.bai`
115 | 
116 | * `macs2/Nanog-rep1_treat_pileup.bdg` and `macs2/Nanog-rep1_control_lambda.bdg` and change the extension to `.bedgraph` for IGV
117 | 
118 | * `overlap_spp_macs2/Nanog_spp-macs_overlap.bed`
119 | 
120 | ### IGV
121 | 
122 | 1. Open up IGV and make sure you are using hg19. Start loading all the files we just transferred over. Let's go to chromosome 12, since that is where our data are from.
123 | 
124 | 1. What do the various formats look like? (Zoom in to view the alignment information.)
125 | 
126 | 1. Do you observe any correlation between the peaks and the alignment patterns in the Nanog and Input tracks?
127 | 
128 | 1. Suggested genes and surrounding areas to look at:
129 | 
130 | 	GRIN2B
131 | 
132 | 	SOX5
133 | 	
134 | 	GPR19
135 | 
136 | ***
137 | *This lesson has been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.*
138 | 


--------------------------------------------------------------------------------
/lessons/extra_intro_to chipseq.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "Introduction to ChIP-seq and directory setup"
 3 | author: "Mary Piper, Radhika Khetani, Meeta Mistry"
 4 | date: "March 14th, 2018"
 5 | ---
 6 | 
 7 | 
 8 | During this session we will be performing a complete workflow for ChIP-seq analysis, starting with experimental design and generation of the raw sequencing reads and ending with functional enrichment analyses and motif discovery.
 9 | 
10 | ![chipseq_workflow_general](../img/chipseq_workflow_general.png)
11 | 
12 | ## Experimental design and library preparation
13 | 
14 | Several steps are involved in the library preparation of protein-bound DNA fragments for sequencing: 
15 | 
16 | ![exp_workflow](../img/chipseq_experimental_workflow.png)
17 | 
18 | 1. After the chromatin is isolated from the cell, proteins are cross-linked to the DNA
19 | 2. The DNA is sheared into fragments (sonication)
20 | 3. A protein-specific antibody is used to immunoprecipitate the protein-bound DNA fragments
21 | 4. The crosslink is reversed and DNA purified
22 | 5. DNA fragments are size selected and amplified using PCR
23 | 
24 | 
25 | Within the DNA fragments enriched for the regions binding to a protein of interest, only a fraction correspond to actual signal. The proportion of DNA fragments containing the actual binding site of the protein depends on the **number of active binding sites, the number of starting genomes, and the efficiency of the IP**. 
26 | 
27 | In addition, when performing ChIP-seq, some sequences may appear enriched due to the following:
28 | 
29 | - Open chromatin regions are fragmented more easily than closed regions
30 | - Repetitive sequences might seem to be enriched (copy number inaccuracies in genome assembly)
31 | - Uneven distribution of sequence reads across the genome
32 | 
33 | Therefore, proper controls are essential. A ChIP-seq peak should be compared with the same region of the genome in a matched control.
34 | 
35 | ![peaks](../img/chipseq_exp_peaks.png)
36 | 
37 | The same starting material should be divided to be used for both the protein-specific IP and the control. The control sample can be generated by one of the following recommended techniques: 
38 | 
39 | - No IP (input DNA) 
40 | - No antibody ("mock IP")
41 | - Non-specific antibody (IgG "mock IP")
42 | 
43 | ![controls](../img/chipseq_exp_controls.png)
44 | 
45 | ***
46 | *This lesson has been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.*
47 | 


--------------------------------------------------------------------------------
/lessons/handling-replicates-bedtools.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "Handling replicates"
 3 | author: "Meeta Mistry"
 4 | date: "Monday, June 27th, 2017"
 5 | ---
 6 | 
 7 | Contributors: Meeta Mistry, Radhika Khetani 
 8 | 
 9 | Approximate time: 75 minutes
10 | 
11 | ## Learning Objectives
12 | 
13 | * Combining replicates using simple overlap with Bedtools
14 | 
15 | 
16 | ## Overlapping peaks
17 | 
18 | In this section, our goal is to determine what peaks are in common between the two replicates for each factor (Nanog and Pou5f1). To perform this task we are going to use a suite of tools called `bedtools`.
19 | 
20 | ### `bedtools`
21 | 
22 | The idea is that genome coordinate information can be used to perform relatively simple arithmetic, like combining, subsetting, intersecting, etc., to obtain all sorts of information. [bedtools](http://bedtools.readthedocs.org/en/latest/index.html) from [Aaron Quinlan's group](http://quinlanlab.org/) at University of Utah is easy to use, and an extremely versatile tool that performs tasks of this nature. 
23 | 
24 | <img src="../img/bedtools.png" width="700">
25 | 
26 | As the name implies, this suite of tools works with **Bed** files, but it also works with other file formats that have genome coordinate information. 
27 | 
28 | <img src="../img/bedtools-basic.png" width="600">
29 | 
30 | > **NOTE:** When working with multiple files to perform arithmetic on genomic coordinates, it is essential that all files have coordinate information for the same exact version of the genome and the same coordinate system (0-based or 1-based)!
31 | 
32 | ### Setting up
33 | 
34 | Let's start an interactive session and change directories and set up a space for the resulting overlaps. 
35 | 
36 | ```bash
37 | $ srun --pty -p short -t 0-12:00 --mem 8G --reservation=HBC bash	
38 | 
39 | $ cd ~/chipseq/results/
40 | 
41 | $ mkdir bedtools
42 | ```
43 | 	
44 | Load the modules for `bedtools` and `samtools`:
45 | 	
46 | ```bash
47 | $ module load gcc/6.2.0 bedtools/2.26.0 samtools/1.3.1
48 | ```
49 | 	
50 | ### Finding overlapping peaks between replicates
51 | 
52 | The [`bedtools intersect`](https://bedtools.readthedocs.io/en/latest/content/tools/intersect.html) command within bedtools is the one we want to use, since it is able to report back the peaks that are overlapping with respect to a given file (the file designated as "a").
53 | 
54 | <img src="../img/bedtools_intersect.png" width="600">
55 | 
56 | 
57 | To find out more information on the parameters available when intersecting, use the help flag:
58 | 
59 | ```bash
60 | $ bedtools intersect -h
61 | ```
62 | 
63 | The intersect tool evaluates A (file 1) and finds regions that overlap in B (file 2). We will add the `-wo` which indicates to write the original A (file 1) and B (file 2) entries plus the number of base pairs of overlap between the two features.
64 | 
65 | Let's start with the Nanog replicates: 
66 | 
67 | ```bash
68 | $ bedtools intersect \
69 | -a macs2/Nanog-rep1_peaks.narrowPeak \
70 | -b macs2/Nanog-rep2_peaks.narrowPeak \
71 | -wo > bedtools/Nanog-overlaps.bed
72 | ```
73 | 
74 | **How many overlapping peaks did we get?**
75 | 
76 | We'll do the same for the Pou5f1 replicates:
77 | 
78 | ```bash
79 | $ bedtools intersect \
80 | -a macs2/Pou5f1-rep1_peaks.narrowPeak \
81 | -b macs2/Pou5f1-rep2_peaks.narrowPeak \
82 | -wo > bedtools/Pou5f1-overlaps.bed
83 | ```
84 | Note that we are working with subsetted data and so our list of peaks for each replicate is small. Thus, the overlapping peak set will be small as we found with both Nanog and Pou5f1. What is interesting though, is that even though the individual peak lists are smaller for Pou5f1 samples, the overlapping replicates represent a higher proportion of overlap with respect to each replicate.
85 | 
86 | > **_Historical Note_:** "A simpler heuristic for establishing reproducibility was previously used as a standard for depositing ENCODE data and was in effect when much of the currently available data was submitted. According to this standard, either 80% of the top 40% of the peaks identified from one replicate using an acceptable scoring method should overlap the list of peaks from the other replicate, OR peak lists scored using all available reads from each replicate should share more than 75% of regions in common. As with the current standards, this was developed based on experience with accumulated ENCODE ChIP-seq data, albeit with a much smaller sample size." [[ENCODE Guidelines, Landt et al, 2012]](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3431496/)
87 | 
88 | 
89 | ***
90 | 
91 | *This lesson has been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.*
92 | 
93 | 
94 | 
95 | 


--------------------------------------------------------------------------------
/lessons/integrating_rna-seq_and_chip-seq.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "Integrating RNA-seq and ChIP-seq"
 3 | author: "Meeta Mistry"
 4 | date: "June 29th, 2017"
 5 | ---
 6 | 
 7 | ## Integrating RNA-seq and ChIP-seq
 8 | 
 9 | As discussed previously, cells use a wide range of mechanisms to regulate gene expression. The increase or decrease in production of specific gene products is dependent on sophisticated programs/machinery, which can change in different cellular conditions. 
10 | 
11 | Evaluating the amount of up- and down-regulation of genes was something that we explored during the RNA-seq sessions. In this ChIP-seq session, we discussed the machinery regulating expression, specifically in the context of transcription factors and chromatin regulators. It is very common to run both of these types of experiments and integrate data from each to derive a better understanding at the molecular level. However, there are not a lot of sophisticated methods developed to compare the two types of data in a robust manner.
12 | 
13 | 
14 | ### Direct comparisons
15 | 
16 | Typically in the literature, one will find studies in which the RNA-seq and ChIP-seq were performed independently and then differentially expressed genes were compared with target genes from the ChIP-seq using a simple overlap measure. These overlapping statistics are complemented with a figure focussing on a panel of select genes. For these genes, binding evidence is displayed (using bigWig tracks and/or density profile plots) with a plot of expression changes taken from the RNA-seq data next to it. 
17 | 
18 | 
19 | <img src="../img/rna-chip.png">
20 | 
21 | > This was taken from [Gao et al., Nature 2014](https://doi.org/10.1038/nature13921), where they investigated the AUTS2 protein, a risk factor for ASD and other neurological disorders, and its link with PRC1, a key epigenetic regulator. 
22 | 
23 | 
24 | ### BETA (Binding and Expression Target Analysis)
25 | 
26 | Another alternative is a tool called [BETA](http://cistrome.org/BETA/) from Shirley Liu's lab at HMS. BETA is a software package that integrates ChIP-seq of transcription factors or chromatin regulators with differential gene expression data to infer direct target genes. This tool stands out from the other nearest gene analyses as it has the potential to find target genes for proteins that bind enhancer regions.
27 | 
28 | 
29 | BETA has **three functions**: 
30 | 
31 | 1. to predict whether the factor has activating or repressive function
32 | 2. to infer the factor’s target genes
33 | 3. to identify the motif of the factor and its collaborators which might modulate the factor’s activating or repressive function. 
34 | 
35 | It has three commands depending on what input data you have and what output you desire. 
36 | 
37 | * BETA **basic**: TF activating and repressive function prediction and direct targets detecting
38 | * BETA **plus**: BETA basic + motif analysis on target regions 
39 | * BETA **minus**: TF target genes prediction based on regulatory potential score with only binding data
40 | 
41 | <img src="../img/beta.png" width="500">
42 | 
43 | **Image source: Nat Protoc. 2013 Dec; 8(12): 2502–2515.**
44 | 
45 | > *NOTE: BETA is available as a module on Orchestra.*
46 | 
47 | 
48 | #### How does it work?
49 | 
50 | * For each gene a **regulatory potential (RP)** value is computed by taking all binding sites within the specified range of the gene TSS (e.g. 100kb)
51 | * RP is a gene’s likelihood of being regulated by a factor, and is dependent on the number of sites within range of the TSS and the distance between the binding site and TSS.
52 | * **BETA minus** will rank genes by the RP value to identify targets. 
53 | * **BETA basic** will require your DE analysis results in a specific format and knowledge of which are significant. It uses a CDF to determine whether the UP and DOWN groups differ from NON-DE. This is used to identify activating and repressive function. For each gene a rank product score is computed using your RP and DE statistic. This rank product is the new score to identify targets. 
54 | 
55 | 
56 | 
57 | 
58 |  
59 | 
60 | 
61 | 


--------------------------------------------------------------------------------
/lessons/motif_analysis_prep.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "Motif Analysis Prep using Bedtools"
 3 | author: "Meeta Mistry"
 4 | date: "Thursday, April 25th, 2019"
 5 | ---
 6 | 
 7 | Approximate time: 20 minutes
 8 | 
 9 | 
10 | ## Preparing files for Motif Analysis
11 | 
12 | Once you have your set of enriched regions which represent binding sites from your protein of interest, a logical next question is whether, within those regions, there are any **similar patterns of sequences that are over-represented**. These sequences are often referred to as **motifs**.
13 | 
14 | Tools for motif analysis often require sequence information for each of your binding regions, and so this lesson will show you how to use `bedtools` to obtain that.
15 | 
16 | Start an interactive session:	
17 | 
18 | ```bash
19 | $ srun --pty -p interactive -t 0-12:00 --mem 1G --reservation=HBC2 /bin/bash
20 | ```	
21 | 
22 | Since the motif analyses are unlikely to give reliable results using only the 32.8 Mb of reads mapping to chr12, **we will use the full set of peak calls output from the IDR analysis**. Move into your project directory, and copy over the IDR bed files:
23 | 
24 | ```bash
25 | $ cd ~/chipseq/results
26 | 
27 | $ cp /n/groups/hbctraining/chip-seq/full-dataset/idr/*.bed .
28 | 
29 | ```
30 | 
31 | Extract the **first three columns** of the IDR peak calls for the Nanog file:
32 | 
33 | ```bash
34 | 	$ cut -f 1,2,3 Nanog-idr-merged.bed  > Nanog-idr-merged-simple.bed
35 | ```
36 | 
37 | 
38 | To extract the sequences corresponding to the peak coordinates for motif discovery, we will use the [bedtools](http://bedtools.readthedocs.org/en/latest/content/bedtools-suite.html) suite of tools. **The `getfasta` command extracts sequences from a reference fasta file for each of the coordinates defined in a BED/GFF/VCF file**. 	
39 | 
40 | 
41 |  ```bash	
42 | $ module load gcc/6.2.0 bedtools/2.27.1	
43 | 
44 | $ bedtools getfasta -fi \
45 | /n/groups/shared_databases/igenome/Homo_sapiens/UCSC/hg19/Sequence/WholeGenomeFasta/genome.fa \
46 | -bed Nanog-idr-merged-simple.bed \
47 | -fo Nanog-idr-merged-dreme.fasta
48 | ```	
49 | 
50 |  Using `scp` or **FileZilla** on your local computer, transfer `Nanog-idr-merged-dreme.fasta` to your Desktop.
51 | 
52 | ***
53 | 
54 | *This lesson has been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.*
55 | 
56 | 


--------------------------------------------------------------------------------
/lessons/orchestra_mounting.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Easy access to Orchestra using sshfs"
  3 | author: "Michael Steinbaugh, Lorena Pantano, Radhika Khetani"
  4 | date: "Wednesday, July 28th, 2017"
  5 | ---
  6 | 
  7 | Approximate time: 30 minutes
  8 | 
  9 | ## Learning Objectives
 10 | 
 11 | * Access a remote server as a folder on a local computer
 12 | 
 13 | ## For Windows
 14 | 
 15 | > *Note that these are untested*.
 16 | 
 17 | #### OPTION 1 (Try first)
 18 | 
 19 | [http://www.sftpnetdrive.com/download-thanks](http://www.sftpnetdrive.com/download-thanks)
 20 | 
 21 | #### OPTION 2 (Try only if Option 1 fails!)
 22 | 
 23 | Download the .exe file for Dokan from [https://github.com/dokan-dev/dokany/releases/tag/v0.7.4](https://github.com/dokan-dev/dokany/releases/tag/v0.7.4) and run it.
 24 | 
 25 | Next, download the .zip file for win-sshfs [https://github.com/Foreveryone-cz/win-sshfs/releases/tag/1.5.12.8](https://github.com/Foreveryone-cz/win-sshfs/releases/tag/1.5.12.8), unzip it and run the .exe file within.
 26 | 
 27 | 
 28 | ## For OSX
 29 | 
 30 | To have orchestra accessible on your laptop/desktop as a folder, you need to use something called [`sshfs`](https://en.wikipedia.org/wiki/SSHFS) (ssh filesystem). This is a command that is not native to OSX and you need to go through several steps in order to get it. Once you have `sshfs`, then you need to set up ssh keys to connect Orchestra to your laptop without having to type in a password. 
 31 | 
 32 | ### 1. Installing sshfs on OSX
 33 | 
 34 | Below are 2 ways to get sshfs on OSX or macs, and I am listing both since one might work better on some versions of OSX than others.
 35 | 
 36 | #### OPTION 1 (Try first)
 37 | 
 38 | Download OSXfuse from [https://github.com/osxfuse/osxfuse/releases](https://github.com/osxfuse/osxfuse/releases/download/osxfuse-3.6.0/osxfuse-3.6.0.dmg), and install it.
 39 | 
 40 | Download sshfs from [https://github.com/osxfuse/sshfs/releases](https://github.com/osxfuse/sshfs/releases/download/osxfuse-sshfs-2.5.0/sshfs-2.5.0.pkg), and install it.
 41 | 
 42 | #### OPTION 2 (Try only if Option 1 fails!)
 43 | 
 44 | Step 1. Install [Xcode](https://developer.apple.com/xcode/)
 45 | ```bash
 46 | $ xcode-select --install
 47 | ```
 48 | 
 49 | Step 2. Install Homebrew using ruby (from Xcode)
 50 | ```bash
 51 | $ /usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)"
 52 | 
 53 | 	# Uninstall Homebrew
 54 | 	# /usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/uninstall)"
 55 | ```
 56 | 
 57 | Step 2.1. Check to make sure that Homebrew is working properly
 58 | ```bash
 59 | $ brew doctor
 60 | ```
 61 | 
 62 | Step 3. Install Cask from Homebrew's caskroom
 63 | ```bash
 64 | $ brew tap caskroom/cask
 65 | ```
 66 | 
 67 | Step 4. Install OSXfuse using Cask
 68 | ```bash
 69 | $ brew cask install osxfuse
 70 | ```
 71 | 
 72 | Step 5. Install sshfs from fuse
 73 | ```bash
 74 | $ brew install homebrew/fuse/sshfs
 75 | ```
 76 | 
 77 | ### 2. Set up "ssh keys"
 78 | 
 79 | Now that we have installed `sshfs`, the next step is to connect Orchestra (or a remote server) to our laptops. To make this process seamless, we will first set up ssh keys which can be used to connect to the server without having to type in a password every time.
 80 | 
 81 | ```bash
 82 | # set up ssh keys
 83 | $ ssh-keygen -t rsa -b 4096 -f ~/.ssh/id_rsa -C "ecommonsID"
 84 | $ ssh-add -K ~/.ssh/id_rsa
 85 | ```
 86 | 
 87 | Arguments for `ssh-keygen`:
 88 | * `-t` = Specifies the type of key to create. The possible values are "rsa1" for protocol version 1 and "rsa" or "dsa" for protocol version 2. *We want rsa.*
 89 | * `-b` = Specifies the number of bits in the key to create. For RSA keys, the minimum size is 768 bits and the default is 2048 bits. *We want 4096*
 90 | * `-f` = name of output "keyfile"
 91 | * `-C` = Provides a new comment
 92 | 
 93 | Arguments for `ssh-add`:
 94 | * `-K` = Store passphrases in your keychain
 95 | 
 96 | ```bash
 97 | # copy the contents of `id_rsa.pub` to ~/.ssh/authorized_keys on Orchestra
 98 | $ cat ~/.ssh/id_rsa.pub | pbcopy
 99 | ```
100 | 
101 | > `pbcopy` puts the output of `cat` into the clipboard (in other words, it is equivalent to copying with <kbd>cmd + c</kbd>) so you can just paste it as usual with <kbd>cmd + v</kbd>.
102 | 
103 | 
104 | Log into Orchestra, use vim to open `~/.ssh/authorized_keys`, paste the contents you copied from your computer into this file, and save it. 
105 | 
106 | 
107 | ### 3. Mount Orchestra using sshfs
108 | 
109 | Now, let's set up for running `sshfs` on our laptops (local machines), by creating a folder with an intuitive name for your home directory on the cluster to be mounted in.
110 | 
111 | ```bash
112 | $ mkdir ~/Orchestra
113 | ```
114 | 
115 | Finally, let's run the `sshfs` command to have Orchestra mount as a folder in the above space.
116 | ```bash
117 | $ sshfs ecommonsID@transfer.orchestra.med.harvard.edu:. ~/Orchestra -o volname="Orchestra" -o follow_symlinks
118 | ```
119 | 
120 | Now we can browse through our home directory on Orchestra as though it was a folder on our laptop. 
121 | 
122 | > If you want to access your lab's directory in `/groups/` or your directory in `/n/scratch2`, you can create sym links to those in your home directory and you will be able to access those as well.
123 | 
124 | Once you are done with it, you can cancel the connection using `umount` and the name of the folder.
125 | 
126 | ```bash
127 | $ umount ~/Orchestra 
128 | ```
129 | 
130 | #### Create an "alias" for mounting and logging into orchestra
131 | 
132 | On your local machine do the following:
133 | 
134 | ```bash
135 | $ cd
136 | 
137 | $ ls -l
138 | 
139 | $ ll
140 | ```
141 | 
142 | `ll` should not work for you, but it works on my computer, why? It's because I have set up an alias for my bash environment, using the `alias` command, such that it knows that I want to actually do `ls -l` when I say `ll`. Let's set it up for your environment.
143 | 
144 | ```bash
145 | $ alias ll='ls -l'
146 | 
147 | $ ll
148 | ```
149 | 
150 | This alias is only going to be available to you while that Terminal window is open. If you wanted to use that alias all the time, what would you do? 
151 | 
152 | You would add it to `~/.bashrc` or `~/.bash_profile`!
153 | 
154 | Let's open either `~/.bash_profile` or `~/.bashrc` files on your laptop (*not on orchestra*), and add a few commands to it.
155 | 
156 | ```bash
157 | alias ll='ls -l'
158 | 
159 | alias orchestra='ssh ecommonsID@orchestra.med.harvard.edu'
160 | 
161 | alias orch_mount='sshfs ecommonsID@transfer.orchestra.med.harvard.edu:. ~/Orchestra -o volname="Orchestra" -o follow_symlinks'
162 | ```
163 | 
164 | Now, open a new Terminal window, or source the file you just modified, and try these out!
165 | 
166 | ***
167 | *This lesson has been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.*
168 | 


--------------------------------------------------------------------------------
/lessons/peak_calling_spp.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Peak calling with SPP"
  3 | author: "Meeta Mistry"
  4 | date: "Thursday, March 3rd, 2016"
  5 | ---
  6 | 
  7 | Contributors: Meeta Mistry
  8 | 
  9 | Approximate time: 90 minutes
 10 | 
 11 | ## Learning Objectives
 12 | 
 13 | * Learning how to use SPP for peak calling
 14 | * Understanding different components of the SPP algorithm
 15 | * Interpretation of results from a peak caller, including cross-correlation plots and various file formats 
 16 | 
 17 | 
 18 | ## SPP
 19 | 
 20 | SPP is a data processing pipeline optimized for the detection of localized protein binding positions from unpaired sequence reads. The [publication](http://www.nature.com/nbt/journal/v26/n12/full/nbt.1508.html) describes the algorithm in great detail, from which we have listed some of the main features here.
 21 | 
 22 | * Discarding or restricting positions with abnormally high number of tags
 23 | * Provides smoothed tag density in WIG files for viewing in other browsers
 24 | * Provides conservative statistical estimates of fold enrichment ratios along the genome, to determine regions of significant enrichment/depletion (can be exported for visualization)
 25 | 
 26 | The main steps of the ChIP-seq processing pipeline are described in the illustration below. As we walk through the SPP pipeline in this lesson, we will describe each step in more detail.
 27 | 
 28 | <div style="text-align:center"><img src="../img/spp-fig2.png" width="150"></div>
 29 | 
 30 | 
 31 | SPP is an R package which can be installed in one of two ways: There is [source code](https://github.com/hms-dbmi/spp/archive/1.13.tar.gz) available for download, or alternatively it can be installed using `devtools` as it is now [available on GitHub](https://github.com/hms-dbmi/spp).
 32 | 
 33 | 
 34 | 
 35 | ## Setting up
 36 | 
 37 | Start an interactive session with a single core (if you don't already have one going) since we are working with such a small dataset; for parallel processing options with SPP see note below.
 38 | 
 39 | 	$ srun --pty -p interactive -t 0-12:00 /bin/bash
 40 | 
 41 | Now let's set up the directory structure. Navigate to `~/chipseq/` if you are not already there. Within the results directory we will create a directory called `spp`:
 42 | 
 43 | 	$ mkdir results/spp
 44 | 	$ cd results
 45 | 	
 46 | The last thing we need to do before getting started is to load the appropriate software. As mentioned, SPP is an R package. On Orchestra the package comes installed by default when you load the most recent R module:
 47 | 
 48 | 	$ module load stats/R/3.2.1
 49 | 	
 50 | 
 51 | > ### Parallel processing with SPP
 52 | > 	
 53 | > When working with large datasets it can be beneficial to use multiple cores during some of the more computationally intensive processes of peak calling. In order to do so, you will need to install the `snow` package in R. Using snow you can initialize a cluster of nodes for parallel processing (in the example below we have a cluster of 8 nodes). *See `snow` package manual for details.* This cluster variable can then be used as input to functions that allow for parallel processing.
 54 | > 
 55 | > 	library(snow)
 56 | > 	
 57 | > 	cluster <- makeCluster(8)
 58 | 
 59 | 
 60 | ## An R script for running SPP
 61 | 
 62 | To run SPP, there are several functions that need to be run sequentially. For more information on these functions the [home page](http://compbio.med.harvard.edu/Supplements/ChIP-seq/) is quite useful, as they provide a brief tutorial showing the use of the main methods.
 63 | 
 64 | For this class, we have put together an R script that contains all of the methods required for peak calling. You can copy over the script into your current directory, and then we can discuss the methods in more detail.
 65 | 
 66 | 	$ cp /groups/hbctraining/ngs-data-analysis-longcourse/chipseq/scripts/get_peaks.R .
 67 | 
 68 | Open it up using `vim`, as there is a modification we need to make in order for you to be able to run this from your working directory. Use `:set number` in `vim` to add numbers to your lines. Now scroll down to line 16. Here, you need to change the path to where your `spp` directory is located. It will look something like this:
 69 | 
 70 | 	/home/user_name/chipseq/results/spp/
 71 | 
 72 | Save and exit vim. We can now run this script from the command line by using the `Rscript` command followed by the name of our script. We also need to pass in two arguments:
 73 | 
 74 | 1. Input BAM file
 75 | 2. Treatment/IP BAM file
 76 | 
 77 | Let's try running it on Nanog-rep1:
 78 | 
 79 | 	$ Rscript get_peaks.R bowtie2/H1hesc_Input_Rep1_chr12_aln.bam bowtie2/H1hesc_Nanog_Rep1_chr12_aln.bam
 80 | 
 81 | Before we look at the output, we'll first take some time to discuss what's inside our R script. 
 82 | 
 83 | ### Setup the environment
 84 | 
 85 | The first few lines are setting up the environment which involves **loading the library and reading in the data**. The input and treatment BAM files need to be given as arguments to this script when running it. The final few lines in this chunk of code include defining a path for the resulting output files and a prefix for output file names.
 86 | 
 87 | 
 88 | (***DO NOT RUN THIS***)
 89 | 
 90 | ```
 91 | # Load library
 92 | library(spp)
 93 | 
 94 | # Get filenames from arguments
 95 | filenames <- commandArgs(trailingOnly=TRUE)
 96 | file.input <- filenames[1]
 97 | file.data <- filenames[2]
 98 | 
 99 | # Load in data
100 | input.data <- read.bam.tags(file.input, read.tag.names=T)
101 | chip.data <- read.bam.tags(file.data, read.tag.names=T)
102 | 
103 | # Set path 
104 | path <- "/groups/hbctraining/ngs-data-analysis-longcourse/chipseq/spp/"
105 | 
106 | # Create a prefix for your output file
107 | # This can be changed based on file naming convention 
108 | s <- strsplit(file.data,split="_")
109 | prefix <- paste(s[[1]][2], "_", s[[1]][3], sep="")
110 | ``` 
111 | 
112 | ### Remove anomalous features
113 | 
114 | The next chunk of code **uses the cross-correlation profile to calculate binding peak separation distance**.  The separation distance will be printed out and the **cross-correlation plot** will be saved to file. The `srange` argument gives the possible range for the size of the protected region; it should be higher than tag length but note that making the upper boundary too high will increase calculation time. The `bin` argument is telling SPP to bin tags within the specified number of basepairs to speed up calculation. Increasing the bin size decreases the accuracy of the determined parameters. The numbers we have selected here are defaults suggested in the tutorial.
115 | 
116 | At this point SPP also assesses whether the inclusion of **reads with non-perfect alignment quality** improves the cross-correlation peak, and flags them accordingly. If you would like to accept all aligned tags, specify `accept.all.tags=T` argument to save time.
117 | 
118 | (***DO NOT RUN THIS***)
119 | ```
120 | # Get binding info from cross-correlation profile
121 | binding.characteristics <- get.binding.characteristics(chip.data,srange=c(50,500),bin=5)
122 | 
123 | # Print out binding peak separation distance
124 | print(paste("binding peak separation distance =",binding.characteristics$peak$x))
125 | 
126 | # Plot cross-correlation profile
127 | pdf(file=paste(path, prefix, ".crosscorrelation.pdf", sep=""),width=5,height=5)
128 | par(mar = c(3.5,3.5,1.0,0.5), mgp = c(2,0.65,0), cex = 0.8)
129 | plot(binding.characteristics$cross.correlation,type='l',xlab="strand shift",ylab="cross-correlation")
130 | abline(v=binding.characteristics$peak$x,lty=2,col=2)
131 | dev.off()
132 | ```
133 | 
134 | ### Assemble informative tags
135 | 
136 | The next function will select tags with acceptable alignment quality, based on flags assigned above. Moving forward with only informative tags, the ChIP and input data has been converted into a simple list of tag coordinate vectors (read start position:read end position). 
137 | 
138 | <div style="text-align:center"><img src="../img/read-density2.png" width="300"></div>
139 | 
140 | (***DO NOT RUN THIS***)
141 | ```
142 | # select informative tags based on the binding characteristics
143 | chip.data <- select.informative.tags(chip.data, binding.characteristics)
144 | input.data <- select.informative.tags(input.data, binding.characteristics)
145 | 
146 | ```
147 | 
148 | ### Subtract background
149 | 
150 | The statistical significance of a peak observed for a putative protein binding position depends on the expected background pattern. 
151 | 
152 | The input tag density identifies three major types of background anomalies: 
153 | 
154 | 1. Singular peaks of tag density at a single chromosome position many orders of magnitude higher than the surrounding density. Such peaks commonly occur at the same position on both chromosome strands. 
155 | 2. Larger, nonuniform regions of increased background tag density on either one or both strands 
156 | 3. Background tag density patterns resembling true protein-binding positions (typically shows smaller separation between strand peaks)
157 | 
158 | <div style="text-align:center"><img src="../img/background-subtract.png" width="500"></div>
159 | 
160 | The next function is used to correct for background anomalies described in point 1 above. `remove.local.tag.anomalies()` will scan along the chromosomes calculating local density of regions (can be specified using window.size parameter, default is 200bp), removing or restricting singular positions with extremely high tag count relative to the neighborhood. 
161 | 
162 | (***DO NOT RUN THIS***)
163 | ```
164 | # restrict or remove singular positions with very high tag counts
165 | chip.data <- remove.local.tag.anomalies(chip.data)
166 | input.data <- remove.local.tag.anomalies(input.data)
167 | ```
168 | 
169 | ### Determine binding positions
170 | 
171 | To identify peaks, background subtraction methods are applied to correct for anomalies outlined in 2) and 3) above. The corrections have little effect on the top binding positions, but help with lower ranked peaks reducing false-positive peaks arising from uneven background.
172 | 
173 | We will use the WTD method to call binding positions, which uses a sliding window and calculates the geometric average on positive and negative strand. Additionally, we will specify an FDR of 1% and a window size estimated by the binding.characteristics:
174 | 
175 | (***DO NOT RUN THIS***)
176 | ```
177 | # binding detection parameters
178 | # desired FDR (1%). Alternatively, an E-value can be supplied to the method calls below instead of the fdr parameter
179 | fdr <- 1e-2
180 | 
181 | # the binding.characteristics contains the optimized half-size for binding detection window
182 | detection.window.halfsize <- binding.characteristics$whs
183 | 
184 | # determine binding positions using wtd method
185 | bp <- find.binding.positions(signal.data=chip.data,control.data=input.data,fdr=fdr,whs=detection.window.halfsize)
186 | 
187 | print(paste("detected",sum(unlist(lapply(bp$npl,function(d) length(d$x)))),"peaks"))
188 | 
189 | ```
190 | 
191 | ### Output to file
192 | 
193 | Finally, we will write the results to file. Three files will be generated in your `spp` directory:
194 | 
195 | 1. Detected binding positions
196 | 2. narrowPeak file
197 | 3. WIG file of enrichment estimates
198 | 
199 | To generate enrichment estimates SPP scans ChIP and signal tag density to estimate lower bounds of tag enrichment (and upper bound of tag depletion if it is significant) along the genome. The resulting profile gives conservative statistical estimates of log2 fold-enrichment ratios along the genome. The example below uses a window of 500bp (and background windows of 1, 5, 25 and 50 times that size) and a confidence interval corresponding to 1%.
200 | 
201 | (***DO NOT RUN THIS***)
202 | ```
203 | # output detected binding positions
204 | output.binding.results(bp,paste(path, prefix,".binding.positions.txt", sep=""))
205 | write.narrowpeak.binding(bp,paste(path, prefix,".narrowPeak", sep=""))
206 | 
207 | # output wig file
208 | enrichment.estimates <- get.conservative.fold.enrichment.profile(chip.data,input.data,fws=500,step=100,alpha=0.01)
209 | writewig(enrichment.estimates,paste(path, prefix, ".enrichment.estimates.wig", sep=""), paste(prefix, "_Conservative fold-enrichment/depletion estimates shown on log2 scale", sep=""))
210 | 
211 | ```
212 | 
213 | ## Running SPP on all files
214 | 
215 | To generate peaks for the remaining samples we need to run the Rscript three more times, each time changing the input to the appropriate files:
216 | 
217 | `$ Rscript get_peaks.R bowtie2/H1hesc_Input_Rep2_chr12_aln.bam bowtie2/H1hesc_Nanog_Rep2_chr12_aln.bam`
218 | 
219 | `$ Rscript get_peaks.R bowtie2/H1hesc_Input_Rep1_chr12_aln.bam bowtie2/H1hesc_Pou5f1_Rep1_chr12_aln.bam`
220 | 
221 | `$ Rscript get_peaks.R bowtie2/H1hesc_Input_Rep2_chr12_aln.bam bowtie2/H1hesc_Pou5f1_Rep2_chr12_aln.bam`
222 | 
223 | ## Evaluating SPP output
224 | 
225 | Take a look at the files output from SPP. For each file you should see **4 files**, one of which is the **narrowPeak** file described above. 
226 | 
227 | There is also a **binding positions** file, which is similar to the narrowPeak file in that each line corresponds to a peak, but it differs in the fields of information it contains. The file contains a table with each row corresponding to a detected position, with the following columns: 
228 | 
229 | 1. chromosome 
230 | 2. position of detected binding site on the chromosome
231 | 3. score reflecting magnitude of the binding 
232 | 4. E-value corresponding to the peak magnitude
233 | 5. FDR corresponding to the peak magnitude
234 | 6. lower bound of the fold-enrichment ratio
235 | 7. maximum likelihood estimate of the fold-enrichment ratio
236 | 
237 | We can summarize the number of peaks for each sample by counting the lines:
238 | 
239 | 	wc -l spp/*.narrowPeak
240 | 
241 | *How do these numbers compare to those generated by MACS2?*
242 | 
243 | > **NOTE:** To take a quick look at how the results overlap with MACS2 (and get your feet wet with [bedtools](http://bedtools.readthedocs.org/en/latest/index.html)) you can browse our [lesson on comparing peak callers](https://github.com/hbctraining/In-depth-NGS-Data-Analysis-Course/blob/may2017/sessionV/lessons/compare_callers_IGV.md). However, keep in mind that you **can't really compare peaks from two different peak callers using their default or arbitrary thresholds**. They are totally uncalibrated. Rather, you should run them individually through IDR and then compare the IDR thresholded peaks.
244 | 
245 | ***
246 | *This lesson has been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.*
247 | 
248 | * *Material in this lesson is in part adapted from the SPP tutorial: http://compbio.med.harvard.edu/Supplements/ChIP-seq/tutorial.html*
249 | 
250 | 
251 | 
252 | 
253 | 


--------------------------------------------------------------------------------
/lessons/qc_deeptools.md:
--------------------------------------------------------------------------------
  1 | ## Quality assessment using `deepTools`
  2 | 
  3 | Another way in which we can assess the quality of our samples is by investigating the read coverages for each sample and determine the variability in coverage per sample group. Replicate samples that vary greatly in where the reads stack up is indicative of a weak ChIP-seq experiment. In addition, we can identify outlier samples or batch effects.
  4 | 
  5 | To evaluate read coverage we will be using [`deepTools`](http://deeptools.readthedocs.org/en/latest/content/list_of_tools.html), a suite of python tools developed for the efficient analysis of high-throughput sequencing data, such as ChIP-seq, RNA-seq or MNase-seq. `deepTools` has a wide variety of tools that go beyond those that are covered in this lesson. We encourage you to look through the documentation and explore on your own time.
  6 | 
  7 | Assessing and visualizing coverage using `deepTools` requires three steps: 
  8 | 
  9 | 1. Indexing the BAM alignment files
 10 | 2. Calculation of the read coverage scores using the `multiBamSummary` tool
 11 | 3. Visualizing how read coverage scores compare between samples
 12 | 
 13 | First let's create a directory for all of the output generated by deepTools, and move into that directory:
 14 | 
 15 | ```
 16 | $ cd ~/ngs_course/chipseq/results/chip_qc		
 17 |  		
 18 | $ mkdir deeptools 		
 19 |  	
 20 | $ cd deeptools
 21 | ```
 22 | To use deepTools, we will need an **index** (`.bai` file) for each of our BAM files using the `samtools index` tool. This has **already been done for you**, if you look inside your `bowtie2` folder:
 23 | 
 24 | ```
 25 | $ ls -l ../../bowtie2/*.bai
 26 | ```
 27 | 
 28 | Let's load the module and we are ready to get started:
 29 | 
 30 | ```
 31 | $ module load seq/deeptools/2.2.0
 32 | ```
 33 | 
 34 | ### Calculation of the read coverage scores using the `multiBamSummary` tool
 35 | 
 36 | The `multiBamSummary` tool will calculate the read coverage scores for specific genomic regions between samples and provide the output as a binary compressed numpy array (.npz) file; however, the analysis can be performed on the entire genome by changing the mode of this tool to `bins`. If you prefer, it can also output a `readCounts.tab` file that contains a list of read counts per sample for every 10,000bp region in the genome, which you can use to create your own images.
 37 | 
 38 | ```
 39 | $ multiBamSummary bins --ignoreDuplicates -p 6 \
 40 | --bamfiles ../../bowtie2/*aln.bam \
 41 | -out deeptools_multiBAM.out.npz \
 42 | --outRawCounts readCounts.tab
 43 | ```
 44 | 
 45 | ### Visualizing read coverage quality metrics
 46 | 
 47 | Now that we have the read coverage scores calculated for all samples, we can now analyze the coverage between samples using a variety of the `deepTools` tools:
 48 | 
 49 | #### 1. Sample correlation - `plotCorrelation` tool
 50 | 
 51 | The `plotCorrelation` tool allows us to visualize the similarity between samples based on their read coverage of regions of the genome. 
 52 | 
 53 | ![correlate](../img/QC_bamCorrelate_deeptools.png)
 54 | 
 55 | We can visualize correlations using a scatterplot:
 56 | 
 57 | ```
 58 | $ plotCorrelation --corData deeptools_multiBAM.out.npz \
 59 | --plotFile deepTools_scatterplot.png \
 60 | --corMethod pearson \
 61 | --whatToPlot scatterplot \
 62 | --labels Input_Rep1 Input_Rep2 Nanog_Rep1 Nanog_Rep2 Pou5f1_Rep1 Pou5f1_Rep2
 63 | ```
 64 | <img src="../img/deepTools_scatterplot.png" width="500">
 65 | 
 66 | We expect high correlations between replicates, and lower correlations between sample groups. However, we do not observe this when looking at read coverage on chromosome 12. Specifically, we see that Input-Rep1 does not correlate well with any of the other samples. If this were for the entire genome, we might be concerned that we would not have reproducibility between replicates for many of the peaks, and that Input-Rep1 is a potential outlier.
 67 | 
 68 | The same `plotCorrelation` tool can be used to examine the read coverage similarity using a heatmap to perform hierarchical clustering and determine whether our sample groups cluster well (i.e. have similar read coverage profiles within and between sample groups). The lack of correlation between replicates is even more visible in the heatmap, as is the lack of correlation between Input-Rep1 and all other samples.
 69 | 
 70 | ```
 71 | $ plotCorrelation --corData deeptools_multiBAM.out.npz \
 72 | --plotFile deepTools_heatmap.png \
 73 | --corMethod pearson \
 74 | --whatToPlot heatmap \
 75 | --labels Input_Rep1 Input_Rep2 Nanog_Rep1 Nanog_Rep2 Pou5f1_Rep1 Pou5f1_Rep2 \
 76 | --plotNumbers
 77 | ```
 78 | <img src="../img/deeptools_heatmap.png" width="400">
 79 | 
 80 | #### 2. Sample variability - `plotPCA` tool
 81 | 
 82 | The next quality metric we will explore is the principal component analysis (PCA) of our read coverage calculations. PCA can be used to determine whether samples display greater variability between experimental conditions than between replicates of the same treatment based on information (read coverage values) from thousands of regions. PCA is also useful to identify unexpected patterns, such as those caused by batch effects or outliers. 
 83 | 
 84 | We will use the tool `plotPCA` to sort the principal components according to the amount of variability of the data that they explain and generate two plots:
 85 | 
 86 | - the PCA plot for the top two principal components eigenvalues 
 87 | - the Scree plot for the top five principal components, where the bars represent the amount of variability explained by the individual factors and the red line traces the amount of variability explained by the individual components in a cumulative manner [[1]](http://deeptools.readthedocs.org/en/latest/content/tools/plotPCA.html)
 88 | 
 89 | ```
 90 | $ plotPCA --corData deeptools_multiBAM.out.npz \
 91 | --plotFile deepTools_pcaplot.png \
 92 | -T "PCA of read counts" \
 93 | --outFileNameData deeptools_pcaProfile.tab \
 94 | --labels Input_Rep1 Input_Rep2 Nanog_Rep1 Nanog_Rep2 Pou5f1_Rep1 Pou5f1_Rep2
 95 | ```
 96 | 
 97 | <img src="../img/deepTools_pcaplot.png" width=400>
 98 | 
 99 | Similar to the correlation plots, we see little clustering of the replicates. The variation between sample groups does not account for the major sources of variation in the data. Keep in mind that these plots are generated for a small subset of data. It is likely that using the whole dataset will change the outcome of these plots and perhaps make them more similar to what we would expect.
100 | 
101 | ***
102 | *This lesson has been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.*


--------------------------------------------------------------------------------
/lessons/shell_review.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "The Shell"
  3 | author: "Mary Piper, Radhika Khetani, Meeta Mistry"
  4 | date: "April 2019"
  5 | ---
  6 | 
  7 | ## Learning Objectives
  8 | - Review shell commands
  9 | - Review HPC concepts 
 10 | 
 11 | ## Setting up
 12 | 
 13 | Let's look at some of the basic HPC concepts with the **first few slides in [this slide deck](https://hbctraining.github.io/Intro-to-rnaseq-hpc-salmon/lectures/HPC_intro_O2_review.pdf)**, then we will come back and get some practice.
 14 | 
 15 | ### Connecting to a *login* node on O2
 16 | 
 17 | Type in the following command with your username to login:
 18 | 
 19 | ```bash
 20 | ssh username@o2.hms.harvard.edu
 21 | ```
 22 | 
 23 | You will receive a prompt for your password, and you should type in your associated password; note that the cursor will *not move* as you type in your password.
 24 | 
 25 | A warning might pop up the first time you try to connect to a remote machine, type "Yes" or "Y". 
 26 | 
 27 | Once logged in, you should see the O2 icon, some news, and the command prompt: 
 28 | 
 29 | ```
 30 | [rc_training10@login01 ~]$ 
 31 | ```
 32 | 
 33 | > `ssh` stands for secure shell. All of the information (like your password) going between your computer and the O2 login computer is encrypted when using `ssh`.
 34 | >
 35 | > A "node" on a cluster is essentially a computer in the cluster of computers.
 36 | 
 37 | **A login node's only function is to enable users to log in to a cluster, it is not meant to be used for any actual work/computing.**
 38 | 
 39 | ### Connecting to a *compute* node on O2
 40 | 
 41 | There are multiple ways to connect with, and do work on, a compute node; a compute node is where all work should be performed. To connect to a compute node, users have to interact with a job scheduler like *slurm* using commands like `srun` or `sbatch`, and by specifying what resources they require.
 42 | 
 43 | 1. The `srun` command with a few mandatory parameters will create an "interactive session" on O2. This is essentially a way for us to do work on the compute node directly from the terminal. If the connectivity to the cluster is lost in the middle of a command being run that work will be lost in an interactive session.
 44 | 
 45 | 2. The `sbatch` command with a few mandatory parameters + a specialized shell script will result in the script being run on a compute node. This "job" will not be accessible directly from the Terminal and will run in the background. Users do not need to remain connected to the cluster when such a "batch job" is running.
 46 | 
 47 | You will get practice with running batch jobs, for now we are going to start an interactive session on O2 using `srun`. 
 48 | 
 49 | ```bash
 50 | $ srun --pty -p interactive -t 0-8:00 --mem 1G --reservation=HBC2 /bin/bash
 51 | ```
 52 | 
 53 | In the above command the parameters we are using are requesting specific resources:
 54 | * `--pty` - Start an interactive session
 55 | * `-p interactive` - on the "partition" called "interactive" (a partition is a group of computers dedicated to certain types of jobs, interactive, long, short, high-memory, etc.)
 56 | * `-t 0-8:00` - time needed for this work: 0 days, 8 hours, 0 minutes.
 57 | * `--mem 1G` - memory needed - 1 gigabyte
 58 | * `--reservation=HBC2` - ***this is only for this workshop, make sure you don't use it in the future with your own accounts***
 59 | * `/bin/bash` - You want to interact with the compute node using the *bash* shell
 60 | 
 61 | > These resources are listed slightly differently in the specialized script that is submitted directly using `sbatch`. We will be reviewing the arguments above and what that specialized script looks like at the end of this lesson.
 62 | 
 63 | Make sure that your command prompt is now preceded by a character string that contains the word "compute":
 64 | 
 65 | ```
 66 | [rc_training10@compute-a-16-163 ~]$
 67 | ```
 68 | 
 69 | ### Copying example data folder
 70 | 
 71 | Your accounts were erased after the command-line workshop, so we are starting fresh this time, let's copy over the same data folder we worked with in the shell workshop to our home directories:
 72 | 
 73 | ```bash
 74 | $ cp -r /n/groups/hbctraining/unix_lesson/ .
 75 | ```
 76 | 
 77 | ****
 78 | 
 79 | **Exercise**
 80 | 
 81 | 1. In the above command, what does the `.` at the end mean? Is it essential?
 82 | 2. Why did we have to run the `cp` command with `-r`?
 83 | 3. Is the path to the `unix_lesson/` directory a "full" path or a "relative" path?
 84 | 
 85 | ****
 86 | 
 87 | ## Reviewing shell commands
 88 | 
 89 | We are going to start this review with more exercises, this time hands on! Remember, there are likely multiple ways to do the same thing and we will try to cover at least a few.
 90 | 
 91 | ****
 92 | 
 93 | **Exercises**
 94 | 
 95 | **Shell basics**
 96 | 
 97 | 1. Change directory into the `unix_lesson/` directory.
 98 | 2. Use the `tree` command to get a directory structure of `unix_lesson/`.
 99 | 3. Take a quick look at the `Mov10_oe_1.subset.fq` file using `less` from `unix_lesson/`, without changing directories.
100 | 4. Move up to your home directory (parent of `unix_lesson/`).
101 | 5. With a single command change directories to the `raw_fastq/` folder.
102 | 6. With the shortest possible command change directories back to the home directory.
103 | 7. What does the `~` in the command prompt mean?
104 | 8. What is the full path to your home directory?
105 | 9. List, in long listing format, the contents of `/n/groups/hbctraining/intro_rnaseq_hpc/full_dataset/` **using tab completion**.
106 | 10. Modify the above command using the `*` wildcard to only list those files that have "oe" in their names.
107 | 11. How many and which commands have you run so far today?
108 | 
109 | **Loops and shell scripts**
110 | 
111 | 16. Use the `for` loop to iterate over each FASTQ file in `~/unix_lesson/raw_fastq/` and do the following:
112 |       * Print the name of the current file
113 |       * Dump out the first 40 lines into a new file that will be saved in `~/unix_lesson/shell_review/`
114 | 17. Place the above `for` loop into a shell script using `vim` and run it.
115 | 
116 | **Environment variables**
117 | 
118 | 19. Display the contents of the `$HOME` variable.
119 | 20. Use the `which` command to check where the executable file for the `pwd` command lives in the directory structure.
120 | 21. How does shell know where to find the executable file for the `pwd` command?
121 | 22. Display the contents of the variable that stores the various paths to folders containing executable command files.
122 | 23. Can you run the `bowtie2` command? What do you think you might need to do to run this command?
123 | 
124 | **LMOD system**
125 | 
126 | 24. Load the `gcc/6.2.0` module.
127 | 25. Has `$PATH` changed? 
128 | 26. Load the `bowtie2/2.3.4.3` module.
129 | 27. List the modules that are loaded.
130 | 
131 | ****
132 | 
133 | ## Some setting up for the rest of the workshop
134 | 
135 | ### Add a path to `$PATH`
136 | 
137 | We need to use one tool that is unavailable as a module on O2, but it is available in a folder on O2, so we are going to add it to our `$PATH`. If we just add it using the `export` command, it will only be available to us in this specific interactive session. However, if we place that `export` command in a script that is run every time a new interactive session is started, we will not have to re-enter it for each session.
138 | 
139 | * Use `vim` to open `~/.bashrc`
140 | * Add the following line at the end of the file `export PATH=/n/app/bcbio/tools/bin:$PATH`
141 | * Save and quit out of `vim`
142 | 
143 | ### Resources on O2 and asking Slurm for them
144 | 
145 | Finally, let's review some of the information for O2 and slurm in [the rest of the slides](https://hbctraining.github.io/Intro-to-rnaseq-hpc-salmon/lectures/HPC_intro_O2_review.pdf)
146 | 
147 | ****
148 | 
149 | *This lesson has been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.*
150 | 


--------------------------------------------------------------------------------
/lessons/web_based_functional_analysis.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "ChIP-seq Functional Analysis"
 3 | author: "Mary Piper, Radhika Khetani"
 4 | date: "Thursday, June 29th, 2017"
 5 | ---
 6 | 
 7 | Approximate time: 40 minutes
 8 | 
 9 | ## Learning Objectives
10 | 
11 | * Explore web-based tools for functional enrichment analysis of the peak calls
12 | 
13 | ## Web-based Functional Enrichment: GREAT
14 | 
15 | <img src="../img/chip_workflow_march2018_step5.png" width="700">
16 | 
17 | We have identified regions of enrichment in the genome which represent the potential binding sites for Nanog and Pou5f1. 
18 | 
19 | After identifying likely binding sites, downstream analyses will often include: 
20 | 
21 | 1. Identifying which genes are associated with the binding sites 
22 | 2. Exploring whether there is any associated enrichment of processes, pathways, or networks
23 | 
24 | We will explore a web-based tool called [GREAT](http://great.stanford.edu/public/html/) for performing these analyses using our Nanog peak calls.
25 | 
26 | Since the functional enrichment analyses are unlikely to give reliable results using only the 32.8 Mb of reads mapping to chr12, we will use the **full set of peak calls output from the IDR analysis**.
27 | 
28 | ## Set-up
29 | 
30 | Start an interactive session:
31 | 
32 | ```bash
33 | $ srun --pty -p interactive -t 0-12:00 --mem 1G --reservation=HBC2 bash	
34 | ```
35 | 
36 | Extract the first three columns of the IDR peak calls for the whole genome of Nanog:
37 | 
38 | ```bash
39 | $ cd ~/chipseq/results
40 | 
41 | $ mkdir functional_analysis
42 | 
43 | $ cd functional_analysis
44 | 
45 | $ cp /n/groups/hbctraining/chip-seq/full-dataset/idr/*.bed .
46 | 
47 | $ cut -f 1,2,3 Nanog-idr-merged.bed  > Nanog-idr-merged-great.bed
48 | ```
49 | 
50 | 
51 | Using `scp` or **FileZilla** on your local computer, transfer `Nanog-idr-merged-great.bed` to your Desktop.
52 | 
53 | ```bash
54 | $ scp username@transfer.rc.hms.harvard.edu:~/chipseq/results/functional_analysis/*merged-* Desktop/
55 | ```
56 | 
57 | ## Functional enrichment analysis
58 | 
59 | We will use [GREAT](http://bejerano.stanford.edu/great/public/html/index.php) to perform the functional enrichment analysis. GREAT takes a list of regions, associates them with nearby genes, and then analyzes the gene annotations to assign biological meaning to the data.
60 | 
61 |  Open [GREAT](http://bejerano.stanford.edu/great/public/html/index.php), and perform the following steps:
62 | 
63 | 1. Choose the `Nanog-idr-merged-great.bed` file and use the `Whole genome` for Background regions. Click Submit. GREAT provides the output in HTML format organized by section.
64 | 
65 | 2. Expand the `Job Description` section. Click on `View all genomic region-gene associations`. Note that each associated gene is listed with its distance from the transcription start site as shown below:
66 | 
67 | 	![tss_gene](../img/tss_distance.png)
68 | 
69 | 	Within this section, you have the option to download the list of genes associated with Nanog binding sites or you could view all of the binding sites as a custom track in the UCSC Genome Browser.
70 | 	
71 | 3. Scroll down to the `Region-Gene Association Graphs`. Observe the graphics displaying the summary of the number of genes associated with each binding site and the binding site locations relative to the transcription start sites of the associated genes.
72 | 	
73 | 	![tss_graphs](../img/great_region_assoc.png)
74 | 
75 | 4. Below the `Region-Gene Association Graphs` are the `Global Controls`, where you can select the annotation information to display. Keep the default settings and scroll down to view the information displayed. 
76 | 
77 | 5. Explore the GO Biological Process terms associated with the Nanog binding sites. Notice the options available at the top of the tables for exporting data, changing settings, and visualization.
78 | 
79 | 	![annot](../img/great_annot.png)
80 | 	
81 | 	GREAT calculates two measures of statistical enrichment: "one using a binomial test over genomic regions and one using a hypergeometric test over genes" [[2](http://bejerano.stanford.edu/help/display/GREAT/Statistics)]. Each test has its own biases, which are compensated for by the other test. 
82 | 	
83 | 6. Click on the term `negative regulation of stem cell differentiation`:
84 | 
85 | 	![select_go](../img/great_selection_go.png)
86 | 	
87 | 	Note that summary information about the binding sites of Nanog for genes associated with this GO term is displayed.
88 | 	
89 | 7. Expand the section for `This term's genomic region-gene association tables`. Notice that you have the option to download the gene table.
90 | 
91 | 8. Click on `NOTCH1`. Explore the binding regions directly within the UCSC Genome Browser.
92 | 
93 | 
94 | 
95 | ***
96 | *This lesson has been developed by members of the teaching team at the [Harvard Chan Bioinformatics Core (HBC)](http://bioinformatics.sph.harvard.edu/). These are open access materials distributed under the terms of the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.*
97 | 
98 | 


--------------------------------------------------------------------------------
/samplesheet_chr12.csv:
--------------------------------------------------------------------------------
1 | SampleID,Factor,Replicate,bamReads,ControlID,bamControl,Peaks,PeakCaller,Tissue,Condition
2 | Nanog.Rep1,Nanog,1,data/bams/H1hesc_Nanog_Rep1_aln.bam,Nanog-Input1,data/bams/H1hesc_Input_Rep1_aln.bam,data/peakcalls/Nanog-rep1_peaks.narrowPeak,narrow,NA,NA
3 | Nanog.Rep2,Nanog,2,data/bams/H1hesc_Nanog_Rep2_aln.bam,Nanog-Input2,data/bams/H1hesc_Input_Rep2_aln.bam,data/peakcalls/Nanog-rep2_peaks.narrowPeak,narrow,NA,NA
4 | Pou5f1.Rep1,Pou5f1,1,data/bams/H1hesc_Pou5f1_Rep1_aln.bam,Pou5f1-Input1,data/bams/H1hesc_Input_Rep1_aln.bam,data/peakcalls/Pou5f1-rep1_peaks.narrowPeak,narrow,NA,NA
5 | Pou5f1.Rep2,Pou5f1,2,data/bams/H1hesc_Pou5f1_Rep2_aln.bam,Pou5f1-Input2,data/bams/H1hesc_Input_Rep2_aln.bam,data/peakcalls/Pou5f1-rep2_peaks.narrowPeak,narrow,NA,NA


--------------------------------------------------------------------------------
/schedule/2-day.md:
--------------------------------------------------------------------------------
 1 | # Workshop Schedule
 2 | 
 3 | 
 4 | ## Day 1
 5 | 
 6 | | Time            |   Topic  | Instructor |
 7 | |:------------------------:|:----------:|:--------:|
 8 | |9:00 - 9:15| [Introduction to Workshop](https://hbctraining.github.io/Intro-to-ChIPseq/lectures/Intro_to_workshop.pdf) | Radhika |
 9 | |9:15 - 10:00 | [Introduction to ChIP-seq](https://hbctraining.github.io/Intro-to-ChIPseq/lectures/Introduction%20to%20ChIP-seq%202019.pdf) | Dr. Shannan Ho Sui |
10 | |10:00 - 10:10 | [ChIP-seq Analysis Overview](https://hbctraining.github.io/Intro-to-ChIPseq/lectures/ChIP-seq_workflow_scope.pdf) | Meeta |
11 | |10:10 - 10:20 | Break | |
12 | |10:20 - 11:20 | [Unix review and Working in an HPC environment](https://hbctraining.github.io/Intro-to-ChIPseq/lessons/shell_review.html) | Radhika |
13 | |11:20 - 11:50 | [Project Organization and Data Management](https://hbctraining.github.io/Intro-to-ChIPseq/lessons/01_Intro_chipseq_data_organization.html) | Meeta |
14 | |11:50 - 12:50 | [Sequence Data QC using FastQC](https://hbctraining.github.io/Intro-to-ChIPseq/lessons/02_QC_FASTQC.html) | Mary |
15 | |12:50 - 13:50 | Lunch | |
16 | |13:50 - 14:05 | [Alignment theory](https://github.com/hbctraining/Intro-to-ChIPseq/blob/master/lectures/alignment_theory.pdf) | Meeta |
17 | |14:05 - 15:05 | [Alignment and filtering of reads](https://hbctraining.github.io/Intro-to-ChIPseq/lessons/03_align_and_filtering.html) | Mary |
18 | |15:05 - 15:15 | Break | |
19 | |15:15 - 16:15 | [Automating generation of alignment files](https://hbctraining.github.io/Intro-to-ChIPseq/lessons/04_automation.html) | Radhika |
20 | |16:15 - 16:30 | [ChIP-seq File Formats](https://hbctraining.github.io/Intro-to-ChIPseq/lectures/Fileformats.pdf) | Radhika |
21 | |16:30 - 17:00 | [Peak calling with MACS2](https://hbctraining.github.io/Intro-to-ChIPseq/lessons/05_peak_calling_macs.html) | Meeta |
22 | 
23 | 
24 | ## Day 2
25 | 
26 | | Time            |  Topic  | Instructor |
27 | |:------------------------:|:----------:|:--------:|
28 | |9:00 - 9:20 | [Peak calling with MACS2 (cont'd)](https://hbctraining.github.io/Intro-to-ChIPseq/lessons/05_peak_calling_macs.html) | Meeta |
29 | |9:20 - 10:40| [Assessing Peak calls and ChIP quality using ChIPQC](https://hbctraining.github.io/Intro-to-ChIPseq/lessons/06_combine_chipQC_and_metrics.html) | Mary |
30 | |10:40 - 10:50 | Break | |
31 | |10:50 - 11:25 | [Handling Replicates](https://hbctraining.github.io/Intro-to-ChIPseq/lessons/07_handling-replicates-idr.html) | Radhika |
32 | |11:25 - 12:10 | [Differentially enriched peaks using DiffBind](https://hbctraining.github.io/Intro-to-ChIPseq/lessons/08_diffbind_differential_peaks.html) | Meeta |
33 | |12:10 - 13:10 | Lunch | |
34 | |13:10 - 14:00 | [Visualization and exploration of ChIP-seq data](https://hbctraining.github.io/Intro-to-ChIPseq/lessons/10_data_visualization.html) | Meeta |
35 | |14:00 - 14:45 | [Qualitative assessment using IGV](https://hbctraining.github.io/Intro-to-ChIPseq/lessons/11_qualitative_assessment_IGV.html) | Radhika |
36 | |14:45 - 14:55 | Break | |
37 | |14:55 - 16:15 | [Functional analysis and Motif Analysis](https://hbctraining.github.io/Intro-to-ChIPseq/lessons/12_functional_analysis.html) | Mary |
38 | |16:15 - 16:45| [Overview of ChIP-seq workflow](https://github.com/hbctraining/Intro-to-ChIPseq/blob/master/lectures/ChIP-seq_troubleshooting_2019.pdf) | Meeta |
39 | |16:45 - 17:00| [Wrap-up and Survey](https://hbctraining.github.io/Intro-to-ChIPseq/lectures/Wrap-up_new.pdf) | Radhika |
40 | 


--------------------------------------------------------------------------------
/schedule/3-day.md:
--------------------------------------------------------------------------------
 1 | # Workshop Schedule
 2 | 
 3 | ## Day 1
 4 | 
 5 | | Time            |  Topic  | Instructor |
 6 | |:------------------------:|:------------------------------------------------:|:--------:|
 7 | |9:00 - 9:40 | [Workshop Introduction](https://hbctraining.github.io/Intro-to-ChIPseq/lectures/Intro_to_workshop.pdf) | Radhika |
 8 | |9:40 - 10:30 | [Introduction to the Shell](https://hbctraining.github.io/Intro-to-Shell/lessons/01_the_filesystem.html) | Radhika |
 9 | |10:30 - 10:45 | Break | |
10 | |10:45 - 11:35 | [Introduction to the Shell (cont.)](https://hbctraining.github.io/Intro-to-Shell/lessons/01_the_filesystem.html) | Meeta |
11 | |11:35 - 12:15 | [Searching and Redirection](https://hbctraining.github.io/Intro-to-Shell/lessons/02_searching_files.html) | Mary |
12 | |12:15 - 13:15 | Lunch | |
13 | |13:15 - 13:45 | [Introduction to the Vim Text Editor](https://hbctraining.github.io/Intro-to-Shell/lessons/03_vim.html) | Mary |
14 | |13:45 - 15:00 | [Loops and Shell Scripts](https://hbctraining.github.io/Intro-to-Shell/lessons/04_loops_and_scripts.html) | Meeta |
15 | |15:00 - 15:15 | Break | |
16 | |15:15 - 15:45 | [Permissions and Environment Variables](https://hbctraining.github.io/Intro-to-Shell/lessons/05_permissions_and_environment_variables.html) | Radhika |
17 | |15:45 - 17:00 | [Introduction to High-Performance Computing for HMS-RC's O2](https://hbctraining.github.io/Intro-to-rnaseq-hpc-O2/lectures/HPC_intro_O2.pdf) | Radhika |
18 | 
19 | ## Day 2
20 | 
21 | | Time            |   Topic  | Instructor |
22 | |:------------------------:|:----------:|:--------:|
23 | |9:00 - 10:00 | [Introduction to ChIP-seq](https://github.com/hbctraining/Intro-to-ChIPseq/blob/master/lectures/Introduction_to_ChIP-seq.pdf) | Meeta |
24 | |10:00 - 10:15 | Break | |
25 | |10:15 - 11:00 | [Project Organization and Best Practices in Data Management](https://hbctraining.github.io/Intro-to-ChIPseq/lessons/01_Intro_chipseq_data_organization.html) | Radhika |
26 | |11:00 - 11:50 | [Sequencing data QC using FastQC](https://hbctraining.github.io/Intro-to-ChIPseq/lessons/02_QC_FASTQC.html) | Mary |
27 | |11:50 - 12:15 | [Alignment theory](https://github.com/hbctraining/Intro-to-ChIPseq/blob/master/lectures/alignment_theory.pdf) | Meeta |
28 | |12:15 - 13:15 | Lunch | |
29 | |13:15 - 14:00 | [Alignment and filtering of reads](https://hbctraining.github.io/Intro-to-ChIPseq/lessons/03_align_and_filtering.html) | Mary |
30 | |14:00 - 15:20 | [Automating generation of alignment files](https://hbctraining.github.io/Intro-to-ChIPseq/lessons/04_automation.html) | Radhika |
31 | |15:20 - 15:35 | Break | |
32 | |15:35 - 15:55 | [ChIP-seq File Formats](https://hbctraining.github.io/Intro-to-ChIPseq/lectures/Workflows_and_fileformats.pdf) | Radhika |
33 | |15:55 - 17:00 | [Peak calling with MACS2](https://hbctraining.github.io/Intro-to-ChIPseq/lessons/05_peak_calling_macs.html) | Meeta |
34 | 
35 | 
36 | ## Day 3
37 | 
38 | | Time            |  Topic  | Instructor |
39 | |:------------------------:|:----------:|:--------:|
40 | |9:00 - 9:20 | [Peak calling with MACS2 (contd.)](https://hbctraining.github.io/Intro-to-ChIPseq/lessons/05_peak_calling_macs.html) | Meeta |
41 | |9:20 - 10:20 | [Assessing ChIP quality using cross correlation](https://hbctraining.github.io/Intro-to-ChIPseq/lessons/06_QC_cross_correlation.html) | Kayleigh |
42 | |10:20 - 10:35 | Break | |
43 | |10:35 - 11:35| [Assessing Peak calls and ChIP quality using ChIPQC](https://hbctraining.github.io/Intro-to-ChIPseq/lessons/07_QC_quality_metrics.html) | Meeta |
44 | |11:35 - 12:00 | [Handling Replicates with Bedtools](https://hbctraining.github.io/Intro-to-ChIPseq/lessons/08_handling-replicates-bedtools.html) | Radhika |
45 | |12:00 - 13:00 | Lunch | |
46 | |13:00 - 13:50 | [Handling Replicates with IDR](https://hbctraining.github.io/Intro-to-ChIPseq/lessons/09_handling-replicates-idr.html) | Radhika |
47 | |13:50 - 15:10 | [Visualization and exploration of ChIP-seq data](https://hbctraining.github.io/Intro-to-ChIPseq/lessons/10_data_visualization.html) | Meeta |
48 | |15:10 - 15:25 | Break | |
49 | |15:25 - 15:50 | [Qualitative assessment using IGV](https://hbctraining.github.io/Intro-to-ChIPseq/lessons/11_qualitative_assessment_IGV.html) | Radhika |
50 | |15:50 - 16:20 | [Functional analysis](https://hbctraining.github.io/Intro-to-ChIPseq/lessons/12_functional_analysis.html) | Kayleigh |
51 | |16:20 - 16:40 | [Overview of ChIP-seq workflow](https://hbctraining.github.io/Intro-to-ChIPseq/lectures/ChIP-seq_troubleshooting.pdf) | Meeta |
52 | |16:40 - 17:00 | [Wrap-up and Survey](https://hbctraining.github.io/Intro-to-ChIPseq/lectures/Wrap-up.pdf) | Radhika |
53 | 
54 | **Dataset:** [Introduction to shell dataset](https://www.dropbox.com/s/3lua2h1oo18gbug/unix_lesson.tar.gz?dl=1)
55 | 


--------------------------------------------------------------------------------