├── .DS_Store
├── DBGenerator.py
├── LICENSE
├── PathoScope_Tutorial.md
├── README.md
├── core_gene_alignment.nwk
├── genomes
    ├── GCA_000008285.1_ASM828v1_genomic.fna
    ├── GCA_000021185.1_ASM2118v1_genomic.fna
    ├── GCA_000026705.1_ASM2670v1_genomic.fna
    ├── GCA_000168635.2_ASM16863v2_genomic.fna
    ├── GCA_000168815.1_ASM16881v1_genomic.fna
    └── GCA_000196035.1_ASM19603v1_genomic.fna
├── img
    ├── .DS_Store
    ├── Screenshot 2016-09-21 23.00.22.png
    ├── cbib.png
    ├── core_gene_alignment.tre.png
    ├── geneious.png
    ├── genomes.png
    ├── lactam.png
    ├── lactam2.png
    ├── microgenomics.png
    ├── pangenome_frequency.png
    ├── pangenome_matrix.png
    ├── pangenome_pie.png
    ├── prokkaterm.png
    ├── quast.png
    ├── spades.png
    ├── sra01.png
    ├── sra02.png
    ├── sra022.png
    ├── sra03.png
    ├── sra04.png
    ├── sra05.png
    ├── sra06.png
    ├── struc.png
    ├── term.png
    └── unab.jpg
├── infmed.md
├── pangenome.md
└── sra.md


/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microgenomics/tutorials/cdc7c10b2be779650607ee89109713c6bf8e5277/.DS_Store


--------------------------------------------------------------------------------
/DBGenerator.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | import csv
  5 | import re
  6 | from glob import glob
  7 | from operator import itemgetter
  8 | from multiprocessing import Process
  9 | from sys import argv
 10 | 
 11 | 
 12 | def get_genomas_locus():
 13 |     out_csv = open('genomas_locus.csv', 'w')
 14 |     print('Starting get_genomas_locus')
 15 |     genomas_locus = [tuple(filter(None, tuple(map(itemgetter(i), lista))))  # obtiene la columna de todos los locus de cada genoma
 16 |                      for i in range(14, len(lista[0]))]
 17 |     for genoma in genomas_locus:  # para cada columna haz
 18 |         for locus in genoma[1:]:  # genoma[0] es el nombre del genoma y lo demas son los locus
 19 |             locus = locus.split()  # corta los espacios en blanco
 20 |             if len(locus) == 1:  # revisa que solo tenga un locus
 21 |                 locus = locus[0]
 22 |                 print('{}|{}'.format(genoma[0], locus), file=out_csv)
 23 |             else:  # tiene mas de un locus en esa celda
 24 |                 for locus in loci:  # separalos!
 25 |                     print('{}|{}'.format(genoma[0], locus), file=out_csv)
 26 |     out_csv.close()
 27 |     print('END get_genomas_locus')
 28 | 
 29 | 
 30 | def get_pangenoma():  # parsea el gene_presence_absence.csv
 31 |     out_csv = open('pangenoma.csv', 'w')
 32 |     print('Starting get_pangenoma')
 33 |     for row in lista[1:]:
 34 |         Gene = row[0]
 35 |         Non_unique_Gene_name = row[1]
 36 |         Annotation = row[2]
 37 |         No_isolates = row[3]
 38 |         No_sequences = row[4]
 39 |         Avg_sequences_per_isolate = row[5]
 40 |         Genome_Fragment = row[6]
 41 |         Order_within_Fragment = row[7]
 42 |         Accessory_Fragment = row[8]
 43 |         Accessory_Order_with_Fragment = row[9]
 44 |         QC = row[10]
 45 |         Min_group_size_nuc = row[11]
 46 |         Max_group_size_nuc = row[12]
 47 |         Avg_group_size_nuc = row[13]
 48 |         print('{}|{}|{}|{}|{}|{}|{}|{}|{}|{}|{}|{}|{}|{}'.format(Gene,
 49 |             Non_unique_Gene_name, Annotation, No_isolates, No_sequences,
 50 |             Avg_sequences_per_isolate, Genome_Fragment, Order_within_Fragment,
 51 |             Accessory_Fragment, Accessory_Order_with_Fragment, QC,
 52 |             Min_group_size_nuc, Max_group_size_nuc, Avg_group_size_nuc), file=out_csv)
 53 |     out_csv.close()
 54 |     print('END get_pangenoma')
 55 | 
 56 | 
 57 | def get_pangenoma_locus():
 58 |     out_csv = open('pangenoma_locus.csv', 'w')
 59 |     print('Starting get_pangenoma_locus')
 60 |     for row in lista[1:]:  # como la lista tiene encabezados hay que partri del segundo
 61 |         Gene = row[0]  # getea el nombre del gen
 62 |         loci = row[14:]  # y de los locis
 63 |         for locus in loci:
 64 |             locus = locus.split()
 65 |             if len(locus) == 1:  # tiene solo un locus
 66 |                 locus = locus[0]
 67 |                 print('{}|{}'.format(Gene, locus), file=out_csv)
 68 |             else:  # tiene mas de un locus
 69 |                 for l in locus:
 70 |                     print('{}|{}'.format(Gene, l), file=out_csv)
 71 |     out_csv.close()
 72 |     print('END get_pangenoma_locus')
 73 | 
 74 | 
 75 | def get_locus_sequence():
 76 |     out_csv = open('locus_sequence.csv', 'w')
 77 |     print('Starting get_locus_sequence')
 78 |     ffns = glob('{}/*.ffn'.format(argv[1]))  # genera la lista de todos los ffn entregados por PROKKA, prokka_ es el prefijo de los directorios anotados por prokka
 79 |     p = re.compile(r'>(\w+).*')  # regex para encontrar los locus en formato fasta
 80 |     genomas_locus = open('genomas_locus.csv')  # es necesario tener el csv listo
 81 |     reader = csv.reader(genomas_locus, delimiter='|')
 82 |     lista_genomas_locus = [row for row in reader]  # lista de TODOS los locus
 83 | 
 84 |     for ffn in ffns:  # por cada archivo de secuencia
 85 |         archivo = open(ffn)
 86 |         reader = archivo.readlines()
 87 |         parsed = []
 88 |         codigo = ffn.split('/')[-1].split('.')[0]  # es el codigo del archivo ffn
 89 |         db = [x[1] for x in lista_genomas_locus if codigo in x[0]]  # todos los locus con el hash unico dado por roary del genoma espesifico si el codigo del archivo fnn es el codigo del genoma del locus, agregalo
 90 |         # todo este bloque es para obtener la secuencia sin saltos de linea
 91 |         # y solo un saldo de linea antes del > en el fasta
 92 |         for linea in reader:
 93 |             if '>' in linea:
 94 |                 parsed.append(linea)
 95 |             else:
 96 |                 parsed.append(linea.strip())
 97 |         string = p.sub(r'\n>\1', ''.join(parsed))
 98 |         # fin del bloque magico
 99 |         lista_locus = string.split('>')  # lista de los locus y su secuencia
100 |         lista_locus = [x.split() for x in lista_locus]  # lista de la forma [[locus, secuencia],...]
101 |         for locus in lista_locus[1:]:  # para cada uno de todos los locus (se usa [1:] para saltar la cabecera del csv)
102 |             codp = re.compile(locus[0])  # regex para buscar el hash asociado al locus de roary
103 |             search = [codp.search(x) for x in db]
104 |             search = tuple(filter(None, search))  # elimina los None
105 |             if len(search) == 1:  # si hay una coincidencia, es que se encontro
106 |                 search = search[0].string
107 |                 print('{}|{}'.format(search, locus[-1]), file=out_csv)
108 |             elif len(search) == 0:
109 |                 pass  # no hay resultados del locus en la db
110 |             else:
111 |                 print(locus)
112 |                 raise
113 |     out_csv.close()
114 |     print('END get_locus_sequence')
115 | 
116 | 
if __name__ == '__main__':
    # Script entry point: load gene_presence_absence.csv into the global
    # ``lista`` and generate the four output CSV files.
    import multiprocessing

    # Check the length first: indexing argv[1] when no argument was given
    # raises IndexError before the usage message can be printed.
    if len(argv) < 2 or not argv[1]:
        print('Se necesita pasar el directorio de los ffns como primer y unico argumento')
        raise SystemExit(1)

    # newline='' is what the csv module recommends for files handed to
    # csv.reader; the context manager closes the handle once loaded.
    with open('gene_presence_absence.csv', newline='') as csvfile:
        lista = list(csv.reader(csvfile))
    LISTA_GENOMAS = tuple(lista[0][14:])
    APIGFF = 'https://www.patricbrc.org/portal/portal/patric/Genome?cType=genome&cId={}'

    if 'fork' in multiprocessing.get_all_start_methods():
        # The worker functions read the global ``lista`` defined above, and
        # only forked children inherit it.  Under "spawn" this block is not
        # re-executed in the child, so ``lista`` would be undefined there.
        ctx = multiprocessing.get_context('fork')
        process_get_genomas_locus = ctx.Process(target=get_genomas_locus)
        process_get_pangenoma = ctx.Process(target=get_pangenoma)
        process_get_pangenoma_locus = ctx.Process(target=get_pangenoma_locus)
        process_get_locus_sequence = ctx.Process(target=get_locus_sequence)

        process_get_pangenoma.start()
        process_get_pangenoma_locus.start()
        process_get_genomas_locus.start()
        # get_locus_sequence reads genomas_locus.csv, so wait until the
        # process that writes it has finished.
        process_get_genomas_locus.join()
        process_get_locus_sequence.start()

        workers = (process_get_pangenoma, process_get_pangenoma_locus,
                   process_get_genomas_locus, process_get_locus_sequence)
        for worker in workers:
            worker.join()
        # Report a failing worker through the exit status of the script.
        if any(worker.exitcode != 0 for worker in workers):
            raise SystemExit(1)
    else:
        # No fork available on this platform: run everything in this
        # process, keeping the genomas_locus -> locus_sequence ordering.
        get_pangenoma()
        get_pangenoma_locus()
        get_genomas_locus()
        get_locus_sequence()
138 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2015 Center for Bioinformatics and Integrative Biology
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
23 | 


--------------------------------------------------------------------------------
/PathoScope_Tutorial.md:
--------------------------------------------------------------------------------
  1 | ![banner](https://raw.githubusercontent.com/microgenomics/tutorials/master/img/microgenomics.png)
  2 | 
  3 | # PathoScope Tutorial
  4 | -------------------------
  5 | 
  6 | ##### In this demo we will explore how to get a taxonomic profile from a metagenomic experiment using PathoScope 2.0. A more in-depth tutorial can be found in the **PathoScope** repo [here](https://github.com/PathoScope/PathoScope/raw/master/pathoscope2.0_v0.02_tutorial.pdf)
  7 | 
  8 | ### What PathoScope can do for you
  9 | **PathoScope** is a modular piece of software that will allow you to go all the way from a fastq file to a text file (typically tab-delimited) with columns representing genomes, their proportions, etc.  
 10 | There are 6 **PathoScope modules**, however, for this demo we will focus on the three most important ones:
 11 | - ***PathoLib*** - Allows user to automatically generate custom reference genome libraries for specific scenarios or datasets
 12 | - ***PathoMap*** - Aligns reads to target reference genome library and removes sequences that align to the filter and host libraries
 13 | - ***PathoID*** - Reassigns ambiguous reads, identifies microbial strains present in the sample, and estimates proportions of reads from each genome  
 14 | 
 15 | Once you run your samples through **PathoScope**, you can easily import the output files into R for downstream exploratory data analysis and statistical inferences.
 16 | 
 17 | ### PathoScope Dependencies
 18 | The only dependencies for **PathoScope** are [*Bowtie2*](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml) and [python](https://www.python.org) *2.7.3* or higher. Make sure that both are in your PATH by issuing something like `echo $PATH`
 19 | 
 20 | ### Installing PathoScope and PS_demo
 21 | PathoScope is now hosted in GitHub so you can easily get it by issuing the following command from the Terminal  
 22 | 
 23 | 		git clone https://github.com/PathoScope/PathoScope.git
 24 | 
 25 | And PS_demo
 26 | 
 27 | 		git clone https://github.com/ecastron/PS_demo.git
 28 | 
 29 | ### Get data and get reference genomes
 30 | We are going to use data from a study exploring microbiome diversity in oropharyngeal swabs from schizophrenia patients and healthy controls. The SRA accession number is `SRR1519057`. 
 31 | 
 32 | ![SRA](https://github.com/ecastron/PS_demo/raw/master/img/img01.png)
 33 | 
 34 | This file is probably too big for a demo so I randomly subsampled the reads down to a more manageable size (~40 M to 40 K reads)  
 35 | - Go ahead and download the data [here](https://raw.githubusercontent.com/ecastron/PS_demo/master/ES_211.fastq)
 36 | - Now you need at least two files, one to be used as target library (where your reads are going to be mapped) and another one to be used as filter library (internal controls, host genome, contaminants, etc. that you want to remove)
 37 | 
 38 | As target library, you can use any multi fasta file containing full or draft genomes, or even nucleotide entries from NCBI, and combinations of both. The only condition is that the fasta entries start with the taxonomy ID from NCBI as follows:
 39 | 
 40 | Originally:  
 41 | \>gi|40555938|ref|NC_005309.1| Canarypox virus, complete genome  
 42 | 
 43 | but PathoScope likes:  
 44 | \>ti|44088|gi|40555938|ref|NC_005309.1| Canarypox virus, complete genome  
 45 | 
 46 | You could do this very easily in **PathoLib**:
 47 | 
 48 | 		python pathoscope.py LIB -genomeFile my_file.fasta -outPrefix target_library
 49 | 
 50 | Alternatively, we provide the entire NCBI nucleotide database already formatted [here](ftp://pathoscope.bumc.bu.edu/data/nt_ti.fa.gz) (10 GB file). You could also use **PathoLib** to subsample this big file (50 GB uncompressed) and select only the taxa that you want. For instance, to obtain all the virus entries in nt_ti.fa (virus taxonomy ID = 10239):
 51 | 
 52 | 		python pathoscope.py LIB -genomeFile nt_ti.fa -taxonIds 10239 --subTax -outPrefix virus
 53 | 
 54 | Or in order to create a filter library, say all human sequences:
 55 | 		
 56 | 		python pathoscope.py LIB -genomeFile nt_ti.fa -taxonIds 9606 --subTax -outPrefix human
 57 | 
 58 | However, I'm providing a target and filter library already formatted that you can download [here](https://www.dropbox.com/sh/3kjvec5lizwmo9l/AAAQmHEAwAfDixGtKC6eTeN1a?dl=0) and [here](https://www.dropbox.com/sh/2gzurlubxkzku0p/AADEIiGDig00FhNpu6XZ2iYaa?dl=0). The target library is a collection of genomes from the reference library of the Human Microbiome Project (description [here](http://hmpdacc.org/HMREFG/)), and the filter library is simply the human genome (hg19). We are also going to use another filter library as well ([phix174](https://www.dropbox.com/sh/9mt2a2v2xdqpj6x/AABgKTPNfwPNO7DpKjo56gdpa?dl=0)) to get rid of all the reads mapping to the Illumina internal control sequence that is sometimes added to sequencing experiments.
 59 | 
 60 | ### Let's map the reads
 61 | Once you have your data and target and filter libraries, we are ready to go ahead with the mapping step. For this we use bowtie2 so we will need to tell **PathoMap** where the bowtie2 indices are. If you don't have bowtie2 indices, not a problem, **PathoMap** will create them for you. And if your fasta files are larger than 4.6 GB (Bowtie2 limit), not a problem either, **PathoMap** will split your fasta files and create indices for each one of the resulting files.
 62 | 
 63 | If you have fasta files and not bowtie2 indices:
 64 | 
 65 | 		python pathoscope.py MAP -U ES_211.fastq -targetRefFiles HMP_ref_ti_0.fa,HMP_ref_ti_1.fa -filterRefFiles human.fa,phix174.fa  -outDir . -outAlign ES_211.sam  -expTag DAV_demo
 66 | 
 67 | But if you already have Bowtie2 indices (our case), you can issue the following command:
 68 | 
 69 | 		python pathoscope.py MAP -U ES_211.fastq -targetIndexPrefixes HMP_ref_ti_0,HMP_ref_ti_1 -filterIndexPrefixes genome,phix174  -outDir . -outAlign ES_211.sam  -expTag DAV_demo
 70 | 
 71 | Let's give it a try...
 72 | 
 73 | ![map](https://github.com/ecastron/PS_demo/raw/master/img/pathomap.png)
 74 | 
 75 | So that should have taken ~3 minutes to run. Now you have a number of things that were printed to the screen as well as files that were created. The summary of the STDOUT is:
 76 | 
 77 | | Reads Mapped  | Library  | 
 78 | |:------------- | ---------------:|
 79 | | 1053      | HMP\_ref\_ti\_0 |
 80 | | 1132      | HMP\_ref\_ti\_1 |
 81 | | 916 | genome |
 82 | | 0 | phix174 |
 83 | 
 84 | And you should have one .sam file per library, plus another file containing the reads mapped to all target libraries (DAV\_demo-appendAlign.sam), a fastq file of the reads mapping to all targets (DAV\_demo-appendAlign.fq), and the file you most care about: ES_211.sam
 85 | 
 86 | ![mapout](https://github.com/ecastron/PS_demo/raw/master/img/mapout.png)
 87 | 
 88 | ### Let's get a taxonomic profile from our .sam file
 89 | The last step in our demo is to obtain a taxonomic profile from ES_211.sam using the read reassignment model implemented in **PathoID**
 90 | 
 91 | 		python pathoscope.py ID -alignFile ES_211.sam -fileType sam -outDir . -expTag DAV -thetaPrior 1000000
 92 | 
 93 | After running the command line above, you should get a tab-delimited file with **PathoScope's** output, and an updated .sam file representing an alignment after **PathoScope's** reassignment model was applied.  
 94 | If you want to see all the output files you should get, check out the *output_files* directory in the PS\_demo repo.
 95 | 
 96 | ### Output TSV file format
 97 | 
 98 | At the top of the file in the first row, there are two fields called "Total Number of Aligned Reads" and "Total Number of Mapped Genomes". They represent the total number of reads that are aligned and the total number of genomes to which those reads align in the given alignment file.
 99 | 
100 | Columns in the TSV file:
101 | 
102 | 1. **Genome:**  
103 | This is the name of the genome found in the alignment file.
104 | 2. **Final Guess:**  
105 | This represents the percentage of reads that are mapped to the genome in Column 1 (reads aligning to multiple genomes are assigned proportionally) after pathoscope reassignment is performed.
106 | 3. **Final Best Hit:**  
107 | This represents the percentage of reads that are mapped to the genome in Column 1 after assigning each read uniquely to the genome with the highest score and after pathoscope reassignment is performed.
108 | 4. **Final Best Hit Read Numbers:**  
109 | This represents the number of best hit reads that are mapped to the genome in Column 1 (may include a fraction when a read is aligned to multiple top hit genomes with the same highest score) and after pathoscope reassignment is performed.
110 | 5. **Final high confidence hits:**  
111 | This represents the percentage of reads that are mapped to the genome in Column 1 with an alignment hit score of 50%-100% to this genome and after pathoscope reassignment is performed.
112 | 6. **Final low confidence hits:**  
113 | This represents the percentage of reads that are mapped to the genome in Column 1 with an alignment hit score of 1%-50% to this genome and after pathoscope reassignment is performed.
114 | 7. **Initial Guess:**  
115 | This represents the percentage of reads that are mapped to the genome in Column 1 (reads aligning to multiple genomes are assigned proportionally) before pathoscope reassignment is performed.
116 | 8. **Initial Best Hit:**  
117 | This represents the percentage of reads that are mapped to the genome in Column 1 after assigning each read uniquely to the genome with the highest score and before pathoscope reassignment is performed.
118 | 9. **Initial Best Hit Read Numbers:**  
119 | This represents the number of best hit reads that are mapped to the genome in Column 1 (may include a fraction when a read is aligned to multiple top hit genomes with the same highest score) and before pathoscope reassignment is performed.
120 | 10. **Initial high confidence hits:**  
121 | This represents the percentage of reads that are mapped to the genome in Column 1 with an alignment hit score of 50%-100% to this genome and before pathoscope reassignment is performed.
122 | 11. **Initial low confidence hits:**  
123 | This represents the percentage of reads that are mapped to the genome in Column 1 with an alignment hit score of 1%-50% to this genome and before pathoscope reassignment is performed.
124 | 
125 | ### Ready for the next demo?
126 | 
127 | Let's see an example analysis on data generated in **PathoScope** [here](https://github.com/ecastron/PS_demo/blob/master/demo02.md)
128 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ![banner](https://raw.githubusercontent.com/microgenomics/tutorials/master/img/microgenomics.png)
 2 | 
 3 | # Tutorials and demos
 4 | -------------------------
 5 | 
 6 | Welcome to the Microbial Genomics Lab. In this repo you will find demos, protocols, and tutorials by members of the Castro Lab at the Center for Bioinformatics and Integrative Biology (CBIB). Feel free to use them for non-commercial activities, provided that you give proper credit. Have fun!
 7 | 
 8 | ### List of materials
 9 | 
10 | * [Running PathoScope2](https://github.com/microgenomics/tutorials/blob/master/PathoScope_Tutorial.md)
11 | * [How to perform a pangenome analysis using Roary](https://github.com/microgenomics/tutorials/blob/master/pangenome.md)
12 | * [How to perform basic exploratory analyses with PathoScope2's output](https://github.com/ecastron/PS_demo/blob/master/demo02.md)
13 | * [How to download data from the Sequence Read Archive](https://github.com/microgenomics/tutorials/blob/master/sra.md)
14 | 
15 | 


--------------------------------------------------------------------------------
/core_gene_alignment.nwk:
--------------------------------------------------------------------------------
1 | ((GCA_000196035:0.00732,GCA_000168635:0.0052270000000000025):0.028753499999999994,(GCA_000021185:0.030983999999999998,(GCA_000026705:0.0012090000000000017,(GCA_000168815:0.0019479999999999983,GCA_000008285:0.0015220000000000025):7.160000000000014E-4):0.017212000000000005):0.005089500000000004);


--------------------------------------------------------------------------------
/img/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microgenomics/tutorials/cdc7c10b2be779650607ee89109713c6bf8e5277/img/.DS_Store


--------------------------------------------------------------------------------
/img/Screenshot 2016-09-21 23.00.22.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microgenomics/tutorials/cdc7c10b2be779650607ee89109713c6bf8e5277/img/Screenshot 2016-09-21 23.00.22.png


--------------------------------------------------------------------------------
/img/cbib.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microgenomics/tutorials/cdc7c10b2be779650607ee89109713c6bf8e5277/img/cbib.png


--------------------------------------------------------------------------------
/img/core_gene_alignment.tre.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microgenomics/tutorials/cdc7c10b2be779650607ee89109713c6bf8e5277/img/core_gene_alignment.tre.png


--------------------------------------------------------------------------------
/img/geneious.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microgenomics/tutorials/cdc7c10b2be779650607ee89109713c6bf8e5277/img/geneious.png


--------------------------------------------------------------------------------
/img/genomes.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microgenomics/tutorials/cdc7c10b2be779650607ee89109713c6bf8e5277/img/genomes.png


--------------------------------------------------------------------------------
/img/lactam.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microgenomics/tutorials/cdc7c10b2be779650607ee89109713c6bf8e5277/img/lactam.png


--------------------------------------------------------------------------------
/img/lactam2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microgenomics/tutorials/cdc7c10b2be779650607ee89109713c6bf8e5277/img/lactam2.png


--------------------------------------------------------------------------------
/img/microgenomics.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microgenomics/tutorials/cdc7c10b2be779650607ee89109713c6bf8e5277/img/microgenomics.png


--------------------------------------------------------------------------------
/img/pangenome_frequency.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microgenomics/tutorials/cdc7c10b2be779650607ee89109713c6bf8e5277/img/pangenome_frequency.png


--------------------------------------------------------------------------------
/img/pangenome_matrix.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microgenomics/tutorials/cdc7c10b2be779650607ee89109713c6bf8e5277/img/pangenome_matrix.png


--------------------------------------------------------------------------------
/img/pangenome_pie.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microgenomics/tutorials/cdc7c10b2be779650607ee89109713c6bf8e5277/img/pangenome_pie.png


--------------------------------------------------------------------------------
/img/prokkaterm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microgenomics/tutorials/cdc7c10b2be779650607ee89109713c6bf8e5277/img/prokkaterm.png


--------------------------------------------------------------------------------
/img/quast.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microgenomics/tutorials/cdc7c10b2be779650607ee89109713c6bf8e5277/img/quast.png


--------------------------------------------------------------------------------
/img/spades.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microgenomics/tutorials/cdc7c10b2be779650607ee89109713c6bf8e5277/img/spades.png


--------------------------------------------------------------------------------
/img/sra01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microgenomics/tutorials/cdc7c10b2be779650607ee89109713c6bf8e5277/img/sra01.png


--------------------------------------------------------------------------------
/img/sra02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microgenomics/tutorials/cdc7c10b2be779650607ee89109713c6bf8e5277/img/sra02.png


--------------------------------------------------------------------------------
/img/sra022.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microgenomics/tutorials/cdc7c10b2be779650607ee89109713c6bf8e5277/img/sra022.png


--------------------------------------------------------------------------------
/img/sra03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microgenomics/tutorials/cdc7c10b2be779650607ee89109713c6bf8e5277/img/sra03.png


--------------------------------------------------------------------------------
/img/sra04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microgenomics/tutorials/cdc7c10b2be779650607ee89109713c6bf8e5277/img/sra04.png


--------------------------------------------------------------------------------
/img/sra05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microgenomics/tutorials/cdc7c10b2be779650607ee89109713c6bf8e5277/img/sra05.png


--------------------------------------------------------------------------------
/img/sra06.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microgenomics/tutorials/cdc7c10b2be779650607ee89109713c6bf8e5277/img/sra06.png


--------------------------------------------------------------------------------
/img/struc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microgenomics/tutorials/cdc7c10b2be779650607ee89109713c6bf8e5277/img/struc.png


--------------------------------------------------------------------------------
/img/term.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microgenomics/tutorials/cdc7c10b2be779650607ee89109713c6bf8e5277/img/term.png


--------------------------------------------------------------------------------
/img/unab.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microgenomics/tutorials/cdc7c10b2be779650607ee89109713c6bf8e5277/img/unab.jpg


--------------------------------------------------------------------------------
/infmed.md:
--------------------------------------------------------------------------------
  1 | # Laboratorio de Genómica para Informática Médica
  2 | -
  3 | La actividad del día de hoy consiste en ganar experiencia directa con datos de secuenciamiento masivo y algunos pasos básicos para transformar esos datos en información biológica útil.  
  4 | 
  5 | Vamos a trabajar directamente con reads producidas en un equipo Illumina [HiSeq 2500](http://www.illumina.com/systems/hiseq_2500_1500.html). Las reads corresponden a un genoma de *Escherichia coli* aislado de un paciente con una infección del tracto urinario. Un punto esencial para tratar este tipo de infección es saber si existen genes de resistencia a antibióticos presentes en el genoma y en contra de qué antibióticos actúan.  
  6 | 
  7 | Echemos un vistazo a la estructura del genoma de [*E. coli*](http://www.ncbi.nlm.nih.gov/genome/?term=escherichia%20coli).  
  8 | 
  9 | 		¿Cuántos genes tiene?
 10 | 		¿Cuál es la longitud del genoma?
 11 | 		¿Cuál es el %GC?
 12 | 
 13 | Estas estadísticas nos van a ayudar a comparar nuestro genoma una vez ensamblado con "lo que deberíamos obtener". Ahora, en resumen, lo que vamos a hacer hoy es:  
 14 | 
 15 | * Descargar y comprobar la calidad de las reads usando el programa [FastQC](http://www.bioinformatics.babraham.ac.uk/projects/download.html).  
 16 | * Ensamblar el genoma usando el método de [De Bruijn Graphs](http://www.nature.com/nbt/journal/v29/n11/pdf/nbt.2023.pdf) implementado en el programa [SPAdes](http://bioinf.spbau.ru/spades).  
 17 | * Evaluar y comparar el ensamble al genoma de referencia de *E. coli* usando el servidor web [QUAST](http://quast.bioinf.spbau.ru).
 18 | * Navegar a través de los contigs y anotaciones para buscar un gen de resistencia a antibióticos betalactámicos (penicilinas, cefalosporinas, carbapenemas y monobactamos) en el visualizador [GeneiousBasic](http://www.geneious.com/download).
 19 | * Finalmente, haremos una exploración de la estructura tridimensional de una beta lactamasa y cómo su estructura es similar en diferentes organismos.   
 20 | 
 21 | Para empezar, descarguemos las reads tal cual son entregadas por el secuenciador ([aquí](https://www.dropbox.com/s/7gh1343s4yk0rsf/reads.zip?dl=0)). Alternativamente, las pueden encontrar en sus estaciones de trabajo (en el Escritorio en el directorio **CURSO_22SEPT**).
 22 | 
 23 | Exploremos la calidad de las reads en FastQC
 24 | 
 25 | ![fastqc](https://github.com/microgenomics/tutorials/raw/master/img/Screenshot%202016-09-21%2023.00.22.png)  
 26 | 
 27 | 		Explora el significado del gráfico y qué es lo que está expresado en el eje Y.  
 28 | 		¿Qué indica la información en "overrepresented sequences" o "Kmer content"?
 29 | 
 30 | En general, un investigador tomaría una decisión con respecto a cómo cortar y/o filtrar las reads que no se ajusten a algún estándar. Esta decisión es muchas veces arbitraria y depende de qué tan bien se siente el investigador trabajando sobre el conjunto de reads que pasaron el control de calidad.  
 31 | 
 32 | Ahora, necesitamos usar la línea de comandos para poder ensamblar este genoma. Observen que las reads vienen en dos archivos porque corresponden a `Paired-end reads`. Busquen en la Mac el ícono de la Terminal. Al ejecutar la Terminal deberán ver una ventana como la siguiente:  
 33 | 
 34 | ![terminal](https://github.com/microgenomics/tutorials/raw/master/img/term.png)
 35 | 
 36 | Si no están familiarizados con la Terminal, les recomiendo revisar las lecciones del sitio web de [Data Carpentry](http://www.datacarpentry.org), en particular la lección sobre el [Shell](http://www.datacarpentry.org/shell-genomics/lessons/01_the_filesystem.html).
 37 | 
 38 | Ahora, al escribir `spades.py` debería ejecutarse el ensamblador SPAdes y mostrarnos el menú de ayuda:
 39 | 
 40 | ![spades](https://github.com/microgenomics/tutorials/raw/master/img/spades.png)
 41 | 
 42 | Inspeccionemos las opciones. Como mínimo, necesitamos entregarle el nombre y ubicación de los archivos de entrada (las reads), dónde dejar los resultados (output directory) y cómo proceder con el ensamblaje.
 43 | 
 44 | 		spades.py -o ensamblaje -1 Escherichia_coli_GW_UTI_007_TCTCTTCA-TAACGCTG_L005_R1_001.fastq -2 Escherichia_coli_GW_UTI_007_TCTCTTCA-TAACGCTG_L005_R2_001.fastq -t 8 -m 4 --cov-cutoff auto
 45 | 
 46 | El ensamblaje en total debería tomar aproximadamente 5 a 10 minutos. El archivo resultante que nos interesa se llama `contigs.fasta` el cual es un archivo de texto plano en [formato fasta](https://en.wikipedia.org/wiki/FASTA_format) que contiene secuencias contiguas que fueron formadas a partir de sobreponer las reads unas con otras.
 47 | 
 48 | Veamos cuántos contigs se formaron y cuál fue la longitud total del genoma ensamblado. Vamos a la página de [QUAST](http://quast.bioinf.spbau.ru) y subamos nuestro archivo `contigs.fasta`.
 49 | 
 50 | ![quast](https://github.com/microgenomics/tutorials/raw/master/img/quast.png)
 51 | 
 52 | La ejecución de QUAST debería tomar 10 minutos aproximadamente. Una vez finalizado, inspecciona la página de resultados.
 53 | 
 54 | 		¿Qué es el N50?
 55 | 		¿Cuál es la longitud de nuestro genoma?
 56 | 		¿Cuántos genes tiene en comparación con la referencia? ¿Su contenido GC?
 57 | 
 58 | Una vez finalizado QUAST y la inspección de los resultados, podemos continuar con la anotación del genoma. Anotar un genoma tiene que ver con identificar dónde están los genes, qué hacen, y en qué proceso metabólico están involucrados. Vamos a usar un programa que usa una combinación de búsquedas por similitud y *ab initio*. El programa se llama [Prokka](https://github.com/tseemann/prokka) y fue desarrollado por un investigador australiano, [Torsten Seemann](https://twitter.com/torstenseemann).
 59 | 
 60 | Desde la Terminal ejecutemos Prokka al escribir `prokka`
 61 | 
 62 | ![prokka](https://github.com/microgenomics/tutorials/raw/master/img/prokkaterm.png)
 63 | 
 64 | Al igual que con SPAdes, necesitamos indicarle a prokka cómo procesar nuestro archivo con contigs, qué bases de datos usar, y cómo realizar los cálculos.
 65 | 
 66 | 		prokka --outdir anotación --prefix EColi --addgenes --locustag Ecoli --genus Escherichia --species coli --kingdom Bacteria --gram neg --cpus 16 --evalue 1e-5 contigs.fasta
 67 | 
 68 | El proceso completo debería tomar 10 minutos aproximadamente. Una vez terminado, Prokka va a haber generado 11 archivos de salida, e.g., .gbk, .fna, .faa, .ffn, etc.
 69 | 
 70 | 		¿Qué información contienen estos archivos?
 71 | 
 72 | Ahora carguemos el archivo .gbk en Geneious. Deberíamos ver algo como la siguiente imagen:
 73 | 
 74 | ![geneious](https://github.com/microgenomics/tutorials/raw/master/img/geneious.png)
 75 | 
 76 | Geneious es una herramienta muy poderosa donde podemos explorar de manera gráfica los resultados del proceso de anotación genómica. Exploremos de manera libre las herramientas y opciones que provee Geneious. En la esquina superior derecha pueden escoger e ir navegando contig por contig y explorando las anotaciones que fueron agregadas a la secuencia de DNA.
 77 | 
 78 | Vayamos a la pestaña de Annotations y seleccionemos All Sequences en la esquina superior izquierda. Luego podemos buscar anotaciones específicas a través de todos los contigs. Probemos con "Lactam"
 79 | 
 80 | ![lactam](https://github.com/microgenomics/tutorials/raw/master/img/lactam.png)
 81 | 
 82 | En el contig 74 (NODE_74) hay una secuencia codificante *bla 1 CDS* de 876 nucleótidos de longitud que está codificada en la posición Reverse de la secuencia. Seleccionemos este resultado y vayamos a Sequence View de nuevo.
 83 | 
 84 | ![lactam2](https://github.com/microgenomics/tutorials/raw/master/img/lactam2.png)
 85 | 
 86 | El producto génico de esta CDS es una "Beta-lactamase OXA-1 precursor". Vamos a la sección Structure y busquemos las palabras clave "Beta-lactamase OXA-1".
 87 | 
 88 | ![struc](https://github.com/microgenomics/tutorials/raw/master/img/struc.png)
 89 | 
 90 | La búsqueda nos retorna cinco resultados. Si hacemos clic en alguno, podemos acceder a la estructura cristalina de la proteína codificada por el gen.
 91 | 
 92 | 		¿Qué otra especie posee esta proteína?
 93 | 		¿Cuántas cadenas tiene la proteína?
 94 | 		¿Cuál es el sitio activo?
 95 | 
 96 | ## Referencias
 97 | 
 98 | Seemann T.  
 99 | *Prokka: rapid prokaryotic genome annotation*  
100 | **Bioinformatics** 2014 Jul 15;30(14):2068-9.   
101 | [PMID:24642063](http://www.ncbi.nlm.nih.gov/pubmed/24642063)  
102 | 
103 | Bankevich, A. et al.  
104 | *SPAdes: a new genome assembly algorithm and its applications to single-cell sequencing*  
105 | **Journal of Computational Biology** 2012 19.5 (2012): 455-477.  
106 | [PMID:22506599](https://www.ncbi.nlm.nih.gov/pubmed/22506599)  
107 | 
108 | Gurevich, A et al.  
109 | *QUAST: quality assessment tool for genome assemblies*  
110 | **Bioinformatics** 29.8 (2013): 1072-1075  
111 | [PMID:23422339](https://www.ncbi.nlm.nih.gov/pubmed/23422339)  
112 | 
113 | Kearse, M, et al.  
114 | *Geneious Basic: an integrated and extendable desktop software platform for the organization and analysis of sequence data*  
115 | **Bioinformatics** 28.12 (2012): 1647-1649.  
116 | [PMID:22543367](https://www.ncbi.nlm.nih.gov/pubmed/22543367)  
117 | 
118 | **FastQC**
119 | [http://www.bioinformatics.babraham.ac.uk/projects/fastqc/](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/)
120 | 


--------------------------------------------------------------------------------
/pangenome.md:
--------------------------------------------------------------------------------
  1 | ![banner](https://raw.githubusercontent.com/microgenomics/tutorials/master/img/microgenomics.png)
  2 | # Genome annotation and Pangenome analysis
  3 | -------------------------
  4 | #### In this demo we will explore how to determine a pangenome from a collection of isolate genome sequences in fasta format
  5 | 
  6 | This demo relies on two pieces of software, *Prokka* and *Roary*, so please remember to cite them if you end up publishing results obtained with these tools
  7 | 
  8 | ## Obtaining data
  9 | 
 10 | For details on obtaining Prokka and Roary, please visit their GitHub repos [here](https://github.com/tseemann/prokka/blob/master/README.md) and [here](https://github.com/sanger-pathogens/Roary/blob/master/README.md).
 11 | 
 12 | Assuming you have Prokka and Roary installed and in your PATH variable, go ahead and download the six *Listeria monocytogenes* genomes we are going to use for this demo. From the Terminal:
 13 | 		
 14 | 		wget https://raw.githubusercontent.com/CBIBUNAB/tutorials/master/genomes/GCA_000008285.1_ASM828v1_genomic.fna
 15 | 		wget https://raw.githubusercontent.com/CBIBUNAB/tutorials/master/genomes/GCA_000021185.1_ASM2118v1_genomic.fna
 16 | 		wget https://raw.githubusercontent.com/CBIBUNAB/tutorials/master/genomes/GCA_000026705.1_ASM2670v1_genomic.fna
 17 | 		wget https://raw.githubusercontent.com/CBIBUNAB/tutorials/master/genomes/GCA_000196035.1_ASM19603v1_genomic.fna
 18 | 		wget https://raw.githubusercontent.com/CBIBUNAB/tutorials/master/genomes/GCA_000168635.2_ASM16863v2_genomic.fna
 19 | 		wget https://raw.githubusercontent.com/CBIBUNAB/tutorials/master/genomes/GCA_000168815.1_ASM16881v1_genomic.fna
 20 | 
 21 | You should get something like the following:
 22 | 
 23 | ![genomes](https://raw.githubusercontent.com/microgenomics/tutorials/master/img/genomes.png)
 24 | 
 25 | 
 26 | These genomes correspond to isolates of *L. monocytogenes* reported in *Probing the pan-genome of Listeria monocytogenes: new insights into intraspecific niche expansion and genomic diversification* [PMID: 20846431](http://www.ncbi.nlm.nih.gov/pubmed/?term=20846431).
 27 | 
 28 | We selected the following six genomes based on their level of completeness (finished; contigs, etc) and their genotype (type I-III)
 29 | 
 30 | | Genome Assembly | Genome Accession |  Genotype  | Sequenced by | Status|
 31 | |:------------- | 	--------------- 	| -------------| ------------ | ------------ |
 32 | | GCA_000026705	| 	FM242711			| type I      | Institut_Pasteur| Finished|
 33 | | GCA_000008285	| 	AE017262			| type I      | TIGR| Finished|
 34 | | GCA_000168815	| 	AATL00000000		| type I      | Broad Institute| 79 contigs|
 35 | | GCA_000196035 |	AL591824			| type II     | European Consortium| Finished|
 36 | | GCA_000168635	| 	AARW00000000		| type II     | Broad Institute | 25 contigs|
 37 | | GCA_000021185	| 	CP001175			| type III    | MSU| Finished|
 38 | 
 39 | ## Annotating genomes
 40 | 
 41 | By annotating the genomes we mean adding information regarding genes, their location, strandedness, and other features and attributes. Now that you have the genomes, we need to annotate them to determine the location and attributes of the genes contained in them. We will use Prokka because it's extremely fast and it performs well, and also because the *features* file that it produces (GFF3) is compatible with Roary.
 42 | 
 43 | 		prokka --kingdom Bacteria --outdir prokka_GCA_000008285 --genus Listeria --locustag GCA_000008285 GCA_000008285.1_ASM828v1_genomic.fna
 44 | 
 45 | Make sure you annotate the six genomes by replacing the `--outdir` and `--locustag` values and the `fasta file` accordingly. It should take ~ 4 minutes per genome on a standard laptop computer.
 46 | 
 47 | You should end up with 11 files including a .gff file.
 48 | 
 49 | I'm copying a description of the output files from the Prokka documentation here, but please check with the developers for in-depth documentation.
 50 | 
 51 | ### Output Files
 52 | 
 53 | | Extension | Description |
 54 | | --------- | ----------- |
 55 | | .gff | This is the master annotation in GFF3 format, containing both sequences and annotations. It can be viewed directly in Artemis or IGV. |
 56 | | .gbk | This is a standard Genbank file derived from the master .gff. If the input to prokka was a multi-FASTA, then this will be a multi-Genbank, with one record for each sequence. |
 57 | | .fna | Nucleotide FASTA file of the input contig sequences. |
 58 | | .faa | Protein FASTA file of the translated CDS sequences. |
 59 | | .ffn | Nucleotide FASTA file of all the annotated sequences, not just CDS. |
 60 | | .sqn | An ASN1 format "Sequin" file for submission to Genbank. It needs to be edited to set the correct taxonomy, authors, related publication etc. |
 61 | | .fsa | Nucleotide FASTA file of the input contig sequences, used by "tbl2asn" to create the .sqn file. It is mostly the same as the .fna file, but with extra Sequin tags in the sequence description lines. |
 62 | | .tbl | Feature Table file, used by "tbl2asn" to create the .sqn file. |
 63 | | .err | Unacceptable annotations - the NCBI discrepancy report. |
 64 | | .log | Contains all the output that Prokka produced during its run. This is a record of what settings you used, even if the --quiet option was enabled. |
 65 | | .txt | Statistics relating to the annotated features found. |
 66 | 
 67 | GFF files are the input for Roary to compute the pangenome and contain all the annotations plus the genome sequence in fasta format appended at the end.
 68 | 
 69 | ## Determining the pangenome
 70 | 
 71 | Let's put all the .gff files in the same folder (e.g., `./gff`) and run *Roary*
 72 | 		
 73 | 		roary -f ./demo -e -n -v ./gff/*.gff
 74 | 
 75 | Roary will get all the coding sequences, convert them into protein, and create pre-clusters. Then, using BLASTP and MCL, *Roary* will create clusters, and check for paralogs. Finally, *Roary* will take every isolate and order them by presence/absence of orthologs. The summary output is present in the `summary_statistics.txt` file. In our case, the results are as follows:
 76 | 
 77 | Genes| Number
 78 | |----|-------|
 79 | |Core genes (99% <= strains <= 100%)|	2010|
 80 | |Soft core genes (95% <= strains < 99%)| 0|
 81 | |Shell genes (15% <= strains < 95%)| 2454|
 82 | |Cloud genes (0% <= strains < 15%)|	0|
 83 | |Total genes|	4464|
 84 | 
 85 | Additionally, *Roary* produces a `gene_presence_absence.csv` file that can be opened in any spreadsheet software to manually explore the results. In this file, you will find information such as gene name and gene annotation, and, of course, whether a gene is present in a genome or not.
 86 | 
 87 | We already have a phylogeny that represents the evolutionary history of these six isolates, where they form clades according to their genotype, i.e., type I isolates together, and so on.
 88 | 
 89 | ![phylogeny](https://raw.githubusercontent.com/microgenomics/tutorials/master/img/core_gene_alignment.tre.png)
 90 | 
 91 | When you analyze your own data, you will need a phylogeny that represents the evolutionary history of your isolates. The inference of a phylogenetic tree is not part of roary's functions, but you can use the core gene alignment (file: `core_gene_alignment.aln`) as input to infer a tree.
 92 | 
 93 | *Roary* comes with a python script that allows you to generate a few plots to graphically assess your analysis output. You will need the `gene_presence_absence.csv` file and the phylogeny, which you can get from [here](https://github.com/microgenomics/tutorials/blob/master/core_gene_alignment.nwk). Then try using the following command:
 94 | 
 95 | 		python roary_plots.py core_gene_alignment.nwk gene_presence_absence.csv
 96 | 
 97 | You should get three files: a pangenome matrix, a frequency plot, and a pie chart. 
 98 | 
 99 | ![matrix](https://raw.githubusercontent.com/microgenomics/tutorials/master/img/pangenome_matrix.png)
100 | ![frequency](https://raw.githubusercontent.com/microgenomics/tutorials/master/img/pangenome_frequency.png)
101 | ![pie](https://raw.githubusercontent.com/microgenomics/tutorials/master/img/pangenome_pie.png)
102 | 
103 | ## Pangenome sequence analysis
104 | We have now completed the genome annotation and pangenome analysis, but if you want to know the sequence of a particular gene in the pangenome, you have to search for it yourself in the .ffn files. To avoid this inconvenience, Enzo Guerrero-Araya wrote a Python 3 script that generates csv files of all loci in the pangenome. The csv files can be imported into a database such as SQLite3.
105 | 
106 | Let's put all the .ffn files in the same folder (e.g., `./ffn`) and run [*DBGenerator.py*](https://github.com/eandree/tutorials/blob/patch-1/DBGenerator.py) in the same directory where the `gene_presence_absence.csv` file is located.
107 | 		
108 | 	python3 DBGenerator.py ffn
109 | 
110 | The script in this version will generate 4 csv files:
111 | 
112 | Files| Description
113 | |---|---|
114 | |genomas_locus.csv|It contains 2 columns [name of genome, name of locus]|
115 | |pangenoma.csv|It contains all the information of the annotation that Roary reported in the `gene_presence_absence.csv` file|
116 | |pangenoma_locus.csv|It contains 2 columns [name of gene, name of locus]|
117 | |locus_sequence.csv|It contains 2 columns [name of locus, nucleotide sequence]|
118 | 
119 | Now we have all the csv files needed to build our own database. In the terminal, type:
120 | 
121 | 	sqlite3 database.sqlite
122 | 
123 | In the sqlite3 prompt run:
124 | 	
125 | 	create table genomas_locus (cod text, locus text);
126 | 	create table pangenoma (gene text, non_unique_gene_name text, annotation text, no_isolates integer, no_sequences integer, avg_sequences_per_isolate integer, genome_fragment integer, order_within_fragment integer, accessory_fragment integer, accessory_order_with_fragment integer, qc text, min_group_size_nuc integer, max_group_size_nuc integer, avg_group_size_nuc integer);
127 | 	create table pangenoma_locus (gene text, locus text);
128 | 	create table locus_sequence (locus text, sequence text);
129 | 	
130 | 	.separator '|'
131 | 	.import genomas_locus.csv genomas_locus
132 | 	.import pangenoma.csv pangenoma
133 | 	.import pangenoma_locus.csv pangenoma_locus
134 | 	.import locus_sequence.csv locus_sequence
135 | 	
136 | 	create index genomas_locus_index on genomas_locus(cod, locus);
137 | 	create index pangenoma_index on pangenoma(gene, non_unique_gene_name, annotation, no_isolates, no_sequences, avg_sequences_per_isolate, genome_fragment, order_within_fragment, accessory_fragment, accessory_order_with_fragment, qc, min_group_size_nuc, max_group_size_nuc, avg_group_size_nuc);
138 | 	create index pangenoma_locus_index on pangenoma_locus(gene, locus);
139 | 	create index locus_sequence_index on locus_sequence(locus, sequence);
140 | 
141 | Now we just have to join the tables with an SQL query like:
142 | 
143 | 	select '>'|| cod || '|' || locus_sequence.locus || '|' || pangenoma.gene || x'0a' || sequence
144 | 	from locus_sequence
145 | 	inner join pangenoma_locus on locus_sequence.locus = pangenoma_locus.locus
146 | 	inner join pangenoma on pangenoma_locus.gene = pangenoma.gene
147 | 	inner join genomas_locus on locus_sequence.locus = genomas_locus.locus
148 | 	where pangenoma.gene = 'tetC';
149 | 	
150 | 	>GCA_000008285_01152016|GCA_000008285_02480|tetC
151 | 	ATGGAAAAGAAGCGGACTCGGGCAGAAGAATTAGGAATAACTAGAAGAAAAATTTTGGATACAGCACGTGATTTATTTATGGAAAAGGGTTACCGGGCAGTTTCAACAAGAGAAATAGCTAAAATTGCCAACATTACCCAACCGGCACTATATCACCACTTTGAAGATAAAGAATCCCTATATATTGAAGTGGTTCGTGAATTGACGCAAAATATCCAAGTGGAAATGCATCCAATTATGCAAGTGACCAAAGCAAAAGAAGAACAATTACATGATATGTTAATTATGTTAATTGAGGAACATCCAACCAATATTCTATTAATGATTCACGATATTCTTAATGAAATGAAACCAGAAAATCAATTTTTACTTTATAAATTATGGCAAAAAACCTATTTGGAACCATTTCAACTATTTTTTGAGCGTCTAGAAAATGCTGGCGAATTGCGTGATGGTGTCAGTGCTGAGACTGCTGCGAGATACTGTTTGTCCACTATTAGCCCTCTTTTTTCTGGGAAAGGCAGCTTTGCGCAAAAGCAAACGACTACAGAACAAATTGATGAATTAATCAACTTAATGATGTTTGGTATATGTAAAAAAGAGGTATAA
152 | 	>GCA_000021185_01152016|GCA_000021185_00131|tetC
153 | 	ATGGAAAAGAAGCGGACTCGAGCAGAAGAATTAGGAATAACCAGAAGGAAAATCCTTGATACAGCAAGGGATTTATTTATGGAAAAAGGGTACCGGGCAGTCTCGACAAGAGAAATTGCTAAAATTGCCAAAATTACCCAACCAGCACTTTATCACCATTTTGAAGATAAAGAATCACTTTATATTGAAGTAGTTCGTGAATTGACGCAAAATATTCAAGTGGAAATGCACCCAATTATGCAAACGAGCAAAGCAAAAGAAGAACAACTGCATGATATGTTAATCATGTTAATTGAGGAGCATCCAACCAATATTCTGCTAATGATTCATGATATTCTTAATGAAATGAAGCCAGAAAATCAATTTTTACTTTATAAATTGTGGCAAAAAACCTATTTAGAACCATTTCAAGACTTTTTTGAGCGATTAGAAAATGCTGGCGAATTGCGTGATGGTATCAGTGCTGAGACCGCTGCGAGATACTGTTTATCCACTATTAGCCCGCTTTTTTCAGGGAAAGGTAGCTTTGCGCAAAAGCAAACGACTACAGAACAAATCGATGAATTAATCAACTTAATGATGTTTGGCATATGTAAAAAAGAGGTATAA
154 | 	>GCA_000026705_01152016|GCA_000026705_02479|tetC
155 | 	ATGGAAAAGAAGCGGACTCGGGCAGAAGAATTAGGAATAACTAGAAGAAAAATTTTGGATACAGCACGTGATTTATTTATGGAAAAGGGTTACCGGGCAGTTTCAACAAGAGAAATAGCTAAAATTGCCAACATTACCCAACCGGCACTATATCACCACTTTGAAGATAAAGAATCCCTATATATTGAAGTGGTTCGTGAATTGACGCAAAATATCCAAGTGGAAATGCATCCAATTATGCAAGTGACCAAAGCAAAAGAAGAACAATTACATGATATGTTAATTATGTTAATTGAGGAACATCCAACCAATATTCTATTAATGATTCACGATATTCTTAATGAAATGAAACCAGAAAATCAATTTTTACTTTATAAATTATGGCAAAAAACCTATTTGGAACCATTTCAACTATTTTTTGAGCGTCTAGAAAATGCTGGCGAATTGCGTGATGGTGTCAGTGCTGAGACTGCTGCGAGATACTGTTTGTCCACTATTAGCCCTCTTTTTTCTGGGAAAGGCAGCTTTGCGCAAAAGCAAACGACTACAGAACAAATTGATGAATTAATCAACTTAATGATGTTTGGTATATGTAAAAAAGAGGTATAA
156 | 	>GCA_000168635_01152016|GCA_000168635_02549|tetC
157 | 	ATGGAAAAGAAGCGGACTCGAGCAGAAGAATTAGGAATAACTAGAAGAAAAATTTTGGATACAGCACGTGATTTATTTATGGAAAAGGGTTACCGGGCAGTTTCTACAAGAGAAATAGCTAAAATTGCTAACATTACCCAACCGGCACTTTATCATCACTTTGAAGATAAAGAATCCCTATATATTGAAGTGGTTCGTGAATTGACGCAAAATATCCAGGTGGAAATGCATCCAATTATGCAAACGAACAAAGCAAAAGAAGAACAATTACATGATATGTTAATTATGTTAATTGAGGAACATCCCACCAATATTCTATTAATGATTCACGATATTCTTAATGAAATGAAACCAGAGAATCAATTTTTACTTTATAAATTATGGCAAAAAACCTATTTAGAACCATTTCAACAATTTTTTGAGCGTCTAGAAAATGCTGGTGAATTGCGTAATGGTATCAGTGCTGAGACCGCTGCAAGATACTGTTTGTCCACTATTAGCCCTCTTTTTTCAGGGAAAGGTAGCTTTGCGCAAAAGCAAACGACTACAGAACAAATCGATGAATTAATCAACTTAATGATGTTTGGCATATGTAAAAAAGAGGTATAA
158 | 	>GCA_000168815_01152016|GCA_000168815_01572|tetC
159 | 	ATGGAAAAGAAGCGGACTCGGGCAGAAGAATTAGGAATAACTAGAAGAAAAATTTTGGATACAGCACGTGATTTATTTATGGAAAAGGGTTACCGGGCAGTTTCAACAAGAGAAATAGCTAAAATTGCCAACATTACCCAACCGGCACTATATCACCACTTTGAAGATAAAGAATCCCTATATATTGAAGTGGTTCGTGAATTGACGCAAAATATCCAAGTGGAAATGCATCCAATTATGCAAGTGACCAAAGCAAAAGAAGAACAATTACATGATATGTTAATTATGTTAATTGAGGAACATCCAACCAATATTCTATTAATGATTCACGATATTCTTAATGAAATGAAACCAGAAAATCAATTTTTACTTTATAAATTATGGCAAAAAACCTATTTGGAACCATTTCAACTATTTTTTGAGCGTCTAGAAAATGCTGGCGAATTGCGTGATGGTGTCAGTGCTGAGACTGCTGCGAGATACTGTTTGTCCACTATTAGCCCTCTTTTTTCTGGGAAAGGCAGCTTTGCGCAAAAGCAAACGACTACAGAACAAATTGATGAATTAATCAACTTAATGATGTTTGGTATATGTAAAAAAGAGGTATAA
160 | 	>GCA_000196035_01152016|GCA_000196035_02552|tetC
161 | 	ATGGAAAAGAAGCGGACTCGAGCAGAAGAATTAGGAATAACTAGAAGAAAAATTTTGGATACAGCACGTGATTTATTTATGGAAAAGGGTTACCGGGCAGTTTCTACAAGAGAAATAGCTAAAATTGCCAACATTACCCAACCGGCACTGTATCATCACTTTGAAGATAAAGAATCCCTATATATTGAAGTGGTTCGTGAATTGACGCAAAATATCCAGGTGGAAATGCATCCAATTATGCAAACGAACAAAGCAAAAGAAGAACAATTACATGATATGTTAATTATGTTAATTGAGGAACATCCCACCAATATTCTATTAATGATTCACGATATTCTTAATGAAATGAAACCAGAGAATCAATTTTTACTTTATAAATTATGGCAAAAAACCTATTTAGAACCATTTCAACAATTTTTTGAGCGTCTAGAAAATGCTGGTGAATTGCGTAATGGTATCAGTGCTGAGACCGCTGCAAGATACTGTTTGTCCACTATTAGCCCTCTTTTTTCAGGGAAAGGTAGCTTTGCGCAAAAGCAAACGACTACAGAACAAATCGATGAATTAATCAACTTAATGATGTTTGGCATATGTAAAAAAGAGGTATAA
162 | 
163 | And that's all: we get all the sequences of the tetC gene in fasta format.
164 |  
165 | 
166 | ## Citation
167 | 
168 | Seemann T.  
169 | *Prokka: rapid prokaryotic genome annotation*  
170 | **Bioinformatics** 2014 Jul 15;30(14):2068-9.   
171 | [PMID:24642063](http://www.ncbi.nlm.nih.gov/pubmed/24642063)  
172 | 
173 | Andrew J. Page, Carla A. Cummins, Martin Hunt, Vanessa K. Wong, Sandra Reuter, Matthew T. G. Holden, Maria Fookes, Daniel Falush, Jacqueline A. Keane, Julian Parkhill.   
174 | *Roary: Rapid large-scale prokaryote pan genome analysis*  
175 | **Bioinformatics** 2015 Jul 20. pii: btv421  
176 | [PMID: 26198102](http://www.ncbi.nlm.nih.gov/pubmed/26198102)
177 | 
178 | 


--------------------------------------------------------------------------------
/sra.md:
--------------------------------------------------------------------------------
 1 | ![banner](https://raw.githubusercontent.com/microgenomics/tutorials/master/img/microgenomics.png)
 2 | 
 3 | # SRA Tutorial
 4 | -------------------------
 5 | 
 6 | 
 7 | ### Get SRA toolkit
 8 | We need to first download SRA toolkit from NCBI's website:
 9 | 
10 | http://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?view=software
11 | 
12 | Select your appropriate binary distribution. For most modern Macs, MacOS 64 bit architecture would do the trick. The resulting file is ~ 51.5 mb
13 | 
14 | ![toolkit](https://raw.githubusercontent.com/microgenomics/tutorials/master/img/sra02.png)
15 | 
16 | Double-click on sratoolkit.2.5.7-mac64.tar.gz to uncompress the file. Now you need to open a Terminal window (within Applications/Utilities/Terminal). 
17 | 
18 | ![term](https://raw.githubusercontent.com/microgenomics/tutorials/master/img/sra04.png)
19 | 
20 | You should see a window like the following:
21 | 
22 | ![term](https://raw.githubusercontent.com/microgenomics/tutorials/master/img/sra022.png)
23 | 
24 | Now you need to "navigate" to the SRA toolkit folder by issuing the following command
25 | 
26 | 	cd /Users/Ed/Downloads/sratoolkit.2.5.7-mac64/bin
27 | 
28 | `cd`  indicates that you want to change directory and the rest is simply the path to the destination directory. Make sure to replace `Ed` in the path above by your own home directory name. To confirm you are where you are supposed to be, type `pwd`. The Terminal should return your current location, i.e., `/Users/Ed/Downloads/sratoolkit.2.5.7-mac64/bin`
29 | Lastly, type `./fastq-dump -h` to see the help menu.
30 | 
31 | ![commands](https://raw.githubusercontent.com/microgenomics/tutorials/master/img/sra03.png)
32 | 
33 | ### Get SRA data
34 | 
35 | Next, we need to download actual data. We will use a somewhat small file hosted at SRA under the accession `SRR3171211`. Issue the following command in order to download the data:
36 | 
37 | 	./fastq-dump --accession SRR3171211 --outdir my_outdir
38 | where `SRR3171211` is the file we want and `my_outdir` is simply an arbitrary name for the output directory
39 | 
40 | ![dump](https://raw.githubusercontent.com/microgenomics/tutorials/master/img/sra05.png)
41 | 
42 | and voilà! You should see a file named `SRR3171211.fastq` inside the output directory (takes about 10 minutes; 111.5 mb)
43 | 
44 | ![file](https://raw.githubusercontent.com/microgenomics/tutorials/master/img/sra06.png)
45 | 


--------------------------------------------------------------------------------