├── .gitattributes
├── .gitignore
├── 16S
    ├── 16S.pdf
    ├── README.md
    └── images
    │   ├── 16Sgraph.png
    │   ├── banner.png
    │   ├── calypso_data_summary.png
    │   ├── calypso_start.png
    │   ├── copyrighter.png
    │   ├── dada_provenance.png
    │   ├── data_filtering.png
    │   ├── filter_normalize.png
    │   ├── next.png
    │   ├── normalization_tfm.png
    │   ├── rep_seqs.png
    │   ├── seq_tree.png
    │   ├── successful_normalize.png
    │   ├── successful_parse.png
    │   ├── successful_parsed_counts.png
    │   ├── successful_qiime.png
    │   ├── table_summary.png
    │   ├── upload_optional_files.png
    │   └── upload_own_data.png
├── ANVIO
    ├── ANVIO.pdf
    ├── README.md
    ├── images
    │   ├── aws_rules.png
    │   ├── inbound.png
    │   └── outbound.png
    ├── security.md
    └── security.pdf
├── AnnotationPipelines
    ├── AnnotationPipelines.pdf
    ├── README.md
    └── images
    │   ├── new_folder.png
    │   ├── rast_chooser.png
    │   ├── rast_job_info.png
    │   ├── rast_pipeline.png
    │   └── rast_summ_stats.png
├── AntibioticResistance
    └── images
    │   └── resistance.png
├── Assignments
    ├── Assignments.pdf
    ├── GenomicsAssignment
    │   ├── README.md
    │   └── klebsiella.txt
    ├── MetagenomicsAssignment
    │   └── README.md
    ├── NCBIEDirectAssignment
    │   ├── README.md
    │   └── genera.txt
    └── README.md
├── CITATION.cff
├── CheckM
    ├── CheckM.pdf
    ├── README.md
    └── images
    │   └── checkm.png
├── Conda
    └── README.md
├── CrossAssembly
    ├── CCOM.md
    ├── CCOM.pdf
    ├── CrossAssembly.pdf
    ├── Metabat.md
    ├── Metabat.pdf
    ├── README.md
    └── images
    │   ├── CCOM.png
    │   ├── metabat.png
    │   ├── nineteencontigs.png
    │   └── spades.png
├── Databases
    ├── Databases.pdf
    ├── NCBI_Edirect.md
    ├── NCBI_Edirect.pdf
    ├── README.md
    ├── SRA.md
    ├── SRA.pdf
    ├── SRA_WGS_Abstracts.tsv
    └── images
    │   ├── 1772048.png
    │   ├── DDBJ.png
    │   ├── EMBL.png
    │   ├── GenBankGrowth.png
    │   ├── GroundwaterSize.png
    │   ├── JGI.jpg
    │   ├── JGI.png
    │   ├── Mappers.png
    │   ├── NCBI.png
    │   ├── NCBIEDirect.png
    │   ├── TARASize.png
    │   ├── UniProt.png
    │   ├── groundwater.png
    │   ├── kegg.png
    │   ├── pdb.png
    │   └── seed.png
├── Datasets
    ├── .gitattributes
    ├── CF
    │   ├── 788707_20171213_S_R1.fastq.gz
    │   ├── 788707_20171213_S_R2.fastq.gz
    │   ├── 788707_20180129_S_R1.fastq.gz
    │   ├── 788707_20180129_S_R2.fastq.gz
    │   ├── 788707_20180313_S_R1.fastq.gz
    │   ├── 788707_20180313_S_R2.fastq.gz
    │   ├── 788707_20181126_S_R1.fastq.gz
    │   ├── 788707_20181126_S_R2.fastq.gz
    │   └── README.md
    ├── Datasets.pdf
    ├── README.md
    ├── coral_algae
    │   ├── Algae.tgz
    │   ├── CCA.tgz
    │   ├── Control.tgz
    │   ├── Coral.tgz
    │   ├── README.md
    │   └── fastq
    │   │   ├── Algae_11.fastq.gz
    │   │   ├── Algae_12.fastq.gz
    │   │   ├── Algae_13.fastq.gz
    │   │   ├── Algae_14.fastq.gz
    │   │   ├── CCA_11.fastq.gz
    │   │   ├── CCA_12.fastq.gz
    │   │   ├── CCA_13.fastq.gz
    │   │   ├── Control_11.fastq.gz
    │   │   ├── Control_12.fastq.gz
    │   │   ├── Control_13.fastq.gz
    │   │   ├── Control_14.fastq.gz
    │   │   ├── Coral_11.fastq.gz
    │   │   ├── Coral_12.fastq.gz
    │   │   ├── Coral_13.fastq.gz
    │   │   └── Coral_14.fastq.gz
    ├── drinking_water
    │   ├── Calypso
    │   │   ├── README.md
    │   │   ├── feature-table.biom
    │   │   ├── metadata_calypso.tsv
    │   │   └── taxonomy.tsv
    │   ├── README.md
    │   ├── fastq
    │   │   ├── barcodes.fastq.gz
    │   │   └── sequences.fastq.gz
    │   └── metadata.tsv
    ├── ground_water
    │   ├── README.md
    │   └── fastq
    │   │   ├── SRR3546449_pass_1.fastq.gz
    │   │   ├── SRR3546449_pass_2.fastq.gz
    │   │   ├── SRR3546450_pass_1.fastq.gz
    │   │   ├── SRR3546450_pass_2.fastq.gz
    │   │   ├── SRR3546451_pass_1.fastq.gz
    │   │   ├── SRR3546451_pass_2.fastq.gz
    │   │   ├── SRR3546452_pass_1.fastq.gz
    │   │   ├── SRR3546452_pass_2.fastq.gz
    │   │   ├── SRR3546453_pass_1.fastq.gz
    │   │   ├── SRR3546453_pass_2.fastq.gz
    │   │   ├── SRR3546454_pass_1.fastq.gz
    │   │   ├── SRR3546454_pass_2.fastq.gz
    │   │   ├── SRR3546455_pass_1.fastq.gz
    │   │   ├── SRR3546455_pass_2.fastq.gz
    │   │   ├── SRR3546457_pass_1.fastq.gz
    │   │   └── SRR3546457_pass_2.fastq.gz
    └── gut
    │   ├── README.md
    │   └── fastq
    │       ├── SRR3466404_pass_1.fastq.gz
    │       ├── SRR3466404_pass_2.fastq.gz
    │       ├── SRR3506419_pass_1.fastq.gz
    │       ├── SRR3506419_pass_2.fastq.gz
    │       ├── SRR3506420_pass_1.fastq.gz
    │       ├── SRR3506420_pass_2.fastq.gz
    │       ├── SRR3546776_pass_1.fastq.gz
    │       ├── SRR3546776_pass_2.fastq.gz
    │       ├── SRR3546778_pass_1.fastq.gz
    │       ├── SRR3546778_pass_2.fastq.gz
    │       ├── SRR3546779_pass_1.fastq.gz
    │       ├── SRR3546779_pass_2.fastq.gz
    │       ├── SRR3546780_pass_1.fastq.gz
    │       ├── SRR3546780_pass_2.fastq.gz
    │       ├── SRR3546781_pass_1.fastq.gz
    │       ├── SRR3546781_pass_2.fastq.gz
    │       ├── SRR3546782_pass_1.fastq.gz
    │       └── SRR3546782_pass_2.fastq.gz
├── Deconseq
    └── README.md
├── Definitions.md
├── FOCUS
    ├── FOCUS.pdf
    ├── README.md
    └── images
    │   └── focus.jpg
├── GenomePeek
    ├── GenomePeek.pdf
    ├── README.md
    └── images
    │   ├── metagenome.png
    │   ├── mixed.png
    │   └── single.png
├── GenomeSequencingOverview
    ├── GenomeSequencingOverview.pdf
    ├── README.md
    ├── Whole_Genome_Sequencing.pdf
    └── images
    │   ├── GenBankGrowth.png
    │   ├── GenomesOnlineGrowth.png
    │   └── GenomicsAndModeling.png
├── Hecatomb
    └── README.md
├── Kraken2
    └── README.md
├── LICENSE
├── Linux
    ├── AWS_SAGC2021_Setup.sh
    ├── AWS_Setup.txt
    ├── Linux.pdf
    ├── MachineSetUp.md
    ├── MachineSetUp.pdf
    ├── README.md
    └── images
    │   ├── image1.png
    │   ├── image10.png
    │   ├── image11.png
    │   ├── image12.png
    │   ├── image13.png
    │   ├── image14.png
    │   ├── image15.png
    │   ├── image16.png
    │   ├── image17.png
    │   ├── image18.png
    │   ├── image19.jpg
    │   ├── image2.png
    │   ├── image20.png
    │   ├── image21.png
    │   ├── image3.png
    │   ├── image4.png
    │   ├── image5.png
    │   ├── image6.png
    │   ├── image7.png
    │   ├── image8.png
    │   ├── image9.png
    │   ├── mobaxterm_key.png
    │   └── mobaxterm_ssh.png
├── MMSeqs2
    └── README.md
├── Metagenomics
    ├── AnnotatingOrfMSeed.md
    ├── AnnotatingOrfMSeed.pdf
    ├── ExampleDataSets.md
    ├── ExampleDataSets.pdf
    ├── Metagenomics.pdf
    ├── README.md
    └── images
    │   ├── 1ng vs 100ng.png
    │   ├── BAC Cloning.png
    │   ├── CDA.png
    │   ├── CDA_Small.png
    │   ├── Earth Microbiome Project.png
    │   ├── Filters.png
    │   ├── Findley_PCA.png
    │   ├── Handelsman1.png
    │   ├── Handelsman2.png
    │   ├── RDP.png
    │   ├── SSU_RNApol.png
    │   ├── Silva.png
    │   ├── Staley_Konopka.png
    │   ├── Subsystems.png
    │   ├── VAMPS.png
    │   ├── Woese_Tree.png
    │   ├── abrolhos.png
    │   ├── acridine orange.png
    │   ├── plate counts.png
    │   ├── plate_count_anomaly.gif
    │   ├── qiita.png
    │   └── who_what_where.png
├── ORFCalling
    ├── ORFCalling.pdf
    ├── README.md
    └── images
    │   ├── blastx.png
    │   └── transcription_translation.png
├── PATRIC
    ├── GenomeSets.md
    ├── GenomeSets.pdf
    ├── PATRIC.pdf
    └── README.md
├── Python
    ├── .gitattributes
    ├── Bc01.fasta
    ├── Bc01.fasta.gz
    ├── BcSamples.tsv.gz
    ├── Python_Lesson_00.ipynb
    ├── Python_Lesson_01.ipynb
    ├── Python_Lesson_02.ipynb
    ├── Python_Lesson_03.ipynb
    ├── Python_Lesson_04.ipynb
    ├── Python_Lesson_05.ipynb
    ├── Python_Lesson_06.ipynb
    ├── Python_Lesson_07.ipynb
    ├── Python_Lesson_08.ipynb
    ├── Python_Lesson_09.ipynb
    ├── Python_Lesson_10.ipynb
    ├── Python_Lesson_11.ipynb
    ├── Python_Lesson_12.ipynb
    ├── README.md
    ├── Zen_of_python.md
    ├── countfasta.py
    ├── countfastq.py
    ├── metadata.tsv.gz
    ├── phylum.tsv.gz
    ├── requirements.txt
    └── tikkala.gbk.gz
├── README.md
├── RTMg
    ├── README.md
    ├── RTMg.pdf
    └── images
    │   └── sens.png
├── References
    ├── README.md
    └── References.pdf
├── RemovingContamination
    └── README.md
├── SUPER-FOCUS
    ├── README.md
    ├── SUPER-FOCUS.pdf
    └── images
    │   └── sf-logo.png
├── SequenceAssembly
    ├── N50.md
    ├── N50.pdf
    ├── README.md
    ├── SequenceAssembly.pdf
    └── images
    │   └── reads_contigs_scaffolds.png
├── SequenceFileFormats
    ├── INSDC_Features.md
    ├── INSDC_Features.pdf
    ├── README.md
    ├── SequenceFileFormats.pdf
    └── images
    │   ├── ascii.png
    │   ├── crAssphageDNA.png
    │   ├── crAssphageQuality.png
    │   ├── fastq.png
    │   └── prinseq.png
├── SequenceQC
    ├── IlluminaAdapters.fa
    ├── README.md
    ├── SequenceQC.pdf
    └── images
    │   ├── BadLength.png
    │   ├── GoodLength.png
    │   └── IonTorrentQual.gif
├── Sequencing
    ├── README.md
    ├── Sequencing.pdf
    └── images
    │   ├── GrowthSRA_SeqCost.png
    │   ├── SangerCrassphage.png
    │   ├── SangerCrassphageError.png
    │   ├── image1.png
    │   ├── image2.png
    │   ├── image3.png
    │   ├── image4.png
    │   ├── image5.png
    │   ├── image6.png
    │   ├── sanger_seq.png
    │   └── tagmentation.png
├── Snakemake
    ├── README.md
    ├── fastp.snakefile
    ├── filter_assemble.snakefile
    └── prinseq.snakefile
├── UPDATING.md
├── UPDATING.sh
├── UPGRADE.md
├── Workshops
    ├── COMBINE_QLD_2024.md
    ├── COMBINE_QLD_2024_binchicken.md
    ├── COMBINE_WA_2024.md
    ├── INRB2023.md
    ├── README.md
    ├── SAGC2021.md
    ├── Workshop_MAG_demo.ipynb
    ├── fastp_788707_20180129.html
    ├── images
    │   ├── bandage.png
    │   ├── k119_21323.allsamples.png
    │   ├── k119_21323.threesamples.png
    │   ├── megahit_bins.png
    │   └── metagenomics_map_uq_2024.png
    ├── makeing_machines.md
    └── snakemake
    │   ├── README.md
    │   ├── envs.zip
    │   ├── envs
    │       ├── focus.yaml
    │       ├── kraken.yaml
    │       ├── megahit.yaml
    │       ├── minimap.yaml
    │       ├── prinseq.yaml
    │       ├── samtools.yaml
    │       └── superfocus.yaml
    │   ├── process_metagenomesSAGC.snakefile
    │   └── process_metagenomesSAGC_conda.snakefile
├── _config.yml
└── tRNA_rRNA
    ├── README.md
    └── tRNA_rRNA.pdf


/.gitattributes:
--------------------------------------------------------------------------------
1 | *.tgz filter=lfs diff=lfs merge=lfs -text
2 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | *.egg-info/
 24 | .installed.cfg
 25 | *.egg
 26 | MANIFEST
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | .pytest_cache/
 49 | 
 50 | # Translations
 51 | *.mo
 52 | *.pot
 53 | 
 54 | # Django stuff:
 55 | *.log
 56 | local_settings.py
 57 | db.sqlite3
 58 | 
 59 | # Flask stuff:
 60 | instance/
 61 | .webassets-cache
 62 | 
 63 | # Scrapy stuff:
 64 | .scrapy
 65 | 
 66 | # Sphinx documentation
 67 | docs/_build/
 68 | 
 69 | # PyBuilder
 70 | target/
 71 | 
 72 | # Jupyter Notebook
 73 | .ipynb_checkpoints
 74 | 
 75 | # pyenv
 76 | .python-version
 77 | 
 78 | # celery beat schedule file
 79 | celerybeat-schedule
 80 | 
 81 | # SageMath parsed files
 82 | *.sage.py
 83 | 
 84 | # Environments
 85 | .env
 86 | .venv
 87 | env/
 88 | venv/
 89 | ENV/
 90 | env.bak/
 91 | venv.bak/
 92 | 
 93 | # Spyder project settings
 94 | .spyderproject
 95 | .spyproject
 96 | 
 97 | # Rope project settings
 98 | .ropeproject
 99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 
106 | # pycharm
107 | .idea/
108 | 


--------------------------------------------------------------------------------
/16S/16S.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/16S/16S.pdf


--------------------------------------------------------------------------------
/16S/images/16Sgraph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/16S/images/16Sgraph.png


--------------------------------------------------------------------------------
/16S/images/banner.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/16S/images/banner.png


--------------------------------------------------------------------------------
/16S/images/calypso_data_summary.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/16S/images/calypso_data_summary.png


--------------------------------------------------------------------------------
/16S/images/calypso_start.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/16S/images/calypso_start.png


--------------------------------------------------------------------------------
/16S/images/copyrighter.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/16S/images/copyrighter.png


--------------------------------------------------------------------------------
/16S/images/dada_provenance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/16S/images/dada_provenance.png


--------------------------------------------------------------------------------
/16S/images/data_filtering.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/16S/images/data_filtering.png


--------------------------------------------------------------------------------
/16S/images/filter_normalize.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/16S/images/filter_normalize.png


--------------------------------------------------------------------------------
/16S/images/next.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/16S/images/next.png


--------------------------------------------------------------------------------
/16S/images/normalization_tfm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/16S/images/normalization_tfm.png


--------------------------------------------------------------------------------
/16S/images/rep_seqs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/16S/images/rep_seqs.png


--------------------------------------------------------------------------------
/16S/images/seq_tree.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/16S/images/seq_tree.png


--------------------------------------------------------------------------------
/16S/images/successful_normalize.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/16S/images/successful_normalize.png


--------------------------------------------------------------------------------
/16S/images/successful_parse.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/16S/images/successful_parse.png


--------------------------------------------------------------------------------
/16S/images/successful_parsed_counts.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/16S/images/successful_parsed_counts.png


--------------------------------------------------------------------------------
/16S/images/successful_qiime.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/16S/images/successful_qiime.png


--------------------------------------------------------------------------------
/16S/images/table_summary.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/16S/images/table_summary.png


--------------------------------------------------------------------------------
/16S/images/upload_optional_files.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/16S/images/upload_optional_files.png


--------------------------------------------------------------------------------
/16S/images/upload_own_data.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/16S/images/upload_own_data.png


--------------------------------------------------------------------------------
/ANVIO/ANVIO.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/ANVIO/ANVIO.pdf


--------------------------------------------------------------------------------
/ANVIO/images/aws_rules.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/ANVIO/images/aws_rules.png


--------------------------------------------------------------------------------
/ANVIO/images/inbound.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/ANVIO/images/inbound.png


--------------------------------------------------------------------------------
/ANVIO/images/outbound.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/ANVIO/images/outbound.png


--------------------------------------------------------------------------------
/ANVIO/security.md:
--------------------------------------------------------------------------------
 1 | # Creating security rules for AWS instances.
 2 | 
 3 | AWS images are set up by default so that you can only access them via SSH. This is a security measure: you don't want malicious actors using all your resources. In order to access the instance to see, for example, results from anvi'o, we need to add a new set of security rules.
 4 | 
 5 | In the left hand pane of the AWS EC2 Dashboard, click **Security Groups** under **Network and Security**.
 6 | 
 7 | Next, click the blue **Create Security Group** button.
 8 | 
 9 | We need to add four inbound rules:
10 | 1. A rule for ports 8080-8090 to be open. anvi'o usually uses 8080, but that may increment to 8081, 8082, ... if the earlier ports are busy.
11 | 2. A rule for http (port 80). This is normal web traffic.
12 | 3. A rule for https (port 443). This is secure (encrypted) web traffic.
13 | 4. A rule to allow SSH - so that you can access the machine!
14 | 
15 | Create each of these rules so that your inbound tab looks like this:
16 | 
17 | ![](images/inbound.png "How the inbound tab should look").
18 | 
19 | *Note 1:* for http/https/ssh you can just choose them from the pull down menu on the left.
20 | 
21 | *Note 2:* In this example, I have opened the ports up to the world. This is **bad**. A much more secure alternative is to choose **My IP** from the second pull down menu (where it shows **Anywhere** in the example). This will restrict access to the server so that only connections from your current IP address are allowed. 
22 | 
23 | That is the most secure option, and you should choose **My IP**. However, if you move locations, you'll need to add another rule, just like this.
24 | 
25 | You don't need to change the *outbound* rules, you can leave them looking like:
26 | 
27 | ![](images/outbound.png "Outbound rules stay as default")
28 | 
29 | Click **Create** to create the rules.
30 | 
31 | ## Apply the rules to your instance.
32 | 
33 | Head to the **instances** overview in the dashboard and choose your instance so that the checkbox next to it goes blue.
34 | 
35 | From the **Actions** menu choose **Networking** &rarr; **Change Security Groups**. 
36 | 
37 | Choose your new security group and click apply.
38 | 
39 | Once anvi'o tells you that a server is listening, you should now be able to open your browser, type the IP address (that you can get from the instances overview) and see the visualization.


--------------------------------------------------------------------------------
/ANVIO/security.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/ANVIO/security.pdf


--------------------------------------------------------------------------------
/AnnotationPipelines/AnnotationPipelines.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/AnnotationPipelines/AnnotationPipelines.pdf


--------------------------------------------------------------------------------
/AnnotationPipelines/README.md:
--------------------------------------------------------------------------------
 1 | # Annotation Pipelines
 2 | 
 3 | There are a few robust and well designed microbial genome annotation pipelines that you can use to analyze your genome sequences. Each has its own benefits and drawbacks, and these may dictate which pipeline you end up using.
 4 | 
 5 | * [RAST](http://rast.nmpdr.org/)
 6 | * [PROKKA](https://github.com/tseemann/prokka)
 7 | * [PATRIC](https://patricbrc.org/)
 8 | * [NCBI PGAP](https://www.ncbi.nlm.nih.gov/genome/annotation_prok/)
 9 | 
10 | ## Creating an assembled genome to annotate
11 | 
12 | The same approach that we have talked about in other modules was used to generate a test dataset, namely, [downloading fastq data from SRA](../Databases/SRA) and then [assembling the data with spades](../SequenceAssembly/). To summarize, these are the commands that were used.
13 | 
14 | I downloaded the reads from [ERS012013](https://www.ncbi.nlm.nih.gov/sra/?term=ERS012013), which is part of Kat Holt's *Klebsiella* dataset (Project ID [PRJEB2111](https://www.ncbi.nlm.nih.gov/bioproject/PRJEB2111)), and assembled them using spades.
15 | 
16 | ```bash
17 | fastq-dump --outdir fastq --gzip --skip-technical  --readids --read-filter pass --dumpbase --split-3 --clip ERS012013
18 | spades.py -o assembly -1 fastq/ERS012013_pass_1.fastq.gz -2 fastq/ERS012013_pass_2.fastq.gz
19 | 
20 | ```
21 | 
22 | Statistic | Value
23 | --- | ---
24 | Number of sequences | 4271
25 | Total length | 8,716,579
26 | Shortest contig | 56
27 | Longest contig | 86,652
28 | N<sub>50</sub> | 11,152
29 | N<sub>75</sub> | 28,825
30 | 
31 | 
32 | In all the cases below we use the `scaffolds.fasta` output from spades for subsequent analysis.
33 | 
34 | ## Example annotation using RAST
35 | 
36 | Start at the [RAST website](https://rast.nmpdr.org/) and from `Your Jobs`  choose `Upload a new Job`. This opens up the file chooser page, and at the file chooser
37 | 
38 | ![file chooser](images/rast_chooser.png "RAST File Chooser")
39 | 
40 | select your `scaffolds.fasta` file. After that file is uploaded, you are presented with a summary of the contigs. Note that RAST may split some of the scaffolds that spades generated, and thus you may have slightly more contigs and slightly shorter sequence size, as shown here. The split happens on runs of `N` bases that spades inserts where it can estimate gaps between contigs based on sequence overlap.
41 | 
42 | ![RAST summary statistics](images/rast_summ_stats.png "RAST Summary Statistics. Note that the scaffolds have been separated into contigs again")
43 | 
44 | The bottom of this page asks for information about the organism you have sequenced. If you enter the taxid, as shown here, the form should populate with information from NCBI.
45 | 
46 | ![RAST Job Information](images/rast_job_info.png "RAST Job Information")
47 | 
48 | There are a series of questions about the annotation pipeline. Two recommended options are to build metabolic models and fix frameshifts, especially if you have a draft genome. Fixing frameshifts is controversial because some genomes (notably *Salmonella enterica serovar Typhi*) have a large number of frameshifts that are an evolutionary trait!
49 | 
50 | ![RAST Pipeline](images/rast_pipeline.png "RAST pipeline")
51 | 
52 | *Note*: at this stage you can also choose to customize some of the options for the RAST pipeline.
53 | 
54 | 
55 | ## Example annotation using PROKKA
56 | 
57 | *Note*: The [PROKKA GitHub Site](https://github.com/tseemann/prokka) contains many other recipes and advanced options for annotating the `scaffolds.fasta` file using PROKKA.
58 | 
59 | 
60 | 
61 | 
62 | ## Example submission using PATRIC
63 | 
64 | To annotate the contigs using PATRIC, I first go to the [PATRIC website](https://patricbrc.org/) and log in. If you don't have an account you will need to create one.
65 | 
66 | Create a new workspace called `Klebsiella` by clicking on the `Workspaces` menu and going to your `home` directory, and then clicking on the  ![new folder](images/new_folder.png "new folder icon") new folder icon on the top right.
67 | 
68 | Then use the `p3` commands to submit the `scaffolds.fasta` file for annotation as a genome. You will need to [follow these installation instructions](https://docs.patricbrc.org/cli_tutorial/cli_installation.html) to install the `p3` commands, and at the moment they do not provide a CentOS version so they are not included on the AWS instance.
69 | 
70 | Once you have installed `p3`, you will need to login:
71 | ```
72 | p3-login
73 | ```
74 | 
75 | and provide the same credentials that you use for the website.
76 | 
77 | For the command, we need to provide several variables:
78 | 
79 | Variable | Definition
80 | ---|---
81 | --contigs-file | the source of the contigs (probably scaffolds.fasta from spades output)
82 | -n | the name we want to use for our genome
83 | -t | the [NCBI Taxonomy ID](https://www.ncbi.nlm.nih.gov/taxonomy). For *Klebsiella pneumoniae* this is [573](https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=573). <br />This is used to ensure that the correct parameters are used for the annotation processes.
84 | -d | the domain (Bacteria, Archaea, Eukarya, or Virus)
85 | 
86 | Then we provide the workspace and the file name to call it in the workspace. 
87 | 
88 | ```bash
89 | p3-submit-genome-annotation --contigs-file scaffolds.fasta -n "Klebsiella pneumoniae NT211489B" -t 573 -d Bacteria /redwards@patricbrc.org/home/Klebsiella "Klebsiella pneumoniae NT211489B"
90 | ```
91 | 
92 | 
93 | 
94 | 
95 | 
96 | 
97 | 
98 | 
99 | 


--------------------------------------------------------------------------------
/AnnotationPipelines/images/new_folder.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/AnnotationPipelines/images/new_folder.png


--------------------------------------------------------------------------------
/AnnotationPipelines/images/rast_chooser.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/AnnotationPipelines/images/rast_chooser.png


--------------------------------------------------------------------------------
/AnnotationPipelines/images/rast_job_info.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/AnnotationPipelines/images/rast_job_info.png


--------------------------------------------------------------------------------
/AnnotationPipelines/images/rast_pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/AnnotationPipelines/images/rast_pipeline.png


--------------------------------------------------------------------------------
/AnnotationPipelines/images/rast_summ_stats.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/AnnotationPipelines/images/rast_summ_stats.png


--------------------------------------------------------------------------------
/AntibioticResistance/images/resistance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/AntibioticResistance/images/resistance.png


--------------------------------------------------------------------------------
/Assignments/Assignments.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Assignments/Assignments.pdf


--------------------------------------------------------------------------------
/Assignments/GenomicsAssignment/README.md:
--------------------------------------------------------------------------------
 1 | # Whole genome analysis assignments
 2 | 
 3 | This marks the beginning of several assignments where data will flow from one assignment to the other. We are going to study the genome of *Klebsiella pneumoniae*. This is one of the so-called [ESKAPE](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4871955/) pathogens that are a serious threat to public health because of the rise of multiple drug resistance.
 4 | 
 5 | The ESKAPE pathogens are:
 6 | 
 7 | + *Enterococcus faecium*
 8 | + *Staphylococcus aureus*
 9 | + *Klebsiella pneumoniae*
10 | + *Acinetobacter baumannii*
11 | + *Pseudomonas aeruginosa*
12 | + *Enterobacter* species
13 | 
14 | 
15 | In the next series of assignments, we will recapitulate some of the amazing work performed by Professor Kat Holt and colleagues who did a global analysis of the emergence of *Klebsiella* as a pathogen. 
16 | 
17 | This is a particularly nasty pathogen that causes problems worldwide. It has virulence genes (that enable it to cause disease) and antibiotic resistance genes (that make it harder to treat!), and they hoped to learn many of the differences in these genomes.
18 | 
19 | You can read more about their analysis in their open access paper published in [Proceedings of the National Academy of Sciences](http://www.pnas.org/content/early/2015/06/17/1501049112).
20 | 
21 | I particularly encourage you to read [this terrific blog post](https://holtlab.net/2015/06/23/population-genomics-of-klebsiella/) by Prof. Holt where she describes some of the key points of their work. During these assignments, we're going to try and recapitulate some of their findings!
22 | 
23 | Finally, the [metadata associated with the samples is available at microreact](https://microreact.org/project/VJdoJhfkx). You should take a look at that, as it will get you started with understanding the differences in the data.
24 | 
25 | ## Part 1. Download and assemble the data
26 | 
27 | You can find a complete list of the sequence accessions for these samples [here](klebsiella.txt). These IDs are from the [European Nucleotide Archive](https://www.ebi.ac.uk/ena) but you can also download them from the [NCBI sequence read archive](https://www.ncbi.nlm.nih.gov/sra/) which is what we will do here. (By the way, you will notice that we have trimmed out some of the sequences described in the [metadata](https://microreact.org/project/VJdoJhfkx) because they are already assembled! If you want to proceed with some of those, you can skip the assembly step.)
28 | 
29 | ### Assembly note
30 |  
31 | Sequence assembly is, by its very nature, extremely memory intensive. For complete assembly of the data using AWS, I recommend using a t2.medium machine that has two cores and 4 GB RAM or a t2.large machine that has two cores and 8 GB RAM. (When working with complex data like metagenomes that we will talk about later, even this is not enough and we step up to machines with hundreds of GB of RAM or even many TB of RAM!)
32 | 
33 | If you have a running AWS instance, you can change the state of the machine:
34 | 
35 | 1. Log in to the AWS Console
36 | 2. Stop the machine by choosing the machine and clicking Stop from the Actions menu (under instance state). This will suspend but not delete it
37 | 3. Once it has stopped, you can change the type by choosing Actions --> Instance Settings --> Change Instance Type
38 | 4. Restart the instance as the new type.
39 | 
40 | ### Getting started with the assignment
41 | 
42 | First, you need to download the data from the SRA using [fastq-dump](../Databases/SRA#fastq-dump). 
43 | 
44 | This should create one, two or three files for your data. Note that fastq-dump also leaves a copy of the data in the `~/ncbi/` directory that you can go ahead and delete to save space.
45 | 
46 | Next, we need to assemble that data. For this assignment we are going to use spades.
47 | 
48 | spades.py has a lot of options - run `spades.py` without any arguments to see what they are (pro tip: you may want to pipe the output of that to `less`). The main ones we will use are `-1` and `-2` for left and right paired end reads, and `-s` for unpaired reads. If `fastq-dump` gave you one file, use `-s` with that file name. If `fastq-dump` gave you two files, one will be used with the `-1` option and the other with the `-2` option, and finally if `fastq-dump` gave you three files, you will use the paired files with `-1` and `-2` and the unpaired file with `-s`. Note that with spades you can specify many multiples of paired end files, and many additional unpaired files.
49 | 
50 | Once you have assembled that data, can you generate the statistics that describe:
51 | 
52 | * Unassembled Size (bp)
53 | * Unassembled size (# reads)
54 | * Number of Contigs
55 | * Longest contig
56 | * N<sub>50</sub>
57 | 
58 | ## Part 2. Identify the open reading frames
59 | 
60 | Once you have assembled the genome into contigs, we are going to annotate the [open reading frames](../ORFCalling/) in the genome.
61 | 
62 | Describe:
63 | 
64 | * The number of predicted genes
65 | * The length of the longest gene (bp)
66 | 
67 | ## Part 3. Identify the RNA genes.
68 | 
69 | The next step is to identify the [RNA genes](../tRNA_rRNA) in the genome. We will identify both the rRNA and the tRNA genes.
70 | 
71 | Describe:
72 | 
73 | * The number of rRNA
74 | * Number of tRNA
75 | 
76 | 
77 | ## Part 4. Identify the functions of the proteins.
78 | 
79 | Describe: the function of the longest protein, the number of hypothetical proteins, and the number of proteins with known functions.
80 | 
81 | 
82 | 
83 | 


--------------------------------------------------------------------------------
/Assignments/GenomicsAssignment/klebsiella.txt:
--------------------------------------------------------------------------------
  1 | ERS005742
  2 | ERS005743
  3 | ERS005744
  4 | ERS005745
  5 | ERS005747
  6 | ERS005748
  7 | ERS005749
  8 | ERS005750
  9 | ERS005751
 10 | ERS005752
 11 | ERS005754
 12 | ERS005755
 13 | ERS005756
 14 | ERS005757
 15 | ERS005758
 16 | ERS005759
 17 | ERS005760
 18 | ERS005762
 19 | ERS005763
 20 | ERS005764
 21 | ERS005765
 22 | ERS005766
 23 | ERS005767
 24 | ERS005768
 25 | ERS005770
 26 | ERS005771
 27 | ERS005772
 28 | ERS005773
 29 | ERS005774
 30 | ERS005775
 31 | ERS005777
 32 | ERS005778
 33 | ERS005779
 34 | ERS005780
 35 | ERS005781
 36 | ERS005783
 37 | ERS005784
 38 | ERS005785
 39 | ERS005786
 40 | ERS005787
 41 | ERS005788
 42 | ERS011784
 43 | ERS011785
 44 | ERS011786
 45 | ERS011787
 46 | ERS011788
 47 | ERS011789
 48 | ERS011790
 49 | ERS011791
 50 | ERS011792
 51 | ERS011794
 52 | ERS011797
 53 | ERS011798
 54 | ERS011799
 55 | ERS011800
 56 | ERS011801
 57 | ERS011802
 58 | ERS011803
 59 | ERS011804
 60 | ERS011805
 61 | ERS011806
 62 | ERS011807
 63 | ERS011808
 64 | ERS011809
 65 | ERS011810
 66 | ERS011811
 67 | ERS011812
 68 | ERS011813
 69 | ERS011814
 70 | ERS011815
 71 | ERS011816
 72 | ERS011817
 73 | ERS011818
 74 | ERS011819
 75 | ERS011820
 76 | ERS011821
 77 | ERS011822
 78 | ERS011823
 79 | ERS011824
 80 | ERS011825
 81 | ERS011826
 82 | ERS011827
 83 | ERS011828
 84 | ERS011829
 85 | ERS011830
 86 | ERS011833
 87 | ERS011834
 88 | ERS011835
 89 | ERS011836
 90 | ERS011837
 91 | ERS011838
 92 | ERS011839
 93 | ERS011840
 94 | ERS011841
 95 | ERS011842
 96 | ERS011843
 97 | ERS011845
 98 | ERS011847
 99 | ERS011848
100 | ERS011849
101 | ERS011850
102 | ERS011851
103 | ERS011852
104 | ERS011853
105 | ERS011854
106 | ERS011855
107 | ERS011856
108 | ERS011857
109 | ERS011858
110 | ERS011859
111 | ERS011860
112 | ERS011861
113 | ERS011862
114 | ERS011863
115 | ERS011864
116 | ERS011865
117 | ERS011866
118 | ERS011867
119 | ERS011868
120 | ERS011869
121 | ERS011870
122 | ERS011871
123 | ERS011872
124 | ERS011873
125 | ERS011878
126 | ERS011879
127 | ERS011880
128 | ERS011882
129 | ERS011883
130 | ERS011884
131 | ERS011885
132 | ERS011886
133 | ERS011887
134 | ERS011888
135 | ERS011890
136 | ERS011891
137 | ERS011892
138 | ERS011893
139 | ERS011894
140 | ERS011895
141 | ERS011896
142 | ERS011897
143 | ERS011898
144 | ERS011900
145 | ERS011901
146 | ERS011902
147 | ERS011903
148 | ERS011904
149 | ERS011905
150 | ERS011906
151 | ERS011907
152 | ERS011908
153 | ERS011909
154 | ERS011910
155 | ERS011911
156 | ERS011912
157 | ERS011913
158 | ERS011914
159 | ERS011915
160 | ERS011916
161 | ERS011917
162 | ERS011918
163 | ERS011919
164 | ERS011920
165 | ERS011921
166 | ERS011922
167 | ERS011923
168 | ERS011924
169 | ERS011925
170 | ERS011926
171 | ERS011927
172 | ERS011930
173 | ERS011931
174 | ERS011932
175 | ERS011933
176 | ERS011934
177 | ERS011935
178 | ERS011936
179 | ERS011938
180 | ERS011939
181 | ERS011940
182 | ERS011941
183 | ERS011942
184 | ERS011943
185 | ERS011944
186 | ERS011945
187 | ERS011946
188 | ERS011947
189 | ERS011948
190 | ERS011949
191 | ERS011950
192 | ERS011951
193 | ERS011952
194 | ERS011953
195 | ERS011954
196 | ERS011955
197 | ERS011956
198 | ERS011957
199 | ERS011958
200 | ERS011959
201 | ERS011960
202 | ERS011961
203 | ERS011962
204 | ERS011963
205 | ERS011964
206 | ERS011965
207 | ERS011966
208 | ERS011968
209 | ERS011969
210 | ERS011970
211 | ERS011971
212 | ERS011972
213 | ERS011973
214 | ERS011974
215 | ERS011975
216 | ERS011976
217 | ERS011977
218 | ERS011978
219 | ERS011979
220 | ERS011980
221 | ERS011981
222 | ERS011982
223 | ERS011983
224 | ERS011984
225 | ERS011985
226 | ERS011986
227 | ERS011987
228 | ERS011988
229 | ERS011989
230 | ERS011990
231 | ERS011991
232 | ERS011992
233 | ERS011993
234 | ERS011996
235 | ERS011997
236 | ERS011998
237 | ERS011999
238 | ERS012001
239 | ERS012002
240 | ERS012003
241 | ERS012004
242 | ERS012005
243 | ERS012006
244 | ERS012008
245 | ERS012009
246 | ERS012010
247 | ERS012011
248 | ERS012012
249 | ERS012013
250 | ERS012014
251 | ERS012015
252 | ERS012016
253 | ERS012017
254 | ERS012019
255 | ERS012020
256 | ERS012021
257 | ERS012022
258 | ERS012023
259 | ERS012024
260 | ERS012026
261 | ERS012027
262 | ERS012028
263 | ERS012029
264 | ERS012030
265 | ERS012031
266 | ERS012032
267 | ERS012033
268 | ERS012034
269 | ERS012035
270 | ERS012036
271 | ERS012037
272 | ERS012038
273 | ERS012039
274 | ERS012040
275 | ERS012041
276 | ERS012042
277 | ERS012043
278 | ERS012044
279 | ERS012045
280 | ERS012046
281 | ERS012047
282 | ERS012048
283 | ERS012049
284 | ERS012050
285 | ERS012051
286 | ERS012052
287 | ERS012053
288 | ERS012054
289 | ERS012055
290 | 


--------------------------------------------------------------------------------
/Assignments/NCBIEDirectAssignment/README.md:
--------------------------------------------------------------------------------
 1 | # Assignment 1
 2 | 
 3 | This is a test to make sure that you have the AWS image up and running and that you can access it and generate meaningful results.
 4 | 
 5 | It will also familiarize you with [NCBI EDirect](../Databases/NCBI_Edirect.html) that we will use during the course.
 6 | 
 7 | The file [genera.txt](https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/master/Assignments/NCBIEDirectAssignment/genera.txt) is a plain text file that you can read on the command line using `less` or `more`. The file has 69 different organisms, listed as genus and species, with one entry per line. The assignment is to familiarize yourself with [NCBI's EDirect](../../Databases/NCBI_Edirect.md) tool and to use some advanced bash scripting.
 8 | 
 9 | In the first part of the assignment, you must identify which organism has the largest number of genomes in the *assembly* database. You should use one of the `edirect` scripts provided on the AWS image to complete that task as shown in the manual.
10 | 
11 | In the second part of the assignment, you must calculate the **AVERAGE** (mean) genome size of the genomes associated with “**Prevotella buccalis**”. 
12 | 
13 | *Hint*: There is a command called countfasta.py that will likely help you with this step!


--------------------------------------------------------------------------------
/Assignments/NCBIEDirectAssignment/genera.txt:
--------------------------------------------------------------------------------
 1 | Acidaminococcus intestini
 2 | Acinetobacter qingfengensis
 3 | Aeromonas aquariorum
 4 | Agromyces subbeticus
 5 | Aquaspirillum serpens
 6 | Bacillus sonorensis
 7 | Bifidobacterium boum
 8 | Brevundimonas aveniformis
 9 | Burkholderia cenocepacia
10 | Caldanaerobacter subterraneus
11 | Chryseobacterium daeguense
12 | Corynebacterium casei
13 | Coxiella burnetii
14 | Crocinitomicaceae bacterium
15 | Cupriavidus necator
16 | Desulfatibacillum aliphaticivorans
17 | Echinacea purpurea
18 | Enterococcus quebecensis
19 | Ewingella americana
20 | Exiguobacterium alkaliphilum
21 | Fischerella muscicola
22 | Fluoribacter dumoffii
23 | Gardnerella vaginalis
24 | Gelidibacter mesophilus
25 | Gluconobacter thailandicus
26 | Helicobacter bilis
27 | Henriciella marina
28 | Imtechella halotolerans
29 | Janthinobacterium svalbardensis
30 | Jeotgalicoccus marinus
31 | Kineococcus radiotolerans
32 | Kordia jejudonensis
33 | Kribbella flavida
34 | Lactobacillus xiangfangensis
35 | Leptospiraceae bacterium
36 | Leptospira wolffii
37 | Limnohabitans planktonicus
38 | Mameliella atlantica
39 | Marinobacter similis
40 | Megamonas hypermegale
41 | Meiothermus taiwanensis
42 | Methylacidiphilum kamchatkense
43 | Mycobacterium fragae
44 | Neptuniibacter caesariensis
45 | Niastella populi
46 | Nitrospira japonica
47 | Nocardia seriolae
48 | Paenibacillus swuensis
49 | Phaeobacter inhibens
50 | Photorhabdus heterorhabditis
51 | Porphyrobacter donghaensis
52 | Prevotella buccalis
53 | Pseudoalteromonas ruthenica
54 | Pseudomonas punonensis
55 | Rhizobium selenitireducens
56 | Rhodococcus corynebacterioides
57 | Rhodospirillales bacterium
58 | Roseburia intestinalis
59 | Sphingobium baderi
60 | Sphingomonas haloaromaticamans
61 | Spirosoma montaniterrae
62 | Staphylococcus sciuri
63 | Streptococcus equi
64 | Streptomyces silaceus
65 | Terribacillus aidingensis
66 | Thermobifida halotolerans
67 | Thermocrispum agreste
68 | Vibrio toranzoniae
69 | Virgibacillus halodenitrificans
70 | 


--------------------------------------------------------------------------------
/Assignments/README.md:
--------------------------------------------------------------------------------
 1 | # Assignments
 2 | 
 3 | This folder contains the assignments for Fall 2018 and associated data. The solutions to the assignments will be posted once the assignment is complete. Of course, once the assignment solution is posted no further attempts will be accepted for an assignment!
 4 | 
 5 | All the assignments are posted on the official SDSU CMS (BlackBoard) and that is considered the authoritative site for deadlines, etc. All work must be turned in through Blackboard to receive a grade.
 6 | 
 7 | * [NCBI EDirect](Assignments/NCBIEDirectAssignment) is to familiarize yourself with NCBI EDirect.
 8 | * [Genomics Assignment](Assignments/GenomicsAssignment/) is to analyze complete genomes from *Klebsiella*.
 9 | * [Metagenomics Assignment](Assignments/MetagenomicsAssignment) is to analyze some metagenomics data and describe the organisms that you find there.
10 | 


--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
 1 | cff-version: 1.1.0
 2 | message: "If you use this software, please cite it as below."
 3 | authors:
 4 |   - family-names: Edwards
 5 |     given-names: Robert
 6 |     orcid: https://orcid.org/0000-0001-8383-8949
 7 | title: "Computational Genomics Manual"
 8 | version: v2
 9 | date-released: 2023-07-24
10 | doi: 10.5281/zenodo.8178842
11 | url: "https://github.com/linsalrob/ComputationalGenomicsManual"
12 | 


--------------------------------------------------------------------------------
/CheckM/CheckM.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/CheckM/CheckM.pdf


--------------------------------------------------------------------------------
/CheckM/README.md:
--------------------------------------------------------------------------------
 1 | # Checking Genome Contamination (Redundancy) and Completeness with CheckM
 2 | 
 3 | When you sequence a genome, and especially when you construct [metagenome assembled genomes](../CrossAssembly), you would like to know whether the genome is complete (i.e. it has all the genes you would expect to be there), and whether there is data from more than one organism in the sequence.
 4 | 
 5 | One approach you can take to explore your genome is to use [GenomePeek](../GenomePeek/), which will show you contamination, but will not show you completeness.
 6 | 
 7 | <a href="https://ecogenomics.github.io/CheckM/" target="_blank"><img src="images/checkm.png" width="600 px" alt="CheckM logo" title="The CheckM logo" /></a>
 8 | 
 9 | An alternative is to use [CheckM](https://ecogenomics.github.io/CheckM/) that does several computations. 
10 | 
11 | CheckM uses [prodigal](../ORFCalling/) to identify the genes in your sequences.
12 | 
13 | First, it places your genome on a tree, and then it uses that phylogenetic placement to identify sets of genes that should be in your genome.
14 | 
15 | Next, it looks for those genes using [hmmer](../Databases) to search your genome. The completeness is an estimate of the fraction of genes that are expected to be there which were actually found. The contamination is based on counting the single copy genes, each of which should only be there once, that are found more than once.
16 | 
17 | There are lots of different workflows that you can complete using CheckM and  you should [check out their manual](https://github.com/Ecogenomics/CheckM/wiki) for more commands and workflows.
18 | 
19 | In the [previous steps](../CrossAssembly/) we made a bin based on contigs that are correlated with each other across multiple genomes. We can put those contigs into a directory, and use CheckM to test what is our coverage or completeness.
20 | 
21 | The simplest workflow that we use is:
22 | 
23 | ```bash
24 | checkm lineage_wf GenomeBins/ CheckMOut
25 | ```
26 | 
27 | *Note:* If you have more than one thread on your computer you can make this run a lot faster by using them. For example,
28 | 
29 | ```bash
30 | checkm lineage_wf -t 16 GenomeBins/ CheckMOut
31 | ```
32 | will run `checkm` with 16 threads.
33 | 
34 | `GenomeBins` is the name of a directory that has the genome bins that you want to analyze.
35 | 
36 | You can find a detailed description of the steps that CheckM takes [in the lineage_wf](https://github.com/Ecogenomics/CheckM/wiki/Workflows#lineage-specific-workflow) on the [CheckM wiki](https://github.com/Ecogenomics/CheckM/wiki). 
37 | 
38 | *Note:* In the [previous steps](../CrossAssembly/) we filtered contigs based on their Pearson correlation coefficient. You can make repeated attempts at binning sequences with different correlation coefficients or other parameters, and then testing them with CheckM to see the result.
39 | 
40 | 
41 | 


--------------------------------------------------------------------------------
/CheckM/images/checkm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/CheckM/images/checkm.png


--------------------------------------------------------------------------------
/Conda/README.md:
--------------------------------------------------------------------------------
  1 | # Installing software using CONDA
  2 | 
  3 | We are going to install all our software using conda and bioconda. First, we need to install conda:
  4 | 
  5 | ## Download conda
  6 | 
  7 | Head to the [miniforge download page](https://github.com/conda-forge/miniforge) and download the appropriate installer.
  8 | 
  9 | You want :
 10 | 
 11 |  - **Linux Installer** 
 12 |  - **x86_64**
 13 |  - **Python** with the biggest number
 14 |  - **Miniforge3 Linux x86_64**
 15 |  
 16 | Here is a quick way to download that script!
 17 | 
 18 | Right click on the appropriate link, and choose *Copy link address*. Go back to your terminal window (probably Putty) and type `wget` and a space, and then paste the URL that you just copied. Press return and it should download the file for you!
 19 | 
 20 | The file should be called `Miniforge3-Linux-x86_64.sh` but in case it is not, just substitute the appropriate file name below. Remember that you can check with `ls -ltr` to see the newest file downloaded.
 21 | 
 22 | Run the miniforge installer:
 23 | 
 24 | ```bash
 25 | bash Miniforge3-Linux-x86_64.sh
 26 | ```
 27 | 
 28 | This will ask you some questions, and you can pretty much accept the default answer to all the questions.
 29 | 
 30 | Once the installer has finished the best way to continue is to log out, and then log back in. This will reset your account and you will have conda activated. At the bottom left of your screen you should see it say `(base)` which means that you are in the base conda installation.
 31 | 
 32 | 
 33 | ## Install your first bioinformatics package
 34 | 
 35 | We are going to install `fastp` and test to see if it works. This will demonstrate how to install a conda package.
 36 | 
 37 | ```bash
 38 | mamba install -c conda-forge -c bioconda fastp 
 39 | ```
 40 | 
 41 | Again, this will work on resolution of the packages for you and ask if you are sure. Once it is complete, you should be able to issue the command:
 42 | 
 43 | ```bash
 44 | fastp -v
 45 | ```
 46 | 
 47 | to see the version of `fastp` that has been installed.
 48 | 
 49 | You can also run 
 50 | 
 51 | ```bash
 52 | fastp -h
 53 | ```
 54 | 
 55 | To get the full `fastp` help menu
 56 | 
 57 | 
 58 | ## Install snakemake
 59 | 
 60 | For the next steps of this tutorial, we are going to use [snakemake](https://snakemake.readthedocs.io/en/stable/) to run some software. So we are going to use `mamba` to install that:
 61 | 
 62 | ```bash
 63 | mamba install -c conda-forge -c bioconda snakemake
 64 | ```
 65 | 
 66 | Once that has completed, `snakemake -v` should show you the current version.
 67 | 
 68 | ## Install other bioinformatics packages
 69 | 
 70 | You can install pretty much any bioinformatics package using conda. The [anaconda website has a complete list](https://anaconda.org/bioconda/repo) and you can visit the [bioconda](https://bioconda.github.io/) page for more information about bioconda.
 71 | 
 72 | but to get started, you might want to install:
 73 | 
 74 | ```bash
 75 | mamba install -c conda-forge -c bioconda fastp minimap2 samtools
 76 | ```
 77 | 
 78 | ## Adding channels
 79 | 
 80 | It is a pain to keep typing `-c conda-forge -c bioconda` so we can just add those two channels to our configuration
 81 | 
 82 | ```bash
 83 | conda config --add channels bioconda
 84 | conda config --add channels conda-forge
 85 | ```
 86 | 
 87 | ## Updating a package
 88 | 
 89 | With `conda` (or `mamba`) you can easily update a package if there is a newer version. For example, to update `snakemake` you would use:
 90 | 
 91 | ```bash
 92 | mamba update snakemake
 93 | ```
 94 | 
 95 | What this means is that _you_ are responsible for ensuring your software is up-to-date. Or not. If you are working on a set of data analyses you may want to keep all the software at the same version so that each time you do an analysis you get comparable answers. With `conda`, you have control over the update cycles, but don't forget from time-to-time you might want to update the software! 
 96 | 
 97 | ## Environments
 98 | 
 99 | If you want to keep different versions of software or run different pipelines you can do that with conda, in what are called `environments`. Each one can have different software. `conda` is clever, because if you have the same software in two different environments you don't need an entire copy of the software. At this stage, you don't need to worry about that, and you can just install everything in the `base` environment. But if you start to run into installation issues, then remember you can separate things into different environments.
100 | 
101 | 


--------------------------------------------------------------------------------
/CrossAssembly/CCOM.md:
--------------------------------------------------------------------------------
 1 | # Binning using CCOM
 2 | 
 3 |  <img align="right" src="images/CCOM.png" alt="Contig Clustering Logo" />
 4 |  As an alternative to manual binning using `crAss`, and using `metabat`, we have developed a website, [Contig Clustering of Metagenomes](https://edwards.sdsu.edu/ContigClustering) where you can upload reads and contigs, and we will run three different binning algorithms: [metabat](https://bitbucket.org/berkeleylab/metabat), [GroopM](http://minillinim.github.io/GroopM/) and [crAss](http://edwards.sdsu.edu/crass/).
 5 |  
 6 | CCOM requires a `contig` file, which must have a `.fasta` extension, and all the individual read files that were used as input to form the contigs. For example, for the algae data we have the `contigs.fasta` file and the individual read files `Algae_11.renum.fna`, `Algae_12.renum.fna`, `Algae_13.renum.fna`, and `Algae_14.renum.fna`.
 7 | 
 8 | *Note*: The extension of the read files must be changed from `.fasta` to `.fna`. This doesn’t change the file format, but it allows the CCOM tool to distinguish the read files from the contig file. Only one set can be run at a time (i.e. you cannot run multiple contig files at the same time), but you can upload several independent datasets and wait for them to be processed.
 9 | 
10 | The output from CCOM is saved in a zip file called `workfile.zip` that you can download. 
11 | 
12 | Unzip the file and navigate to `/workfile/var/www/html/ContigClustering/cgi-bin/ContigClustering/uploads/XXXXXXXX`. The `XXXXXXXX` will be a Job ID number that is created by CCOM and is unique to you. This directory contains all the temporary files (`.bam`, `.bai` and `.sam` files) generated while running the binning tools. The bins extracted from the contigs are saved in the folders `MetaBatBins` and `GroopMBins`.
13 | 
14 | 
15 | 


--------------------------------------------------------------------------------
/CrossAssembly/CCOM.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/CrossAssembly/CCOM.pdf


--------------------------------------------------------------------------------
/CrossAssembly/CrossAssembly.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/CrossAssembly/CrossAssembly.pdf


--------------------------------------------------------------------------------
/CrossAssembly/Metabat.md:
--------------------------------------------------------------------------------
 1 | # Binning using MetaBat
 2 | 
 3 | One of the most popular programs for binning metagenomes is [metabat](https://bitbucket.org/berkeleylab/metabat).
 4 | 
 5 | Metabat does the profiling for you based on tetranucleotide frequency in the samples. It counts the 4-mers, (i.e. `AAAA`, `AAAT`, `AAAG`, `AAAC`, `AATA`, `AATT`, `…`) in the sequences and uses those to suggest which samples should go together. The advantage of `metabat` is that it will return your contigs for you. 
 6 | 
 7 | <img align="right" src="images/metabat.png" alt="Metabat pipeline" />
 8 | 
 9 | Metabat requires one `bam` file for every input sample, and one set of contigs. We can use our existing datasets to try binning with metabat, but we need to map the individual reads to the contigs in separate files (in the previous analysis we mapped all the reads to the contigs in one big file).
10 | 
11 | You can use four pairs of commands to do this (in each pair, the `samtools view` line ends with a pipe, `|`, so the shell reads the `samtools sort` line that follows it as part of the same command):
12 | 
13 | ```bash
14 | bowtie2 -f  -x AlgaeBowtie -U Algae_11.renum.fna > Algae_11.renum.fna.AllReads.sam
15 | samtools view -bS  Algae_11.renum.fna.AllReads.sam |
16 | samtools sort - Algae_11.renum.fna.AllReads
17 | 
18 | bowtie2 -f  -x AlgaeBowtie -U Algae_12.renum.fna > Algae_12.renum.fna.AllReads.sam
19 | samtools view -bS  Algae_12.renum.fna.AllReads.sam |
20 | samtools sort - Algae_12.renum.fna.AllReads
21 | 
22 | bowtie2 -f  -x AlgaeBowtie -U Algae_13.renum.fna > Algae_13.renum.fna.AllReads.sam
23 | samtools view -bS  Algae_13.renum.fna.AllReads.sam |
24 | samtools sort - Algae_13.renum.fna.AllReads
25 | 
26 | bowtie2 -f  -x AlgaeBowtie -U Algae_14.renum.fna > Algae_14.renum.fna.AllReads.sam
27 | samtools view -bS  Algae_14.renum.fna.AllReads.sam |
28 | samtools sort - Algae_14.renum.fna.AllReads
29 | ```
30 | 
31 | 
32 | However, this sort of repetitive work is exactly what computers are good at, and so we can have the computer do it for us all at once (make sure you change the Algae part as appropriate):
33 | 
34 | ```bash
35 | for FASTA in Algae_*renum.fna; do bowtie2 -f  -x AlgaeBowtie -U $FASTA > $FASTA.AllReads.sam; samtools view -bS  $FASTA.AllReads.sam |
36 | samtools sort - $FASTA.AllReads; done
37 | ```
38 | 
39 | Once we have this, we are set to run metabat:
40 | 
41 | ```bash
42 | runMetaBat.sh --minSamples 3 AlgaeAssembly/contigs.fasta *bam
43 | ```
44 | 
45 | Note that I alter the number of minSamples to be less than the number of samples we have (currently 4).
46 | 
47 | Metabat creates a fasta file for each bin that it thinks is a realistic contig.
48 | 


--------------------------------------------------------------------------------
/CrossAssembly/Metabat.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/CrossAssembly/Metabat.pdf


--------------------------------------------------------------------------------
/CrossAssembly/images/CCOM.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/CrossAssembly/images/CCOM.png


--------------------------------------------------------------------------------
/CrossAssembly/images/metabat.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/CrossAssembly/images/metabat.png


--------------------------------------------------------------------------------
/CrossAssembly/images/nineteencontigs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/CrossAssembly/images/nineteencontigs.png


--------------------------------------------------------------------------------
/CrossAssembly/images/spades.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/CrossAssembly/images/spades.png


--------------------------------------------------------------------------------
/Databases/Databases.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Databases/Databases.pdf


--------------------------------------------------------------------------------
/Databases/NCBI_Edirect.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Databases/NCBI_Edirect.pdf


--------------------------------------------------------------------------------
/Databases/SRA.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Databases/SRA.pdf


--------------------------------------------------------------------------------
/Databases/images/1772048.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Databases/images/1772048.png


--------------------------------------------------------------------------------
/Databases/images/DDBJ.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Databases/images/DDBJ.png


--------------------------------------------------------------------------------
/Databases/images/EMBL.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Databases/images/EMBL.png


--------------------------------------------------------------------------------
/Databases/images/GenBankGrowth.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Databases/images/GenBankGrowth.png


--------------------------------------------------------------------------------
/Databases/images/GroundwaterSize.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Databases/images/GroundwaterSize.png


--------------------------------------------------------------------------------
/Databases/images/JGI.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Databases/images/JGI.jpg


--------------------------------------------------------------------------------
/Databases/images/JGI.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Databases/images/JGI.png


--------------------------------------------------------------------------------
/Databases/images/Mappers.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Databases/images/Mappers.png


--------------------------------------------------------------------------------
/Databases/images/NCBI.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Databases/images/NCBI.png


--------------------------------------------------------------------------------
/Databases/images/NCBIEDirect.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Databases/images/NCBIEDirect.png


--------------------------------------------------------------------------------
/Databases/images/TARASize.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Databases/images/TARASize.png


--------------------------------------------------------------------------------
/Databases/images/UniProt.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Databases/images/UniProt.png


--------------------------------------------------------------------------------
/Databases/images/groundwater.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Databases/images/groundwater.png


--------------------------------------------------------------------------------
/Databases/images/kegg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Databases/images/kegg.png


--------------------------------------------------------------------------------
/Databases/images/pdb.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Databases/images/pdb.png


--------------------------------------------------------------------------------
/Databases/images/seed.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Databases/images/seed.png


--------------------------------------------------------------------------------
/Datasets/.gitattributes:
--------------------------------------------------------------------------------
1 | *.gz filter=lfs diff=lfs merge=lfs -text
2 | *.qza filter=lfs diff=lfs merge=lfs -text
3 | *.tgz filter=lfs diff=lfs merge=lfs -text
4 | 


--------------------------------------------------------------------------------
/Datasets/CF/788707_20171213_S_R1.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:f5cf5d9e2f34b39f94ae48317fc092e75610a9ffffa79f2a42094e02210db693
3 | size 29969364
4 | 


--------------------------------------------------------------------------------
/Datasets/CF/788707_20171213_S_R2.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:aafc219d5fef1d9ac63312c182572d212557beff111120c8008e9fffce24da28
3 | size 33996800
4 | 


--------------------------------------------------------------------------------
/Datasets/CF/788707_20180129_S_R1.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:bbfbde9e455255f501e3b94dda601f127e54dc3b2e14008a078cf9758aeb5798
3 | size 32965335
4 | 


--------------------------------------------------------------------------------
/Datasets/CF/788707_20180129_S_R2.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:ffe308d67fc448dad3667415bb006f4dea26b868953ae52dac41cfcc8e5bec66
3 | size 35590845
4 | 


--------------------------------------------------------------------------------
/Datasets/CF/788707_20180313_S_R1.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:37d176fed5437556bcb2e3389e329e0987cc01c59c4a0444adafc669c2672f49
3 | size 31405881
4 | 


--------------------------------------------------------------------------------
/Datasets/CF/788707_20180313_S_R2.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:91d06b7372178ae53611db498a35d54ebd7672e48b6f891dfa5b270b6e489862
3 | size 34679707
4 | 


--------------------------------------------------------------------------------
/Datasets/CF/788707_20181126_S_R1.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:952edf99d0daacfa271005d6d2b552964d4de76ad714b172e2d3852f6681099d
3 | size 30220732
4 | 


--------------------------------------------------------------------------------
/Datasets/CF/788707_20181126_S_R2.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:aca330e41f8a88a13d8838a83a881a6ccab71b78648d56ed161bb825ad5f1862
3 | size 34194628
4 | 


--------------------------------------------------------------------------------
/Datasets/CF/README.md:
--------------------------------------------------------------------------------
1 | # CF Data
2 | 
3 | A set of CF datasets made by Rob and team. For more information, or if you use these sequences, please talk to Rob
4 | 


--------------------------------------------------------------------------------
/Datasets/Datasets.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Datasets/Datasets.pdf


--------------------------------------------------------------------------------
/Datasets/README.md:
--------------------------------------------------------------------------------
 1 | # Data Sets
 2 | 
 3 | We have gathered several data sets for you to use in the course, and in this manual we use different data sets as examples.
 4 | 
 5 | The main thing you need to know is whether a data set is a 16S amplicon library or a random community metagenome. 
 6 | 
 7 | ## Coral and Algae
 8 | 
 9 | These are coral, algae, CCA, and water (control) samples from Kevin Walsh in Liz Dinsdale's lab. 
10 | 
11 | This is a random community metagenome data set
12 | 
13 | [Read more about the coral and algae data sets](coral_algae/)
14 | 
15 | ## Ground Water
16 | 
17 | This data comes from SRA project [SRP075429](https://www.ncbi.nlm.nih.gov/Traces/study/?acc=SRP075429) 
18 | 
19 | Hernsdorf AW, Amano Y, Miyakawa K, Ise K, Suzuki Y, Anantharaman K, Probst A, Burstein D, Thomas BC, Banfield JF. 2017. Potential for microbial H2 and metal transformations associated with novel bacteria and archaea in deep terrestrial subsurface sediments. [ISME J 11:1915–1929](https://www.nature.com/articles/ismej201739)
20 | 
21 | This is a random community metagenome data set
22 | 
23 | [Read more about the ground water datasets](ground_water/)
24 | 
25 | ## Gut
26 | 
27 | This data comes from SRA project [SRP074153](https://www.ncbi.nlm.nih.gov/Traces/study/?acc=SRP074153) 
28 | 
29 | This is a random community metagenome data set
30 | 
31 | Brooks B, Olm MR, Firek BA, Baker R, Thomas BC, Morowitz MJ, Banfield JF. 2017. Strain-resolved analysis of hospital rooms and infants reveals overlap between the human and room microbiome. [Nat Commun 8:1814](https://www.nature.com/articles/s41467-017-02018-w)
32 | 
33 | [Read more about the gut datasets](gut/)
34 | 
35 | 
36 | ## Drinking Water
37 | 
38 | A drinking water study from the [University of Adelaide, Australia](https://www.adelaide.edu.au/)
39 | 
40 | This is a 16S amplicon dataset with [SRP ID SRP059994](https://www.ncbi.nlm.nih.gov/Traces/study/?acc=SRP059994) 
41 | 
42 | [Read more about the drinking water datasets](drinking_water/)
43 | 
44 | 


--------------------------------------------------------------------------------
/Datasets/coral_algae/Algae.tgz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:788d1cef8c0e6be43f54eb04b06ba71dadf32b6c40503ab26cd39a9bdc70806a
3 | size 43291378
4 | 


--------------------------------------------------------------------------------
/Datasets/coral_algae/CCA.tgz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:d24ade3d7ebd8fa9239964bd5834f31349a1cfddf77a7660858b6cb5a2cabc8c
3 | size 31237111
4 | 


--------------------------------------------------------------------------------
/Datasets/coral_algae/Control.tgz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:8d7385134fd449fdd045b0006764781a57238309fb3da1352c2909bb90a99bb3
3 | size 44484766
4 | 


--------------------------------------------------------------------------------
/Datasets/coral_algae/Coral.tgz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:01115c4dd6f3f9f2779377761464ab32b59782a7b1491837d7f6dc16beab11f1
3 | size 43573421
4 | 


--------------------------------------------------------------------------------
/Datasets/coral_algae/README.md:
--------------------------------------------------------------------------------
 1 | # Coral and Algae Data Set
 2 | 
 3 | 
 4 | These are coral, algae, CCA, and water (control) samples from Kevin Walsh in [Liz Dinsdale's lab](https://dinsdalelab.sdsu.edu/). 
 5 | 
 6 | This is a random community metagenome data set
 7 | 
 8 | There are 50,000 reads in these data sets.
 9 | 
10 | ## Abstract:
11 | Coral reefs are undergoing global microbialization as carbon and energy from higher trophic levels shift into the microbial food web. Increase in labile carbon resources has altered microbial community composition from phototrophs to copiotrophs and super-heterotrophs. Even in oligotrophic systems, super-heterotrophs persist in the rare biosphere. These rare organisms are likely to become abundant with microbialization, but their scarcity inhibits deep sequencing and metabolic analysis with metagenomics. Therefore, we utilized enrichment after pre-exposure of the water column microbiome to benthic organisms, 1) water control, 2) alga, Stypopodium zonale, 3) crustose coralline algae, and 4) the coral, Mussismilia harttii to identify super-heterotrophs in the coral reef environment. We compared enriched communities to metagenomes from coral reef water and the coral Mussismilia braziliensis to compare the presence of dominant genera as population genomes in these native microbial communities. Enrichment selected for super-heterotrophs in the genus, Vibrio, Pseudoalteromonas and Arcobacter with greater sequence coverage of Arcobacter in coral exposures. We assembled two Vibrio, a Pseudoalteromonas and an Arcobacter, which identified previously unannotated sequences. To determine genes that define the ecotypes of the environmental microbes, we compared the population genomes to genomes of related organisms sequenced from cultured isolates. We found the coral reef associated Vibrio population had a higher proportion of genes in the metabolic pathways: Type IV secretion and conjugative transfer, maltose utilization and urea decomposition. Pseudoalteromonas population had more genes involved in sugar utilization pathways while Arcobacter population was distinguished by genes contributed to type VI secretion systems and utilization of alkylphosphates and aromatic compounds. 
By assembling population genomes, we identified novel genes defining the ecotype relative to the pangenome of culture isolates. Novel gene identification informs the ecology of super-heterotrophs on coral reefs independent of the limitations of reference genomes.  
12 | 
13 | ## Data
14 | 
15 | We have made all the data sets available either as separate tarballs or as individual fastq files. *Note:* when you download the tarballs you will most likely need to change the name and then use the command:
16 | 
17 | ```bash
18 | tar xf Algae.tgz
19 | ```
20 | 
21 | to extract the data. (Of course, changing Algae to CCA, Control or Coral as appropriate.)
22 | 
23 | * [Algae.tgz](https://goo.gl/zvnZD4)
24 | * [CCA.tgz](https://goo.gl/LRCXy2)
25 | * [Control.tgz](https://goo.gl/Zbm9TA)
26 | * [Coral.tgz](https://goo.gl/Gf3EvW)
27 | 
28 | ### Algae
29 | 
30 | * [Algae 11](fastq/Algae_11.fastq.gz)
31 | * [Algae 12](fastq/Algae_12.fastq.gz)
32 | * [Algae 13](fastq/Algae_13.fastq.gz)
33 | * [Algae 14](fastq/Algae_14.fastq.gz)
34 | 
35 | ### CCA
36 | 
37 | * [CCA 11](fastq/CCA_11.fastq.gz)
38 | * [CCA 12](fastq/CCA_12.fastq.gz)
39 | * [CCA 13](fastq/CCA_13.fastq.gz)
40 | 
41 | ### Control
42 | 
43 | * [Control 11](fastq/Control_11.fastq.gz)
44 | * [Control 12](fastq/Control_12.fastq.gz)
45 | * [Control 13](fastq/Control_13.fastq.gz)
46 | * [Control 14](fastq/Control_14.fastq.gz)
47 | 
48 | ### Coral
49 | 
50 | * [Coral 11](fastq/Coral_11.fastq.gz)
51 | * [Coral 12](fastq/Coral_12.fastq.gz)
52 | * [Coral 13](fastq/Coral_13.fastq.gz)
53 | * [Coral 14](fastq/Coral_14.fastq.gz)
54 | 
55 | 
56 | 


--------------------------------------------------------------------------------
/Datasets/coral_algae/fastq/Algae_11.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:07616deb6e43f595ad504f48267f0ae0b47c1b42d2f200af748bc5c83116c4d8
3 | size 10565254
4 | 


--------------------------------------------------------------------------------
/Datasets/coral_algae/fastq/Algae_12.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:117d46ee5771dfde3f88d8a25f063941c86e544586fe0fa4834e2b2ab4c14ade
3 | size 11935577
4 | 


--------------------------------------------------------------------------------
/Datasets/coral_algae/fastq/Algae_13.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:6f8553ca2b8ece58b71b86fe04a1c22029f86cebc8b0372ca8f9b71a1e079499
3 | size 12073944
4 | 


--------------------------------------------------------------------------------
/Datasets/coral_algae/fastq/Algae_14.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:2fe6343bf112e44125764db5700133b45d33538ebb9a8910709f295b8d7f3548
3 | size 8709272
4 | 


--------------------------------------------------------------------------------
/Datasets/coral_algae/fastq/CCA_11.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:111908d1f105d136bb3a38d2c71d8595634b9e3024f6d5ce56540ed6e409f7da
3 | size 11616756
4 | 


--------------------------------------------------------------------------------
/Datasets/coral_algae/fastq/CCA_12.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:3ae86b1b2d1c5a422da1a5fa69a053969d5522aa72cd54ae4be37d1c3206b308
3 | size 9332153
4 | 


--------------------------------------------------------------------------------
/Datasets/coral_algae/fastq/CCA_13.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:c1097e535f17724fa78394701a7b3a6bc23ae1b9ed0da8f8c49a34e58a5fe762
3 | size 10282867
4 | 


--------------------------------------------------------------------------------
/Datasets/coral_algae/fastq/Control_11.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:67bb01a83b3a72aff57430e23f7d95f6e71f1e83a1fc17b8748c643d4c2066ae
3 | size 12585168
4 | 


--------------------------------------------------------------------------------
/Datasets/coral_algae/fastq/Control_12.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:3d2a59c530324686b1acb3df6a5354cf61822a03e38d8305a8bd97e2ec39b977
3 | size 11846750
4 | 


--------------------------------------------------------------------------------
/Datasets/coral_algae/fastq/Control_13.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:dff2d0f908ebbdb7937e3d2f08eb8d8c770f6cc9de6787aea616e7a14ffdf355
3 | size 9610860
4 | 


--------------------------------------------------------------------------------
/Datasets/coral_algae/fastq/Control_14.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:cb4a262ab5dfd911dbcb011d37495c4deed788971764030d00759102d7124fab
3 | size 10434503
4 | 


--------------------------------------------------------------------------------
/Datasets/coral_algae/fastq/Coral_11.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:6bc3816478ac10a18045c0f0070f6a59fd0896ec01eb6bb9e5a5ff137f59f1e3
3 | size 11937818
4 | 


--------------------------------------------------------------------------------
/Datasets/coral_algae/fastq/Coral_12.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:4e9e060082995bddd6126319ed6cdee15150cd9858229f03249ae86dadbc7163
3 | size 12070712
4 | 


--------------------------------------------------------------------------------
/Datasets/coral_algae/fastq/Coral_13.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:c55fbc5badc350d3bffcddff0cc754cb0fed8b250d4f8410555ffd44d4c63803
3 | size 9125704
4 | 


--------------------------------------------------------------------------------
/Datasets/coral_algae/fastq/Coral_14.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:fa98300dbc832f482a2be255d7d9192b53724ab8e17d91c3fc51328712e15f62
3 | size 10431856
4 | 


--------------------------------------------------------------------------------
/Datasets/drinking_water/Calypso/README.md:
--------------------------------------------------------------------------------
1 | # Calypso data
2 | 
3 | This is the data required for [Calypso](http://cgenome.net/wiki/index.php/Calypso). 
4 | 
5 | You will need to download each of these three files to use them in Calypso.
6 | 


--------------------------------------------------------------------------------
/Datasets/drinking_water/Calypso/feature-table.biom:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Datasets/drinking_water/Calypso/feature-table.biom


--------------------------------------------------------------------------------
/Datasets/drinking_water/Calypso/metadata_calypso.tsv:
--------------------------------------------------------------------------------
1 | Sample ID	Label	Individual or animal	Include flag	Tag	BioSample	Sample name	Library name	MBases	MBytes	AvgSpotLen	Distribution sytem or WTP	Experiment	Publication sample name	collection date	library
2 | SRR2080423	SA tank #3	I1	1	CTCGATG	SAMN03776101	SA 3 tank	SA1_library_R_2012_12_05_20_11_34_user_SN2-26_Auto_user_SN2-26_30.fastq	80	66	143	Distribution system	SRX1075090	SA 3 tank	02-May-2012	SA1
3 | SRR2080425	1.5km post discharge	I1	1	ACCAACT	SAMN03776103	1.5kmPostDis	SA1_library_R_2012_12_05_20_11_34_user_SN2-26_Auto_user_SN2-26_30.fastq	80	66	143	Distribution system	SRX1075092	1.5kmPostDis	03-May-2012	SA1
4 | SRR2080427	SA Source Water 2	I1	1	TCGCAGG	SAMN03776105	SASourceWater2	SA2_library_R_2012_12_11_11_14_16_user_js2-SN2-29_Auto_user_SN2-29_33.fastq	68	56	139	Source water	SRX1075094	SASourceWater2	03-May-2012	SA2
5 | SRR2080434	SA Customer tap 2	I1	1	GCTCGAA	SAMN03776111	SA 2 CT	SA2_library_R_2012_12_11_11_14_16_user_js2-SN2-29_Auto_user_SN2-29_33.fastq	68	56	139	Distribution system	SRX1075101	SA 2 CT	02-May-2012	SA2
6 | SRR2080436	WA1 outlet	I1	1	GGATCAA	SAMN03776091	WA1 outlet	WA_library_R_2013_03_26_15_14_25_user_SN1-86.fastq	59	48	140	Distribution system	SRX1075103	WA1 outlet	03-May-2012	WA
7 | 


--------------------------------------------------------------------------------
/Datasets/drinking_water/Calypso/taxonomy.tsv:
--------------------------------------------------------------------------------
 1 | Feature ID	Taxon	Confidence
 2 | 8965c65564ea3b04bb15317dacddc2e8	k__Bacteria; p__Proteobacteria; c__Deltaproteobacteria; o__MIZ46; f__; g__; s__	0.9464073644349654
 3 | 3f858109e7e20a4f5c0748c463a8a2f6	k__Bacteria; p__Acidobacteria; c__Acidobacteria-6; o__iii1-15; f__; g__; s__	0.9968867063333938
 4 | 68a419933397fea1d4ac19377cc5f2b8	k__Bacteria; p__WS3; c__PRR-12; o__Sediment-1; f__PRR-10; g__; s__	0.9689803287271772
 5 | dcff3de33b5ae90169074fd3a9705a38	k__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales; f__Alicyclobacillaceae; g__Alicyclobacillus; s__	0.9864737465679667
 6 | ae9f4d23c10b6d3e87c7ff38c0a40b67	k__Bacteria; p__Proteobacteria; c__Betaproteobacteria; o__MWH-UniP1; f__; g__; s__	0.7415167776422584
 7 | 964caa41debda91fc4d3648d4347a655	k__Bacteria; p__Firmicutes; c__Bacilli; o__Lactobacillales; f__Streptococcaceae; g__Streptococcus; s__	0.9775350602210992
 8 | f703de3042bc9015b44ffd029644c602	k__Bacteria; p__Firmicutes; c__Bacilli; o__Lactobacillales; f__Streptococcaceae; g__Streptococcus; s__	0.7554837395491494
 9 | a5fa5815af312cb8d19f29f79cc77e5c	k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Pseudomonadales; f__Pseudomonadaceae; g__Pseudomonas; s__	0.7236711842229805
10 | edaebd8b81788273e1d7165dbdcfaa93	k__Bacteria; p__Proteobacteria; c__Betaproteobacteria; o__Burkholderiales; f__Comamonadaceae	0.9973695680171468
11 | f67861c93e8f58d17d6985c69c2e6a4d	k__Bacteria; p__Firmicutes; c__Bacilli	0.9999952681428291
12 | eb31a2131deb44b6c2a11e425727ddd0	k__Bacteria; p__Acidobacteria; c__Acidobacteria-6; o__iii1-15; f__; g__; s__	0.9959990338868077
13 | 0864d052af4c418d311141cf1ac53a96	k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Pseudomonadales; f__Moraxellaceae; g__Acinetobacter; s__johnsonii	0.8657344612862987
14 | 39b9533cab96a88ae7b5e3887d920e26	k__Bacteria; p__Acidobacteria; c__Holophagae; o__Holophagales; f__Holophagaceae; g__; s__	0.9399941865616229
15 | 21a399a771c50adaffd20dfd5105f8b1	k__Bacteria; p__Proteobacteria; c__Betaproteobacteria; o__Burkholderiales	0.9658553184635251
16 | c1a3db6172f8739a3b9ee1004e49f60e	k__Bacteria; p__Acidobacteria; c__Acidobacteria-6; o__iii1-15; f__; g__; s__	0.9954748370411025
17 | 7e079794fe92e5af11116de7406b9bfe	k__Bacteria; p__WS3; c__PRR-12; o__Sediment-1; f__; g__; s__	0.931959289502759
18 | fab76d07d660f36fde1db47059bfe35d	k__Bacteria; p__Proteobacteria; c__Deltaproteobacteria	0.9606315334858244
19 | f0d9bd5af03d4f2d530ac4f910aac6d7	k__Bacteria; p__Proteobacteria; c__Betaproteobacteria	0.9729873431627327
20 | 0752187a937f131ad5090035e3f1d98b	k__Bacteria; p__Proteobacteria; c__Betaproteobacteria; o__Burkholderiales; f__Comamonadaceae; g__Acidovorax	0.7537136997477545
21 | 1fa8ba592ce92c1495bd962df6c24ace	k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Pseudomonadales; f__Pseudomonadaceae; g__Pseudomonas	0.7260916696928624
22 | 7d1e10878271601a8a39a56fd47de29a	k__Bacteria; p__Firmicutes; c__Bacilli; o__Lactobacillales; f__Streptococcaceae; g__Streptococcus	0.9948629982970587
23 | c0bfa07f6fbe6d2394f4e5f99859d027	k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Pseudomonadales; f__Moraxellaceae; g__Acinetobacter; s__lwoffii	0.9108757063490508
24 | 14f86977a08ddf348f5cc79b13a1445a	k__Bacteria; p__Proteobacteria; c__Betaproteobacteria; o__Burkholderiales; f__Comamonadaceae; g__Limnohabitans; s__	0.7772991977335768
25 | 7395063f1b2426d9f899cfe1edc829be	k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Xanthomonadales; f__Xanthomonadaceae; g__Stenotrophomonas	0.9371458123074997
26 | 6c0218580520ccd4d10e547c63a95d9c	k__Bacteria; p__Proteobacteria; c__Deltaproteobacteria; o__NB1-j; f__MND4; g__; s__	0.9987261514366896
27 | 38265b37011008edd57e91762101f8bf	k__Bacteria; p__Proteobacteria; c__Betaproteobacteria; o__Methylophilales; f__Methylophilaceae	0.9272825528566369
28 | 5c008808ecf554b5384c74fa949b4afb	k__Bacteria; p__Acidobacteria; c__Holophagae; o__Holophagales; f__Holophagaceae; g__; s__	0.9617591565649898
29 | 


--------------------------------------------------------------------------------
/Datasets/drinking_water/README.md:
--------------------------------------------------------------------------------
 1 | # Drinking Water
 2 | 
 3 | A drinking water study from U. Adelaide that you can find on the [SRA with accession SRP059994](https://www.ncbi.nlm.nih.gov/Traces/study/?acc=SRP059994)
 4 | 
 5 | This is a 16S amplicon dataset.
 6 | 
 7 | Publication:
 8 | Shaw JLA, Monis P, Weyrich LS, Sawade E, Drikas M, Cooper AJ. 2015. Using Amplicon Sequencing To Characterize and Monitor Bacterial Diversity in Drinking Water Distribution Systems. [Appl Environ Microbiol 81:6463–6473](http://aem.asm.org/content/81/18/6463.long)
 9 | 
10 | 
11 | ## Abstract
12 | 
13 | Drinking water assessments use a variety of microbial, physical, and chemical indicators to evaluate water treatment efficiency and product water quality. However, these indicators do not allow the complex biological communities, which can adversely impact the performance of drinking water distribution systems (DWDSs), to be characterized. Entire bacterial communities can be studied quickly and inexpensively using targeted metagenomic amplicon sequencing. Here, amplicon sequencing of the 16S rRNA gene region was performed alongside traditional water quality measures to assess the health, quality, and efficiency of two distinct, full-scale DWDSs: (i) a linear DWDS supplied with unfiltered water subjected to basic disinfection before distribution and (ii) a complex, branching DWDS treated by a four-stage water treatment plant (WTP) prior to disinfection and distribution. In both DWDSs bacterial communities differed significantly after disinfection, demonstrating the effectiveness of both treatment regimes. However, bacterial repopulation occurred further along in the DWDSs, and some end-user samples were more similar to the source water than to the postdisinfection water. Three sample locations appeared to be nitrified, displaying elevated nitrate levels and decreased ammonia levels, and nitrifying bacterial species, such as Nitrospira, were detected. Burkholderiales were abundant in samples containing large amounts of monochloramine, indicating resistance to disinfection. Genera known to contain pathogenic and fecal-associated species were also identified in several locations. From this study, we conclude that metagenomic amplicon sequencing is an informative method to support current compliance-based methods and can be used to reveal bacterial community interactions with the chemical and physical properties of DWDSs
14 | 
15 | Project | Description
16 | --- | ---
17 | SRR2080423 | SA 3 tank
18 | SRR2080425 | 1.5kmPostDis
19 | SRR2080427 | SASourceWater2
20 | SRR2080434 | SA 2 CT
21 | SRR2080436 | WA1 outlet
22 | 
23 | 
24 | We extracted this data with
25 | 
26 | ```
27 | for SRR_ID in SRR2080423 SRR2080425 SRR2080427 SRR2080434 SRR2080436; do
28 | 	fastq-dump --outdir fastq --gzip --skip-technical  --readids --read-filter pass --dumpbase --split-3 --clip $SRR_ID
29 | done
30 | ```
31 | 
32 | For some reason this data seems mixed up, as there are sequences with each tag in each file. 
33 | 
34 | We resplit the data with these tags.
35 | 
36 | The original counts per tag are:
37 | 
38 | Counts | Tag
39 | --- | ---
40 | 303,446 | TCGCAGG
41 | 276,080 | GCTCGAA
42 | 312,009 | CTCGATG
43 | 293,741 | ACCAACT
44 | 269,702 | GGATCAA
45 | 
46 | ## Sequences
47 | 
48 | * [barcodes](fastq/barcodes.fastq.gz)
49 | * [sequences](fastq/sequences.fastq.gz)
50 | 
51 | ## Metadata
52 | 
53 | * [Tab separated format](metadata.tsv)
54 | 
55 | ## Calypso data
56 | 
57 | We have this data available for download for use in Calypso
58 | 
59 | * [biom feature table](Calypso/feature-table.biom)
60 | * [metadata](Calypso/metadata_calypso.tsv)
61 | * [taxonomy](Calypso/taxonomy.tsv)
62 | 
63 | 
64 | 
65 | 
66 | 
67 | 
68 | 
69 | 


--------------------------------------------------------------------------------
/Datasets/drinking_water/fastq/barcodes.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:6a45837c9e3eb6aefada27f3e913475ffaf678dfcd71a8fe56cf80e82a24197c
3 | size 1264600
4 | 


--------------------------------------------------------------------------------
/Datasets/drinking_water/fastq/sequences.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:cea80963b69f71522ba82d68c4f447e86d30a213d35490ab154b2958c6d542bb
3 | size 10186289
4 | 


--------------------------------------------------------------------------------
/Datasets/drinking_water/metadata.tsv:
--------------------------------------------------------------------------------
1 | Sample-id	BioSample	Sample name	Library name	MBases	MBytes	AvgSpotLen	Distribution sytem or WTP	Experiment	Publication sample name	collection date	library	Tag
2 | SRR2080423	SAMN03776101	SA 3 tank	SA1_library_R_2012_12_05_20_11_34_user_SN2-26_Auto_user_SN2-26_30.fastq	80	66	143	Distribution system	SRX1075090	SA 3 tank	02-May-2012	SA1	CTCGATG
3 | SRR2080425	SAMN03776103	1.5kmPostDis	SA1_library_R_2012_12_05_20_11_34_user_SN2-26_Auto_user_SN2-26_30.fastq	80	66	143	Distribution system	SRX1075092	1.5kmPostDis	03-May-2012	SA1	ACCAACT
4 | SRR2080427	SAMN03776105	SASourceWater2	SA2_library_R_2012_12_11_11_14_16_user_js2-SN2-29_Auto_user_SN2-29_33.fastq	68	56	139	Source water	SRX1075094	SASourceWater2	03-May-2012	SA2	TCGCAGG
5 | SRR2080434	SAMN03776111	SA 2 CT	SA2_library_R_2012_12_11_11_14_16_user_js2-SN2-29_Auto_user_SN2-29_33.fastq	68	56	139	Distribution system	SRX1075101	SA 2 CT	02-May-2012	SA2	GCTCGAA
6 | SRR2080436	SAMN03776091	WA1 outlet	WA_library_R_2013_03_26_15_14_25_user_SN1-86.fastq	59	48	140	Distribution system	SRX1075103	WA1 outlet	03-May-2012	WA	GGATCAA
7 | 


--------------------------------------------------------------------------------
/Datasets/ground_water/README.md:
--------------------------------------------------------------------------------
 1 | # Ground Water
 2 | 
 3 | This data is from [SRA project SRP075429](https://www.ncbi.nlm.nih.gov/sra/?term=SRP075429)
 4 | 
 5 | This data comes from: Hernsdorf AW, Amano Y, Miyakawa K, Ise K, Suzuki Y, Anantharaman K, Probst A, Burstein D, Thomas BC, Banfield JF. 2017. Potential for microbial H2 and metal transformations associated with novel bacteria and archaea in deep terrestrial subsurface sediments. [ISME J 11:1915–1929](https://www.nature.com/articles/ismej201739)
 6 | 
 7 | ## Title
 8 | 
 9 | Potential for microbial H2 and metal transformations associated with novel bacteria and archaea in deep terrestrial subsurface sediments
10 | 
11 | ## Abstract
12 | 
13 | Geological sequestration in deep underground repositories is the prevailing proposed route for radioactive waste disposal. After the disposal of radioactive waste in the subsurface, H2 may be produced by corrosion of steel and, ultimately, radionuclides will be exposed to the surrounding environment. To evaluate the potential for microbial activities to impact disposal systems, we explored the microbial community structure and metabolic functions of a sediment-hosted ecosystem at the Horonobe Underground Research Laboratory, Hokkaido, Japan. Overall, we found that the ecosystem hosted organisms from diverse lineages, including many from the phyla that lack isolated representatives. The majority of organisms can metabolize H2, often via oxidative [NiFe] hydrogenases or electron-bifurcating [FeFe] hydrogenases that enable ferredoxin-based pathways, including the ion motive Rnf complex. Many organisms implicated in H2 metabolism are also predicted to catalyze carbon, nitrogen, iron and sulfur transformations. Notably, iron-based metabolism is predicted in a novel lineage of Actinobacteria and in a putative methane-oxidizing ANME-2d archaeon. We infer an ecological model that links microorganisms to sediment-derived resources and predict potential impacts of microbial activity on H2 consumption and retardation of radionuclide migration
14 | 
15 | This is a random community data set
16 | 
17 | ## Design
18 | 
19 | DNA was extracted from the biomass retained on 0.2 um filters using the Extrap Soil DNA Kit Plus ver. 2 (Nippon Steel & Sumikin Eco-Tech Corporation) and sent for 150 bp paired-end sequencing with a 550 bp insert size by Hokkaido System Science Co., Ltd. using an Illumina HiSeq2000
20 | 
21 | ## Runs
22 | 
23 | SRR3546457 SRR3546455 SRR3546454 SRR3546453 SRR3546452 SRR3546451 SRR3546450 SRR3546449
24 | 
25 | 
26 | We downloaded from SRA like this
27 | 
28 | ```
29 | for SRR_ID in SRR3546457 SRR3546455 SRR3546454 SRR3546453 SRR3546452 SRR3546451 SRR3546450 SRR3546449; do
30 | 	fastq-dump --outdir fastq --gzip --skip-technical  --readids --read-filter pass --dumpbase --split-3 --clip -N 5000 -X 255000 $SRR_ID
31 | done
32 | ```
33 | 
34 | ## Data
35 | 
36 | * [SRR3546449 pass 1](fastq/SRR3546449_pass_1.fastq.gz)
37 | * [SRR3546449 pass 2](fastq/SRR3546449_pass_2.fastq.gz)
38 | 
39 | * [SRR3546450 pass 1](fastq/SRR3546450_pass_1.fastq.gz)
40 | * [SRR3546450 pass 2](fastq/SRR3546450_pass_2.fastq.gz)
41 | 
42 | * [SRR3546451 pass 1](fastq/SRR3546451_pass_1.fastq.gz)
43 | * [SRR3546451 pass 2](fastq/SRR3546451_pass_2.fastq.gz)
44 | 
45 | * [SRR3546452 pass 1](fastq/SRR3546452_pass_1.fastq.gz)
46 | * [SRR3546452 pass 2](fastq/SRR3546452_pass_2.fastq.gz)
47 | 
48 | * [SRR3546453 pass 1](fastq/SRR3546453_pass_1.fastq.gz)
49 | * [SRR3546453 pass 2](fastq/SRR3546453_pass_2.fastq.gz)
50 | 
51 | * [SRR3546454 pass 1](fastq/SRR3546454_pass_1.fastq.gz)
52 | * [SRR3546454 pass 2](fastq/SRR3546454_pass_2.fastq.gz)
53 | 
54 | * [SRR3546455 pass 1](fastq/SRR3546455_pass_1.fastq.gz)
55 | * [SRR3546455 pass 2](fastq/SRR3546455_pass_2.fastq.gz)
56 | 
57 | * [SRR3546457 pass 1](fastq/SRR3546457_pass_1.fastq.gz)
58 | * [SRR3546457 pass 2](fastq/SRR3546457_pass_2.fastq.gz)
59 | 


--------------------------------------------------------------------------------
/Datasets/ground_water/fastq/SRR3546449_pass_1.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:57a5999e5038c4832396f72e83df8567656a1b719893b0b45047341dfe86b778
3 | size 868540
4 | 


--------------------------------------------------------------------------------
/Datasets/ground_water/fastq/SRR3546449_pass_2.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:9b77cd99aa9ab595ae8fb38aedcc2f8474a81fc24da820553030b28a647908ff
3 | size 797493
4 | 


--------------------------------------------------------------------------------
/Datasets/ground_water/fastq/SRR3546450_pass_1.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:c92a4bb75490b0f2a0a9599d6503209e0cf8991e8d7827eaf9194db21b8627d3
3 | size 806242
4 | 


--------------------------------------------------------------------------------
/Datasets/ground_water/fastq/SRR3546450_pass_2.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:ce5ba16c7a39a92d0d01fa3a6708a102795d7608ba2218ab03feb9ac909faab4
3 | size 871595
4 | 


--------------------------------------------------------------------------------
/Datasets/ground_water/fastq/SRR3546451_pass_1.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:64753dfa8752fda9daadc207052a0c45678d6292ab4a6a3c540680b4312aee3b
3 | size 820185
4 | 


--------------------------------------------------------------------------------
/Datasets/ground_water/fastq/SRR3546451_pass_2.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:d4df8a50da2788b82fa13ccd9c6e2bf2fcbf2c12772a96c8f4c3368212cf8348
3 | size 889595
4 | 


--------------------------------------------------------------------------------
/Datasets/ground_water/fastq/SRR3546452_pass_1.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:2422b29f627ce3341249dc14c5b9f1191b12ab59e913fa6a47cef02590b62f25
3 | size 792693
4 | 


--------------------------------------------------------------------------------
/Datasets/ground_water/fastq/SRR3546452_pass_2.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:2943af956caa052ca5d8d47a507d29bd0bfdb3f0568a303bf7388238fb3256f4
3 | size 871576
4 | 


--------------------------------------------------------------------------------
/Datasets/ground_water/fastq/SRR3546453_pass_1.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:98caf978b399e2b8696e2b8ba55f89cfd0adb9191942548b38cdea4e355d887a
3 | size 720110
4 | 


--------------------------------------------------------------------------------
/Datasets/ground_water/fastq/SRR3546453_pass_2.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:81260f94df8be1c37c89547fde6092ef60756e0005e76f2fb587ade149bd5a36
3 | size 836492
4 | 


--------------------------------------------------------------------------------
/Datasets/ground_water/fastq/SRR3546454_pass_1.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:dcf36caf4a2fbe07d60f374c4d15bc2d31120e054fddb6d779dc9eda78b12bfa
3 | size 817803
4 | 


--------------------------------------------------------------------------------
/Datasets/ground_water/fastq/SRR3546454_pass_2.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:92876d6ed89d746871d9b301c65d68b56f3ccfb608aeef8aeb6a1fa08990b92b
3 | size 728611
4 | 


--------------------------------------------------------------------------------
/Datasets/ground_water/fastq/SRR3546455_pass_1.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:e7c9a5aaf90ca2e8739ae24e5f4d7efbd3e7ea5a2622b095090f038cae53df37
3 | size 725141
4 | 


--------------------------------------------------------------------------------
/Datasets/ground_water/fastq/SRR3546455_pass_2.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:b3aecc742bbd5de18c6ce0295c481ece6d051989378bef17a01b9764239583f2
3 | size 842751
4 | 


--------------------------------------------------------------------------------
/Datasets/ground_water/fastq/SRR3546457_pass_1.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:f58c9c9da03e4a27c1696a485b124648768f19e0f6781293d7bfbcc5bb86411e
3 | size 792688
4 | 


--------------------------------------------------------------------------------
/Datasets/ground_water/fastq/SRR3546457_pass_2.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:ec9120136cdb25350b948a5561df0b81128e642e361717379052cc633436cbdd
3 | size 708556
4 | 


--------------------------------------------------------------------------------
/Datasets/gut/README.md:
--------------------------------------------------------------------------------
 1 | # Gut data
 2 | 
 3 | This data is from [SRA project SRP074153](https://trace.ncbi.nlm.nih.gov/Traces/sra/?study=SRP074153)
 4 | 
 5 | This is a random community data set
 6 | 
 7 | This data comes from
 8 | 
 9 | Brooks B, Olm MR, Firek BA, Baker R, Thomas BC, Morowitz MJ, Banfield JF. 2017. Strain-resolved analysis of hospital rooms and infants reveals overlap between the human and room microbiome. [Nat Commun 8:1814](https://www.nature.com/articles/s41467-017-02018-w)
10 | 
11 | ## Title
12 | 
13 | Metagenomes from 11 human infant fecal samples hospitalized in the same intensive care unit
14 | 
15 | ## Abstract
16 | 
17 | Bacteria that persist in hospitals can contribute to the establishment of the microbiome in newborns and the spread of hospital-acquired diseases. Yet we know little about microbial communities in hospitals, or about the extent to which persistent vs. recently immigrated bacterial strains establish in the gastrointestinal tracts of hospitalized individuals. In combination with BioProject PRJNA273761 (10 infants / 55 samples) we analyzed strain-resolved genomes obtained from a total of 202 samples collected over a three-year period from 21 infants hospitalized in the same intensive care unit. Strains were rarely shared, consistent with prior analysis of a subset of these data. Enterococcus faecalis and Staphylococcus epidermidis, common gut colonists, exhibit diversity comparable to that of NCBI reference strains, suggesting no recent common ancestor for all populations in this hospital setting. Thus, we infer multiple introduction events for these species. Despite the rarity of shared strains, strains of five species exhibiting a degree of sequence variation consistent with in situ diversification were identified in different infants hospitalized three years apart. Three were also detected in multiple infants in the same year, suggesting that these strains are unusually widely dispersed and persistent in the hospital environment. Persistent strains were not significantly different from non-persistent strains with regards to pathogenicity potential including antibiotic resistance. Notably, non-identical siblings had multiple abundant strains in common, even 30 days after birth and antibiotic administration, suggesting overlapping strain sources and/or genetic selection. Our approach can be used in order to study microbial dynamics in hospitals and provides an important step towards directing health-promoting colonization in hospitalized individuals.
18 | 
19 | ## Design
20 | 
21 | DNA was extracted using the MO BIO PowerSoil DNA Isolation kit; libraries were made using Illumina's Nextera kit with average insert sizes of 500/900 bp
22 | 
23 | ## Runs
24 | 
25 | SRR3466404 SRR3506419 SRR3506420 SRR3546776 SRR3546778 SRR3546779 SRR3546780 SRR3546781 SRR3546782
26 | 
27 | 
28 | 
29 | We downloaded the data from SRA like this:
30 | 
31 | ```
32 | for SRR_ID in SRR3466404 SRR3506419 SRR3506420 SRR3546776 SRR3546778 SRR3546779 SRR3546780 SRR3546781 SRR3546782; do
33 | 	fastq-dump --outdir fastq --gzip --skip-technical  --readids --read-filter pass --dumpbase --split-3 --clip -N 5000 -X 255000 $SRR_ID
34 | done
35 | ```
36 | 
37 | ## Data
38 | 
39 | * [SRR3466404 pass 1](fastq/SRR3466404_pass_1.fastq.gz)
40 | * [SRR3466404 pass 2](fastq/SRR3466404_pass_2.fastq.gz)
41 | 
42 | * [SRR3506419 pass 1](fastq/SRR3506419_pass_1.fastq.gz)
43 | * [SRR3506419 pass 2](fastq/SRR3506419_pass_2.fastq.gz)
44 | 
45 | * [SRR3506420 pass 1](fastq/SRR3506420_pass_1.fastq.gz)
46 | * [SRR3506420 pass 2](fastq/SRR3506420_pass_2.fastq.gz)
47 | 
48 | * [SRR3546776 pass 1](fastq/SRR3546776_pass_1.fastq.gz)
49 | * [SRR3546776 pass 2](fastq/SRR3546776_pass_2.fastq.gz)
50 | 
51 | * [SRR3546778 pass 1](fastq/SRR3546778_pass_1.fastq.gz)
52 | * [SRR3546778 pass 2](fastq/SRR3546778_pass_2.fastq.gz)
53 | 
54 | * [SRR3546779 pass 1](fastq/SRR3546779_pass_1.fastq.gz)
55 | * [SRR3546779 pass 2](fastq/SRR3546779_pass_2.fastq.gz)
56 | 
57 | * [SRR3546780 pass 1](fastq/SRR3546780_pass_1.fastq.gz)
58 | * [SRR3546780 pass 2](fastq/SRR3546780_pass_2.fastq.gz)
59 | 
60 | * [SRR3546781 pass 1](fastq/SRR3546781_pass_1.fastq.gz)
61 | * [SRR3546781 pass 2](fastq/SRR3546781_pass_2.fastq.gz)
62 | 
63 | * [SRR3546782 pass 1](fastq/SRR3546782_pass_1.fastq.gz)
64 | * [SRR3546782 pass 2](fastq/SRR3546782_pass_2.fastq.gz)
65 | 
66 | * [SRR3466404 pass 1](fastq/SRR3466404_pass_1.fastq.gz)
67 | * [SRR3466404 pass 2](fastq/SRR3466404_pass_2.fastq.gz)
68 | 
69 | * [SRR3506419 pass 1](fastq/SRR3506419_pass_1.fastq.gz)
70 | * [SRR3506419 pass 2](fastq/SRR3506419_pass_2.fastq.gz)
71 | 
72 | * [SRR3506420 pass 1](fastq/SRR3506420_pass_1.fastq.gz)
73 | * [SRR3506420 pass 2](fastq/SRR3506420_pass_2.fastq.gz)
74 | 
75 | * [SRR3546776 pass 1](fastq/SRR3546776_pass_1.fastq.gz)
76 | * [SRR3546776 pass 2](fastq/SRR3546776_pass_2.fastq.gz)
77 | 
78 | * [SRR3546778 pass 1](fastq/SRR3546778_pass_1.fastq.gz)
79 | * [SRR3546778 pass 2](fastq/SRR3546778_pass_2.fastq.gz)
80 | 
81 | * [SRR3546779 pass 1](fastq/SRR3546779_pass_1.fastq.gz)
82 | * [SRR3546779 pass 2](fastq/SRR3546779_pass_2.fastq.gz)
83 | 
84 | * [SRR3546780 pass 1](fastq/SRR3546780_pass_1.fastq.gz)
85 | * [SRR3546780 pass 2](fastq/SRR3546780_pass_2.fastq.gz)
86 | 
87 | * [SRR3546781 pass 1](fastq/SRR3546781_pass_1.fastq.gz)
88 | * [SRR3546781 pass 2](fastq/SRR3546781_pass_2.fastq.gz)
89 | 
90 | * [SRR3546782 pass 1](fastq/SRR3546782_pass_1.fastq.gz)
91 | * [SRR3546782 pass 2](fastq/SRR3546782_pass_2.fastq.gz)
92 | 


--------------------------------------------------------------------------------
/Datasets/gut/fastq/SRR3466404_pass_1.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:ba04f28d9273ee610eeff00f44d3218712b09ea29b87a29fc627d8f64cd43ce4
3 | size 21778818
4 | 


--------------------------------------------------------------------------------
/Datasets/gut/fastq/SRR3466404_pass_2.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:1f8bd80f3873133ed601213a97cd909480fc9c82150a2b98a3be919019b7b23a
3 | size 22031107
4 | 


--------------------------------------------------------------------------------
/Datasets/gut/fastq/SRR3506419_pass_1.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:5de9ae563869202ad350cf7d0d2546c96626c2dd70f974b046f9997e88a8bb7d
3 | size 22112514
4 | 


--------------------------------------------------------------------------------
/Datasets/gut/fastq/SRR3506419_pass_2.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:ef5fcad343fb59e98c85d6082e87150dd6e439f10e2973b72c1241f44de75a37
3 | size 23048083
4 | 


--------------------------------------------------------------------------------
/Datasets/gut/fastq/SRR3506420_pass_1.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:d72eea05c1f7b7a723c4d5893c1bdb933fe90efdee83f5a3511024d7978c35a4
3 | size 21869920
4 | 


--------------------------------------------------------------------------------
/Datasets/gut/fastq/SRR3506420_pass_2.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:8bd9642391c35452d705ebb9a6e95c6537d3714373e1fb2216aebbd1358e53e2
3 | size 22000618
4 | 


--------------------------------------------------------------------------------
/Datasets/gut/fastq/SRR3546776_pass_1.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:41253eca06118dff562691b9b740744144e25ee2ea4794006865238d19d025a1
3 | size 20694955
4 | 


--------------------------------------------------------------------------------
/Datasets/gut/fastq/SRR3546776_pass_2.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:ba41ab45f10695cca31acb4bbb3a8b2d95d63c03e50fda02b2ec9da9a89be0bb
3 | size 21277584
4 | 


--------------------------------------------------------------------------------
/Datasets/gut/fastq/SRR3546778_pass_1.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:df6dc216b5a7030304877bdd52185a537a4311a0433be079cb1ff30825dce078
3 | size 22250401
4 | 


--------------------------------------------------------------------------------
/Datasets/gut/fastq/SRR3546778_pass_2.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:fdc6f16dd521aa6dc5b12e932c6ea80a9deb507c722768875b10bc078becbf0d
3 | size 22865307
4 | 


--------------------------------------------------------------------------------
/Datasets/gut/fastq/SRR3546779_pass_1.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:5fa5545f4d5c1990b6b3fb60f8e8ad90be0e42899cc066fd1e89eb5c39981500
3 | size 21966458
4 | 


--------------------------------------------------------------------------------
/Datasets/gut/fastq/SRR3546779_pass_2.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:880fc5eb76ce47517202a4acd785b7f71c8db50b4c13fb347f8274b204cd9937
3 | size 21626626
4 | 


--------------------------------------------------------------------------------
/Datasets/gut/fastq/SRR3546780_pass_1.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:89e5a60c6df28ddf2a786506490ad3720722ee52ca9d87da8f87fb348714f5a4
3 | size 22365294
4 | 


--------------------------------------------------------------------------------
/Datasets/gut/fastq/SRR3546780_pass_2.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:d972b3d4ccfe27735059ea3250d429e44dfe31ebb169687431c3ba60cbf76d8c
3 | size 22924288
4 | 


--------------------------------------------------------------------------------
/Datasets/gut/fastq/SRR3546781_pass_1.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:c56f1681f4ecce9c08ee908ea5bbad56efd777c10d34d862021915dbb264eab3
3 | size 21043941
4 | 


--------------------------------------------------------------------------------
/Datasets/gut/fastq/SRR3546781_pass_2.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:71907a0b5721745d4b9532a2c788c7565dbc260e3107c759dc1f2424588dc363
3 | size 22191650
4 | 


--------------------------------------------------------------------------------
/Datasets/gut/fastq/SRR3546782_pass_1.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:53c337146e94ebe53e807e65bac27f0ac9374ce67cec445dd01b48cdcca52065
3 | size 21915635
4 | 


--------------------------------------------------------------------------------
/Datasets/gut/fastq/SRR3546782_pass_2.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:a2083af0f4fafad4b93092a0024d7514e2218b56f7f46d058e37fa404e84743c
3 | size 22613658
4 | 


--------------------------------------------------------------------------------
/Deconseq/README.md:
--------------------------------------------------------------------------------
  1 | # Removing host genome contamination
  2 | 
  3 | We have written several tools to remove host genomes like [Deconseq](https://deconseq.sourceforge.net/) and written [blog posts](https://edwards.flinders.edu.au/command-line-deconseq/) to describe how to remove host genomes. It's critical that you do this before you analyse your data, because otherwise you will get spurious results when you do your other analyses.
  4 | 
  5 | # Step 1. Find your genome
  6 | 
  7 | You need to find a host genome that you want to remove. If you are using a non-model organism, take a look at the [NCBI Genomes](https://www.ncbi.nlm.nih.gov/genome) collection that has a lot of curated genomes. 
  8 | 
  9 | If you are using the human genome, the NCBI has some specific resources:
 10 | 
 11 | There are several [human genome versions](https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.40_GRCh38.p14/GRCh38_major_release_seqs_for_alignment_pipelines/) specifically designed for inclusion in pipelines. 
 12 | 
 13 | ```
 14 | A. GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz
 15 | 
 16 | A gzipped file that contains FASTA format sequences for the following:
 17 | 1. chromosomes from the GRCh38 Primary Assembly unit.
 18 |    Note: the two PAR regions on chrY have been hard-masked with Ns.
 19 |    The chromosome Y sequence provided therefore has the same
 20 |    coordinates as the GenBank sequence but it is not identical to the
 21 |    GenBank sequence. Similarly, duplicate copies of centromeric arrays
 22 |    and WGS on chromosomes 5, 14, 19, 21 & 22 have been hard-masked
 23 |    with Ns (locations of the unmasked copies are given below).
 24 | 2. mitochondrial genome from the GRCh38 non-nuclear assembly unit.
 25 | 3. unlocalized scaffolds from the GRCh38 Primary Assembly unit.
 26 | 4. unplaced scaffolds from the GRCh38 Primary Assembly unit.
 27 | 5. Epstein-Barr virus (EBV) sequence
 28 |    Note: The EBV sequence is not part of the genome assembly but is
 29 |    included in the analysis set as a sink for alignment of reads that
 30 |    are often present in sequencing samples.
 31 | 
 32 | B. GCA_000001405.15_GRCh38_full_analysis_set.fna.gz
 33 | 
 34 | A gzipped file that contains all the same FASTA formatted sequences as
 35 | GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz, plus:
 36 | 
 37 | 6. alt-scaffolds from the GRCh38 ALT_REF_LOCI_* assembly units.
 38 | 
 39 | C. GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set.fna.gz
 40 | 
 41 | A gzipped file that contains all the same FASTA formatted sequences as
 42 | GCA_000001405.15_GRCh38_full_analysis_set.fna.gz, plus:
 43 | 
 44 | 7.  human decoy sequences from hs38d1 (GCA_000786075.2)
 45 | 
 46 | D. GCA_000001405.15_GRCh38_no_alt_plus_hs38d1_analysis_set.fna.gz
 47 | 
 48 | A gzipped file that contains all the same FASTA formatted sequences as
 49 | GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz, plus:
 50 | 
 51 | 7.  human decoy sequences from hs38d1 (GCA_000786075.2)
 52 | ```
 53 | 
 54 | We usually use [GCA_000001405.15_GRCh38_no_alt_plus_hs38d1_analysis_set.fna.gz](https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.40_GRCh38.p14/GRCh38_major_release_seqs_for_alignment_pipelines/GCA_000001405.15_GRCh38_no_alt_plus_hs38d1_analysis_set.fna.gz) which contains everything except the alt-scaffolds, but you do need to remember that it contains the Epstein-Barr virus (EBV) sequence and if you are interested in that virus its reads may get removed.
 55 | 
 56 | If you want to download it, you can use `wget` to get the file:
 57 | 
 58 | ```
 59 | wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.40_GRCh38.p14/GRCh38_major_release_seqs_for_alignment_pipelines/GCA_000001405.15_GRCh38_no_alt_plus_hs38d1_analysis_set.fna.gz
 60 | ```
 61 | 
 62 | 
 63 | ## Use minimap2 and samtools to filter the host sequences
 64 | 
 65 | You can use any [HTS mapper](https://academic.oup.com/bioinformatics/article/28/24/3169/245777), like [bowtie2](https://bowtie-bio.sourceforge.net/bowtie2/index.shtml) or [BWA](https://github.com/lh3/bwa), but nowadays we prefer [minimap2](https://github.com/lh3/minimap2) because it is a lot faster.
 66 | 
 67 | In this example, we're using the human genome from above. But if you have downloaded a different genome, just replace your fasta file. Note that another advantage of `minimap2` is that it handles `gzip` compressed files natively, you don't need to uncompress them!
 68 | 
 69 | ```
 70 | minimap2 --split-prefix=tmp$$ -a -xsr GCA_000001405.15_GRCh38_no_alt_plus_hs38d1_analysis_set.fna.gz R1.fastq.gz R2.fastq.gz | samtools view -bh | samtools sort -o output.bam
 71 | samtools index output.bam
 72 | ```
 73 | 
 74 | ## Filter mapped reads
 75 | 
 76 | Now, we use `samtools` flags to filter out the host and non host sequences. The reads that *ARE* mapped are host, while the reads that ARE *NOT* mapped are non-host.
 77 | 
 78 | We use samtools to get fastq output from the `.bam` format files.
 79 | 
 80 | You can find out what the flags mean using the [samtools flag explainer](https://broadinstitute.github.io/picard/explain-flags.html)
 81 | 
 82 | Here is the [samtools specification](https://samtools.github.io/hts-specs/SAMv1.pdf), and the description of the columns is on page 6.
 83 | 
 84 | ### host sequences
 85 | 
 86 | ```
 87 | mkdir host not_host
 88 | samtools fastq -F 3588 -f 65 output.bam | gzip -c > host/output_S_R1.fastq.gz
 89 | echo "R2 matching host genome:"
 90 | samtools fastq -F 3588 -f 129 output.bam | gzip -c > host/output_S_R2.fastq.gz
 91 | ```
 92 | 
 93 | ### sequences that are not host
 94 | 
 95 | ```
 96 | samtools fastq -F 3584 -f 77 output.bam  | gzip -c > not_host/output_S_R1.fastq.gz
 97 | samtools fastq -F 3584 -f 141 output.bam | gzip -c > not_host/output_S_R2.fastq.gz
 98 | samtools fastq -f 4 -F 1 output.bam | gzip -c > not_host/output_S_Singletons.fastq.gz
 99 | ```
100 | 
101 | 
102 | 
103 | 
104 | 


--------------------------------------------------------------------------------
/Definitions.md:
--------------------------------------------------------------------------------
 1 | # Definitions
 2 | 
 3 | In the [genome assembly and annotation](README.md) description, we introduce several new terms. This is a small definition table in case you are confused!
 4 | 
 5 | Term |  Definition
 6 | --- |  ---
 7 | Open Reading Frame (ORF) | A stretch of DNA that can be translated into amino acids with no stop codon
 8 | Coding Sequence (CDS) | An ORF that could encode a protein
 9 | Protein encoding gene (PEG) | An ORF that could encode a protein
10 | Hypothetical protein | Something that has not been experimentally shown
11 | putative protein | Something that has not been experimentally shown
12 | Polypeptide | A short stretch of amino acids (typically about 20 amino acids or less)
13 | Contig | A contiguous piece of DNA sequence that has been assembled from more than one read. It is compiled because, as noted above, the 5' end of one sequence overlaps the 3' end of another.
14 | Read | The unit of DNA sequence that comes from a sequencing instrument. A single piece of DNA sequence.
15 | 
16 | 


--------------------------------------------------------------------------------
/FOCUS/FOCUS.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/FOCUS/FOCUS.pdf


--------------------------------------------------------------------------------
/FOCUS/README.md:
--------------------------------------------------------------------------------
 1 | # Annotating the taxonomy of the bins with FOCUS
 2 | 
 3 | [Find Organisms by Compositional Usage (FOCUS)](https://edwards.sdsu.edu/FOCUS) ([Silva _et al._](https://www.ncbi.nlm.nih.gov/pubmed/24949242)) is a method that identifies the organisms present in a metagenomic sample. You can use FOCUS with any random metagenome (not a 16S dataset, though), and it will rapidly generate a list of the organisms that are present. FOCUS does not try and identify the source of each read, instead, it takes a holistic point of view, and uses the complete *k*-mer profile to analyze the metagenome. 
 4 | 
 5 | To learn more about focus, you should [read the paper](https://peerj.com/articles/425/) (it's free!).
 6 | 
 7 | [<img src="images/focus.jpg" width="600 px" align="left" title="FOCUS workflow" />](https://dfzljdn9uc3pi.cloudfront.net/2014/425/1/fig-1-2x.jpg)
 8 | FOCUS starts by making a database of *k*-mers from a set of reference genomes. We use either reference genomes from [NCBI RefSeq](https://www.ncbi.nlm.nih.gov/refseq/) that has about 10,000 microbial genomes, or [PATRIC](https://www.patricbrc.org/) that has about 100,000 microbial genomes! It builds this database first so you don’t have to build the database each time you analyze data. Then, FOCUS reads your data and counts the *k*-mers present in your sample. It compares the counts that it generated from your sample to the counts from the reference database, and estimates the proportion of each organism in the database that, when combined, would give the proportion of *k*-mers in the sample. FOCUS generates that analysis in just a few seconds and allows us to rapidly annotate metagenomes.
 9 | 
10 | We have installed focus.py on the virtual machine (but you will need version 7 or higher), and you can just run it by using the command 
11 | 
12 | ```bash
13 | focus -h
14 | ```
15 | 
16 | It takes a few command line arguments, and you can ask FOCUS to analyze a single metagenome or a collection of metagenomes. We are going to analyze our bins as a collection of metagenomes. This allows us to import the output from FOCUS into a statistical analysis tool, STAMP, and do some downstream processing.
17 | 
18 | For example, to analyze a directory containing each of the bins, we use the command:
19 | 
20 | ```bash
21 | focus.py -q MetagenomeBins/ -o MetagenomeBinsAnnotations/
22 | ```
23 | 
24 | Alternatively, to analyze the [algae dataset](https://goo.gl/zvnZD4) you can download that data (using `curl`), extract the archive, unzip each of the `fastq` files, and then run
25 | 
26 | ```bash
27 | focus -q Algae -o Algae_Focus
28 | ```
29 | 
30 | You will get the following files, each of which provides the output at a different [taxonomic rank](https://en.wikipedia.org/wiki/Taxonomic_rank).
31 | 
32 | * output_All_levels.xls
33 | * output_Kingdom_tabular.xls
34 | * output_Phylum_tabular.xls
35 | * output_Class_tabular.xls
36 | * output_Order_tabular.xls
37 | * output_Family_tabular.xls
38 | * output_Genus_tabular.xls
39 | * output_Species_tabular.xls
40 | * output_Strain_tabular.xls
41 | 
42 | 
43 | 


--------------------------------------------------------------------------------
/FOCUS/images/focus.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/FOCUS/images/focus.jpg


--------------------------------------------------------------------------------
/GenomePeek/GenomePeek.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/GenomePeek/GenomePeek.pdf


--------------------------------------------------------------------------------
/GenomePeek/README.md:
--------------------------------------------------------------------------------
 1 | # Genome Peek
 2 | 
 3 | ## Annotating the taxonomy of bins, metagenomes, and complexity of genomes with [GenomePeek](https://edwards.sdsu.edu/GenomePeek/)
 4 | 
 5 | 
 6 | <img src="images/single.png" align="right" title="GenomePeek with a Single Organism" />
 7 | 
 8 | Another way to identify the taxonomy of the organisms in the database is to identify a few key house-keeping proteins, and match those proteins in your metagenome bins. We developed the [GenomePeek](https://edwards.sdsu.edu/GenomePeek) software to take sequences (e.g. contigs), and rapidly identify the presence of the 16S gene, _radA/recA_, _rpoB_, and _groEL_ ([McNair and Edwards](https://www.ncbi.nlm.nih.gov/pubmed/26157610)) in those sequences, and to identify the most likely species that are present in your sample.
 9 | 
10 |  If you have a pure genome, or a pure bin, you will see only a single organism present in the output. For example, the figure shows an analysis of a single genome sample that has a pure 16S gene and is a *Pseudomonas*. 
11 | 
12 | <img src="images/mixed.png" align="left" title="GenomePeek with a mixed population" /> 
13 | 
14 | The second figure shows a mixed sample where there are only two organisms. In this case, there are a *Photobacterium* and a *Vibrio*. These species are closely related and we often isolate one when we isolate the other.
15 | 
16 | Finally, if you upload a metagenome, you’ll see a mixture of all of the organisms in your sample (see the third figure). In this figure, not only have we extracted all the 16S genes from this dataset, we tell you the proportion of each genus in the sample. This is probably the quickest way, but not necessarily the most correct way, to get an overview of your metagenome, whether it is from a 16S dataset or a regular metagenome!
17 | 
18 | Genome Peek provides a mechanism for you to download the sequences that match to each of the genes that we have identified, so you can easily pull out those sequences and view the alignments.
19 | 
20 | [GenomePeek](https://edwards.sdsu.edu/GenomePeek) provides a rapid way to identify the organisms in a metagenome bin, and to look at the completeness and contamination of the bins. All you need to do is go to the [GenomePeek upload page](https://edwards.sdsu.edu/GenomePeek/) and upload your contigs.fasta file. GenomePeek will complete the analysis for you!
21 | 
22 | <img src="images/metagenome.png" title="GenomePeek with a metagenome" width="400px" />
23 | 


--------------------------------------------------------------------------------
/GenomePeek/images/metagenome.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/GenomePeek/images/metagenome.png


--------------------------------------------------------------------------------
/GenomePeek/images/mixed.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/GenomePeek/images/mixed.png


--------------------------------------------------------------------------------
/GenomePeek/images/single.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/GenomePeek/images/single.png


--------------------------------------------------------------------------------
/GenomeSequencingOverview/GenomeSequencingOverview.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/GenomeSequencingOverview/GenomeSequencingOverview.pdf


--------------------------------------------------------------------------------
/GenomeSequencingOverview/Whole_Genome_Sequencing.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/GenomeSequencingOverview/Whole_Genome_Sequencing.pdf


--------------------------------------------------------------------------------
/GenomeSequencingOverview/images/GenBankGrowth.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/GenomeSequencingOverview/images/GenBankGrowth.png


--------------------------------------------------------------------------------
/GenomeSequencingOverview/images/GenomesOnlineGrowth.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/GenomeSequencingOverview/images/GenomesOnlineGrowth.png


--------------------------------------------------------------------------------
/GenomeSequencingOverview/images/GenomicsAndModeling.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/GenomeSequencingOverview/images/GenomicsAndModeling.png


--------------------------------------------------------------------------------
/Kraken2/README.md:
--------------------------------------------------------------------------------
 1 | # Kraken2
 2 | 
 3 | Kraken2 uses _k_-mers to identify the taxonomy of the microbes in your sample. In essence, they have taken all complete genomes, and then identified all _k_-mers that are unique to each taxonomic level. Through some nifty computing, and special [data structures](https://www.youtube.com/watch?v=zgCnMvvw6Oo&list=PLpPXw4zFa0uKKhaSz87IowJnOTzh9tiBk), they have figured out how to search this very efficiently.
 4 | 
 5 | There are a wide range of pre-built [kraken databases](https://benlangmead.github.io/aws-indexes/k2) that you can download, so you do not need to go to the effort of building them yourself.
 6 | 
 7 | 
 8 | When [installing Kraken2](https://github.com/DerrickWood/kraken2/wiki/Manual#installation), I recommend setting the `KRAKEN2_DB_PATH` and `KRAKEN2_DEFAULT_DB` variables, and then you do not need to specify them on the command line.
 9 | 
10 | 
11 | To run Kraken2, use this incantation:
12 | 
13 | ```bash
14 | kraken2 --paired --threads 4 --report kraken_taxonomy.txt --output kraken_output.txt \
15 | 	fastq/reads_1.fastq fastq/reads_2.fastq
16 | ```
17 | 
18 | This will output two files:
19 | 
20 | * `kraken_output.txt` contains the [standard kraken output](https://github.com/DerrickWood/kraken2/wiki/Manual#output-formats):
21 |     - A code (_C_ or _U_) indicating whether the read was classified or not
22 |     - The read ID from the fastq file
23 |     - The taxonomy ID assigned to the read if it is classified, or 0 if it is not classified
24 |     - The length of the sequence in base pairs. Because we are using paired end reads, there are two lengths (R1\|R2)
25 |     - A space-separated list of the lowest common ancestor for each sequence that indicates how many kmers map to which taxonomic IDs. Because we have paired end information, there is a `|:|` separator between the R1 and R2 information
26 | * `kraken_taxonomy.txt` contains the [standard kraken report](https://github.com/DerrickWood/kraken2/wiki/Manual#sample-report-output-format):
27 |     - Percent of fragments at that taxonomic level
28 |     - Number of fragments at that taxonomic level (the sum of fragments at this level and all those below this level)
29 |     - Number of fragments exactly at that taxonomic level
30 |     - A taxonomic level code:  `U`nclassified, `R`oot, `D`omain, `K`ingdom, `P`hylum, `C`lass, `O`rder, `F`amily, `G`enus, or `S`pecies. If the taxonomy is not one of these the number indicates the levels between this node and the appropriate node. See [the docs](https://github.com/DerrickWood/kraken2/wiki/Manual#sample-report-output-format) for more information.
31 |     - NCBI taxonomic ID number
32 |     - Scientific name
33 | 
34 | 
35 | For more information about Kraken2, [see the wiki page](https://github.com/DerrickWood/kraken2/wiki/Manual)
36 | 
37 | 
38 | If you are using the HPC at Flinders University, the details on [this page](https://fame.flinders.edu.au) will show you how to install and use Kraken2
39 | 
40 | 
41 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018 Rob Edwards
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/Linux/Linux.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Linux/Linux.pdf


--------------------------------------------------------------------------------
/Linux/MachineSetUp.md:
--------------------------------------------------------------------------------
1 | # How to set up a CentOS server for this course
2 | 
3 | 
4 | 


--------------------------------------------------------------------------------
/Linux/MachineSetUp.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Linux/MachineSetUp.pdf


--------------------------------------------------------------------------------
/Linux/images/image1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Linux/images/image1.png


--------------------------------------------------------------------------------
/Linux/images/image10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Linux/images/image10.png


--------------------------------------------------------------------------------
/Linux/images/image11.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Linux/images/image11.png


--------------------------------------------------------------------------------
/Linux/images/image12.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Linux/images/image12.png


--------------------------------------------------------------------------------
/Linux/images/image13.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Linux/images/image13.png


--------------------------------------------------------------------------------
/Linux/images/image14.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Linux/images/image14.png


--------------------------------------------------------------------------------
/Linux/images/image15.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Linux/images/image15.png


--------------------------------------------------------------------------------
/Linux/images/image16.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Linux/images/image16.png


--------------------------------------------------------------------------------
/Linux/images/image17.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Linux/images/image17.png


--------------------------------------------------------------------------------
/Linux/images/image18.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Linux/images/image18.png


--------------------------------------------------------------------------------
/Linux/images/image19.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Linux/images/image19.jpg


--------------------------------------------------------------------------------
/Linux/images/image2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Linux/images/image2.png


--------------------------------------------------------------------------------
/Linux/images/image20.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Linux/images/image20.png


--------------------------------------------------------------------------------
/Linux/images/image21.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Linux/images/image21.png


--------------------------------------------------------------------------------
/Linux/images/image3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Linux/images/image3.png


--------------------------------------------------------------------------------
/Linux/images/image4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Linux/images/image4.png


--------------------------------------------------------------------------------
/Linux/images/image5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Linux/images/image5.png


--------------------------------------------------------------------------------
/Linux/images/image6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Linux/images/image6.png


--------------------------------------------------------------------------------
/Linux/images/image7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Linux/images/image7.png


--------------------------------------------------------------------------------
/Linux/images/image8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Linux/images/image8.png


--------------------------------------------------------------------------------
/Linux/images/image9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Linux/images/image9.png


--------------------------------------------------------------------------------
/Linux/images/mobaxterm_key.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Linux/images/mobaxterm_key.png


--------------------------------------------------------------------------------
/Linux/images/mobaxterm_ssh.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Linux/images/mobaxterm_ssh.png


--------------------------------------------------------------------------------
/MMSeqs2/README.md:
--------------------------------------------------------------------------------
  1 | # MMSeqs2
  2 | 
  3 | [MMSeqs2](https://github.com/soedinglab/MMseqs2) is a fast sequence searching algorithm that we use in place of [blast](https://blast.ncbi.nlm.nih.gov/Blast.cgi) or [diamond](https://github.com/bbuchfink/diamond).
  4 | 
  5 | MMSeqs2 has a lot of different options, and we have not yet included them all here, but you can find full details about MMSeqs2 in their [detailed manual](https://mmseqs.com/latest/userguide.pdf).
  6 | 
  7 | # MMSeqs2 databases
  8 | 
  9 | Like many tools, MMSeqs2 has precomputed databases that you can download. 
 10 | 
 11 | There is a [complete list on their website](https://github.com/soedinglab/MMseqs2/wiki#downloading-databases)
 12 | 
 13 | You can download a database with the `databases` command. For example, to download the [UniRef50](https://www.uniprot.org/help/uniref) database:
 14 | 
 15 | ```bash
 16 | mkdir -p UniRef50
 17 | mmseqs databases --threads 8 UniRef50 UniRef50/UniRef50 /tmp
 18 | ```
 19 | 
 20 | Some of the databases have taxonomy included with them, and that enables you to use `mmseqs easy-taxonomy` to explore the metagenome.
 21 | 
 22 | # Easy Taxonomy
 23 | 
 24 | We use the MMSeqs2 easy taxonomy a _lot_ for analysing metagenomes, especially by comparing to the [UniRef50](https://www.uniprot.org/help/uniref) database.
 25 | 
 26 | First, `mmseqs easy-taxonomy` _requires_ `fasta` files and does not work with `fastq` files. We have a [fast way to convert fastq to fasta](https://edwards.flinders.edu.au/fastq-to-fasta/) or you can find some tools online.
 27 | 
 28 | We also take advantage of `mmseqs` [sensitivity sweep](https://github.com/soedinglab/MMseqs2/wiki#set-sensitivity--s-parameter) but you should consider comparing [sensitivity and resources](https://github.com/soedinglab/MMseqs2/wiki#optimizing-sensitivity-and-consumption-of-resources). There is a lot of discussion on the [MMSeqs2 wiki](https://github.com/soedinglab/MMseqs2/wiki) about setting sensitivity.
 29 | 
 30 | 
 31 | We typically use this command to run the easy taxonomy:
 32 | 
 33 | 
 34 | ```bash
 35 | mkdir easy-taxonomy
 36 | mmseqs easy-taxonomy sequence.fasta UniRef50/UniRef50 easy-taxonomy/sequence_taxonomy /tmp --start-sens 1 --sens-steps 3 -s 7 --threads 32
 37 | ```
 38 | 
 39 | The results will be in a series of files in the `easy-taxonomy` directory, whose names start with `sequence_taxonomy`:
 40 | 
 41 | 
 42 | - `sequence_taxonomy_lca.tsv.gz`: The lowest common ancestor of the sequences in tab separated text.
 43 | 
 44 | Example output:
 45 | 
 46 | ```
 47 | R100400180029:20220829140225:V350082744:2:1145432:5:58/2/2      310915  species Pangasianodon hypophthalmus     2       2       1       0.540
 48 | ```
 49 | 
 50 | Columns are:
 51 | 1. the sequencing read
 52 | 2. the taxonomy ID from [NCBI taxonomy](https://www.ncbi.nlm.nih.gov/datasets/taxonomy/tree). For example, this is [310915](https://www.ncbi.nlm.nih.gov/datasets/taxonomy/310915/)
 53 | 3. the taxonomic clade. `Species` in this example
 54 | 4. The organism name. `Pangasianodon hypophthalmus`
 55 | 
 56 | 
 57 | 
 58 | - `sequence_taxonomy_report.gz` a Kraken2 style output report
 59 | 
 60 | Example output:
 61 | 
 62 | ```
 63 | 0.8561  9653    9653    species 310915                                                          Pangasianodon hypophthalmus
 64 | ```
 65 | 
 66 | The columns are
 67 | 1. The fraction of reads that map at this clade
 68 | 2. The number of reads that map at the clade, or lower
 69 | 3. The number of reads that map at exactly this clade
 70 | 4. The taxonomic level
 71 | 5. The taxonomy ID. For example, this is [310915](https://www.ncbi.nlm.nih.gov/datasets/taxonomy/310915/)
 72 | 6. The taxonomy name
 73 | 
 74 | - `sequence_taxonomy_tophit_aln.gz` the `blast m8` format 
 75 | 
 76 | Example output:
 77 | 
 78 | ```
 79 | R100400180029:20220829140225:V350082744:2:1145432:5:58/2/2      UniRef50_UPI00147C5152  0.382   163     30      0       0       50      0       163     1.796E-26       108
 80 | ```
 81 | 
 82 | The columns are:
 83 | 
 84 | 1. Sequence Read
 85 | 2. Match database ID. In this case from the [UniRef50](https://www.uniprot.org/) we have sequence [UniRef50_UPI00147C5152](https://www.uniprot.org/uniref/UniRef50_UPI00147C5152)
 86 | 3. Similarity (38.2% identity)
 87 | 4. Alignment length (163 bases) 
 88 | 5. Mismatches (30)
 89 | 6. Gap openings (0)
 90 | 7. Start on the sequence read (0)
 91 | 8. End on the sequence read (50)
 92 | 9. Start on the database sequence (0)
 93 | 10. End on the database sequence (163)
 94 | 11. E-value (1.796E-26)
 95 | 12. Bit score
 96 | 
 97 | 
 98 | - `sequence_taxonomy_tophit_report.gz` the taxonomy and matches to all of the proteins
 99 | 
100 | Example output
101 | 
102 | ```
103 | UniRef50_UPI00147C5152  6970    0.312   1849.374        0.367   310915  species Pangasianodon hypophthalmus
104 | ```
105 | 
106 | The columns are:
107 | 
108 | 1. Database ID. In this case from the [UniRef50](https://www.uniprot.org/) we have sequence [UniRef50_UPI00147C5152](https://www.uniprot.org/uniref/UniRef50_UPI00147C5152)
109 | 2. Number of sequences aligning to target
110 | 3. Unique coverage of target uniqueAlignedResidues / targetLength
111 | 4. Target coverage alignedResidues / targetLength
112 | 5. Average sequence identity
113 | 6. [NCBI taxonomy](https://www.ncbi.nlm.nih.gov/datasets/taxonomy/tree) identifier
114 | 7. Taxonomic level `species`
115 | 8. Taxonomic name, in this case `Pangasianodon hypophthalmus`
116 | 
117 | 
118 | 
119 | 
120 | 
121 | 
122 | 


--------------------------------------------------------------------------------
/Metagenomics/AnnotatingOrfMSeed.md:
--------------------------------------------------------------------------------
 1 | # Annotating the functions in the bins using OrfM and the SEED
 2 | 
 3 | Another way to annotate the metagenomes is to identify all the open reading frames (i.e. stretches of DNA that start at a translational start site (typically: ATG), and end at a translational stop site (typically: TAA, TGA, TAG)). In complete genomes, there are heuristics used to filter those open reading frames to ensure that we have the open reading frames that are most likely to be the real protein encoding genes. For example, if you have two overlapping open reading frames it is much more likely that the longer sequence is a real protein encoding gene and the shorter sequence is less likely to be a real protein encoding gene.
 4 | 
 5 | For metagenomics, however, we can identify all the open reading frames, and then compare them to the databases and suggest that the open reading frames that have real matches are real, while the others are potentially not real open reading frames, and we can ignore them. 
 6 | 
 7 | A quick way to identify all the open reading frames is to use orfM (64), a very fast open reading frame extractor. This just takes a fasta file as its argument, and generates a fasta file with amino acids in it:
 8 | 
 9 | ```bash
10 | orfm contigs.fna  > orfs.faa
11 | ```
12 | 
13 | Notice that here our DNA file has the standard extension `.fna` for “fasta nucleic acids” (i.e. DNA) and our output file has the standard extension `.faa` for “fasta amino acids” (i.e. proteins). These may or may not be recognized by your operating system though! 
14 | 
15 | Note also, that we are redirecting the output to the `orfs.faa` file as we discussed with [Linux](../Linux)
16 | 
17 | There are lots of ways that you can compare these sequences to databases. One way is to copy and paste a few of the sequences into the [BLAST web interface](https://blast.ncbi.nlm.nih.gov/Blast.cgi). However, you can only do a few of the amino acid sequences at a time.
18 | 
19 | Another way to annotate the sequences is using the SEED server framework (65). We have included that on the virtual box image. You can rapidly assign functions to proteins using the command:
20 | 
21 | ```bash
22 | svr_assign_using_figfams < orfs.faa > orfs.fn
23 | ```
24 | 
25 | Here, we are redirecting the input from `orfs.faa` and redirecting the output to `orfs.fn`.
26 | 
27 | This will give you a function for those sequences that we can find a “reliable” match to. By default, a “reliable” match means that there are three 7-mer amino acid matches between your protein sequence and our database, however you can change that reliability to get more hits. For more information, run `svr_assign_using_figfams` with a `-h` flag to see more options.
28 | 
29 | It is worth taking a few of the sequences that have a match from the SEED servers, running BLAST on them using the NCBI website above, and exploring those hits. 
30 | 
31 | * Does the SEED annotation make sense? 
32 | * Does the NCBI annotation make sense? 
33 | * Which organisms are matched? 


--------------------------------------------------------------------------------
/Metagenomics/AnnotatingOrfMSeed.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Metagenomics/AnnotatingOrfMSeed.pdf


--------------------------------------------------------------------------------
/Metagenomics/ExampleDataSets.md:
--------------------------------------------------------------------------------
 1 | # Sample Metagenomes
 2 | 
 3 | <img src="images/abrolhos.png" alt="Abrolhos Islands" align="right" />
 4 | 
 5 | We have several sample metagenomes that we provide for use in the course. However, please feel free to use your own metagenomes and data sets, and replace their filenames with the names we use throughout the course.
 6 | 
 7 | In the [Datasets](../Datasets) directory, we have collated several different projects:
 8 | 
 9 | ## [Coral and Algae](../Datasets/coral_algae)
10 | 
11 | These sample data sets are from the Abrolhos region of Brazil, and are some low diversity metagenomes from an experiment where we tested the effects of Coral, Algae, CCA, or no treatment on the growth of microbes over time.
12 | 
13 | There are four groups of data:
14 | 
15 | * *Algae treatment* &mdash; 4 replicates (Algae_11, Algae_12, Algae_13, Algae_14)
16 | * *CCA treatment* &mdash; 3 replicates (CCA_11, CCA_12, CCA_13)
17 | * *Control treatment* &mdash; 4 replicates (Control_11, Control_12, Control_13, Control_14)
18 | * *Coral treatment* &mdash; 4 replicates (Coral_11, Coral_12, Coral_13, Coral_14)
19 | 
20 | These samples were sequenced on an Ion Torrent, and so you will see quality differences and we’ll need to use the `--iontorrent` flag when assembling them using `spades.py`.
21 | 
22 | In many of the examples in this manual, I use the Algae samples to demonstrate the commands. Be sure to switch the `Algae_12.fna` name to the file name that you are working on.
23 | 
24 | [Read more about the coral and algae data sets](../Datasets/coral_algae/)
25 | 
26 | ## [Drinking water](../Datasets/drinking_water)
27 | 
28 | The drinking water study is from the [University of Adelaide, Australia](https://www.adelaide.edu.au/), and they used 16S amplicon sequencing to explore microbes in the drinking water. The project is available at the NCBI as [SRP ID SRP059994](https://www.ncbi.nlm.nih.gov/Traces/study/?acc=SRP059994) 
29 | 
30 | The work is described in Shaw JLA, Monis P, Weyrich LS, Sawade E, Drikas M, Cooper AJ. 2015. Using Amplicon Sequencing To Characterize and Monitor Bacterial Diversity in Drinking Water Distribution Systems. [Appl Environ Microbiol 81:6463–6473](http://aem.asm.org/content/81/18/6463.long)
31 | 
32 | [Read more about the drinking water datasets](../Datasets/drinking_water/)
33 | 
34 | ## [Ground Water](../Datasets/ground_water)
35 | 
36 | This random community metagenomics data set comes from SRA project [SRP075429](https://www.ncbi.nlm.nih.gov/Traces/study/?acc=SRP075429) where they looked at groundwater from Japan. 
37 | 
38 | 
39 | The work was published in Hernsdorf AW, Amano Y, Miyakawa K, Ise K, Suzuki Y, Anantharaman K, Probst A, Burstein D, Thomas BC, Banfield JF. 2017. Potential for microbial H2 and metal transformations associated with novel bacteria and archaea in deep terrestrial subsurface sediments. [ISME J 11:1915–1929](https://www.nature.com/articles/ismej201739)
40 | 
41 | [Read more about the ground water datasets](../Datasets/ground_water/)
42 | 
43 | ## [Gut](../Datasets/gut)
44 | 
45 | The gut random community data set is also from the Banfield lab, from SRA project [SRP074153](https://www.ncbi.nlm.nih.gov/Traces/study/?acc=SRP074153) where they looked at the infants in NICU and hospitals. 
46 | 
47 | The work was published in Brooks B, Olm MR, Firek BA, Baker R, Thomas BC, Morowitz MJ, Banfield JF. 2017. Strain-resolved analysis of hospital rooms and infants reveals overlap between the human and room microbiome. [Nat Commun 8:1814](https://www.nature.com/articles/s41467-017-02018-w)
48 | 
49 | [Read more about the gut datasets](../Datasets/gut/)
50 | 
51 | 


--------------------------------------------------------------------------------
/Metagenomics/ExampleDataSets.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Metagenomics/ExampleDataSets.pdf


--------------------------------------------------------------------------------
/Metagenomics/Metagenomics.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Metagenomics/Metagenomics.pdf


--------------------------------------------------------------------------------
/Metagenomics/images/1ng vs 100ng.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Metagenomics/images/1ng vs 100ng.png


--------------------------------------------------------------------------------
/Metagenomics/images/BAC Cloning.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Metagenomics/images/BAC Cloning.png


--------------------------------------------------------------------------------
/Metagenomics/images/CDA.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Metagenomics/images/CDA.png


--------------------------------------------------------------------------------
/Metagenomics/images/CDA_Small.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Metagenomics/images/CDA_Small.png


--------------------------------------------------------------------------------
/Metagenomics/images/Earth Microbiome Project.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Metagenomics/images/Earth Microbiome Project.png


--------------------------------------------------------------------------------
/Metagenomics/images/Filters.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Metagenomics/images/Filters.png


--------------------------------------------------------------------------------
/Metagenomics/images/Findley_PCA.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Metagenomics/images/Findley_PCA.png


--------------------------------------------------------------------------------
/Metagenomics/images/Handelsman1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Metagenomics/images/Handelsman1.png


--------------------------------------------------------------------------------
/Metagenomics/images/Handelsman2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Metagenomics/images/Handelsman2.png


--------------------------------------------------------------------------------
/Metagenomics/images/RDP.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Metagenomics/images/RDP.png


--------------------------------------------------------------------------------
/Metagenomics/images/SSU_RNApol.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Metagenomics/images/SSU_RNApol.png


--------------------------------------------------------------------------------
/Metagenomics/images/Silva.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Metagenomics/images/Silva.png


--------------------------------------------------------------------------------
/Metagenomics/images/Staley_Konopka.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Metagenomics/images/Staley_Konopka.png


--------------------------------------------------------------------------------
/Metagenomics/images/Subsystems.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Metagenomics/images/Subsystems.png


--------------------------------------------------------------------------------
/Metagenomics/images/VAMPS.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Metagenomics/images/VAMPS.png


--------------------------------------------------------------------------------
/Metagenomics/images/Woese_Tree.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Metagenomics/images/Woese_Tree.png


--------------------------------------------------------------------------------
/Metagenomics/images/abrolhos.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Metagenomics/images/abrolhos.png


--------------------------------------------------------------------------------
/Metagenomics/images/acridine orange.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Metagenomics/images/acridine orange.png


--------------------------------------------------------------------------------
/Metagenomics/images/plate counts.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Metagenomics/images/plate counts.png


--------------------------------------------------------------------------------
/Metagenomics/images/plate_count_anomaly.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Metagenomics/images/plate_count_anomaly.gif


--------------------------------------------------------------------------------
/Metagenomics/images/qiita.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Metagenomics/images/qiita.png


--------------------------------------------------------------------------------
/Metagenomics/images/who_what_where.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Metagenomics/images/who_what_where.png


--------------------------------------------------------------------------------
/ORFCalling/ORFCalling.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/ORFCalling/ORFCalling.pdf


--------------------------------------------------------------------------------
/ORFCalling/images/blastx.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/ORFCalling/images/blastx.png


--------------------------------------------------------------------------------
/ORFCalling/images/transcription_translation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/ORFCalling/images/transcription_translation.png


--------------------------------------------------------------------------------
/PATRIC/GenomeSets.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/PATRIC/GenomeSets.pdf


--------------------------------------------------------------------------------
/PATRIC/PATRIC.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/PATRIC/PATRIC.pdf


--------------------------------------------------------------------------------
/PATRIC/README.md:
--------------------------------------------------------------------------------
 1 | # Introduction to PATRIC
 2 | 
 3 | [PATRIC](https://www.patricbrc.org/) is a framework constructed to support comparative analysis of microbial genomes.  It includes four components: *data*, *tools*, *services*, and *interfaces*.
 4 | 
 5 | * Data:
 6 |   * The Data component includes two subcomponents:
 7 | 	* An integration of publicly available data (including over 200,000 microbial genomes and a growing body of drug resistance data)
 8 | 	* Workspaces  where data and results can be saved and re-used
 9 | 
10 | * Tools:
11 |   * PATRIC includes a collection of tools that support 
12 | 	* Extraction of data that can then be used in a focused analysis
13 | 	* Support for processing extracted data in the form of tab-separated tables
14 | 
15 | * Services:
16 |   * A collection of Apps that implement a rich set of bioinformatic algorithms to support comparative analysis
17 | 
18 | * Interfaces:
19 |   * PATRIC supports two primary user-interfaces: 
20 | 	* The GUI that you can access at the [PATRIC](https://www.patricbrc.org/) website.
21 | 	* A command line interface that supports invocation of tools for users wishing to craft more customized requests
22 | 
23 | ## Effective Use of the PATRIC Resource
24 | 
25 | Effective use of [PATRIC](https://www.patricbrc.org/) requires that you understand the basic overview along with a reasonably small set of operations that allow you to extract and process data. [PATRIC](https://www.patricbrc.org/) is a rich collection of data and tools.  We are going to present you with a sequence of tasks that you can master fairly easily and that will leave you with a highly functional (but somewhat limited) set of skills.  Then, as you occasionally find you need to expand your abilities, you can use one of the [PATRIC](https://www.patricbrc.org/) tutorials and/or request help from the PATRIC support team.
26 | 
27 | ## Gaining the Basic Set of Skills
28 | 
29 | Before you start, you will need to install the [PATRIC command line](https://docs.patricbrc.org//cli_tutorial/index.html#installing-the-cli-release) tools. There are also a series of [tutorials](https://docs.patricbrc.org//cli_tutorial/cli_getting_started.html) about using the command line tools that complement the tutorials here.
30 | 
31 | We have put together a series of short, directed tutorials that will help you get acquainted with [PATRIC](https://www.patricbrc.org/) using the command line.
32 | 
33 | 1. [Constructing Genome Sets](GenomeSets)
34 | 
35 | 


--------------------------------------------------------------------------------
/Python/.gitattributes:
--------------------------------------------------------------------------------
1 | *.gz filter=lfs diff=lfs merge=lfs -text
2 | 


--------------------------------------------------------------------------------
/Python/Bc01.fasta.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:2f1f3a67e85edfbfe887c3aaadde3f9037f6d9c06995dd5dbf8d1c1569e96072
3 | size 30372
4 | 


--------------------------------------------------------------------------------
/Python/BcSamples.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:97c7bdba7756b89a10975399ce63cb84129b22ae9ac0b0316847b5c146664aa6
3 | size 12336
4 | 


--------------------------------------------------------------------------------
/Python/Zen_of_python.md:
--------------------------------------------------------------------------------
 1 | # The Zen of Python
 2 | 
 3 | The Zen of Python, by Tim Peters.
 4 | 
 5 | ```text
 6 | Beautiful is better than ugly.
 7 | Explicit is better than implicit.
 8 | Simple is better than complex.
 9 | Complex is better than complicated.
10 | Flat is better than nested.
11 | Sparse is better than dense.
12 | Readability counts.
13 | Special cases aren't special enough to break the rules.
14 | Although practicality beats purity.
15 | Errors should never pass silently.
16 | Unless explicitly silenced.
17 | In the face of ambiguity, refuse the temptation to guess.
18 | There should be one-- and preferably only one --obvious way to do it.
19 | Although that way may not be obvious at first unless you're Dutch.
20 | Now is better than never.
21 | Although never is often better than *right* now.
22 | If the implementation is hard to explain, it's a bad idea.
23 | If the implementation is easy to explain, it may be a good idea.
24 | Namespaces are one honking great idea -- let's do more of those!
25 | ```
26 | 
27 | You can find the [zen of python](https://en.wikipedia.org/wiki/Zen_of_Python) by typing
28 | 
29 | ```python
30 | import this
31 | ```
32 | 


--------------------------------------------------------------------------------
/Python/countfasta.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | """
  4 | Count the characters in a fasta file. We summarize the longest and shortest reads
  5 | and the N50 of the data set.
  6 | """
  7 | 
  8 | import os
  9 | import sys
 10 | import gzip
 11 | import argparse
 12 | 
 13 | __author__ = 'Rob Edwards'
 14 | 
 15 | 
 16 | def read_fasta(fname: str, whole_id: bool = False, qual: bool = False) -> dict:
 17 |     """
 18 |     Read a fasta file and return a hash.
 19 | 
 20 |     If wholeId is set to false only the first part of the ID
 21 |     (upto the first white space) is returned
 22 | 
 23 |     :param fname: The file name to read
 24 |     :param whole_id: Whether to keep the whole id, or trim to first whitespace (default = all)
 25 |     :param qual: these are quality scores (so add a space between lines!)
 26 |     :return: dict
 27 |     """
 28 | 
 29 |     try:
 30 |         if fname.endswith('.gz'):
 31 |             f = gzip.open(fname, 'rt')
 32 |         elif fname.endswith('.lrz'):
 33 |             f = subprocess.Popen(['/usr/bin/lrunzip', '-q', '-d', '-f', '-o-', fname], stdout=subprocess.PIPE).stdout
 34 |         else:
 35 |             f = open(fname, 'r')
 36 |     except IOError as e:
 37 |         sys.stderr.write(str(e) + "\n")
 38 |         sys.stderr.write("Message: \n" + str(e.message) + "\n")
 39 |         sys.exit("Unable to open file " + fname)
 40 | 
 41 |     seqs = {}
 42 |     seq = ""
 43 |     seqid = ""
 44 |     for line in f:
 45 |         line = line.rstrip('\r\n')
 46 |         if line.startswith(">"):
 47 |             if seqid != "":
 48 |                 seqs[seqid] = seq
 49 |                 seq = ""
 50 |             seqid = line.replace(">", "", 1)
 51 |             if not whole_id and seqid.count(" ") > 0:
 52 |                 seqids = seqid.split(" ")
 53 |                 seqid = seqids[0]
 54 |         else:
 55 |             if qual:
 56 |                 seq += " " + line
 57 |             else:
 58 |                 seq += line
 59 | 
 60 |     seqs[seqid] = seq.strip()
 61 |     return seqs
 62 | 
 63 | 
 64 | 
 65 | if __name__ == "__main__":
 66 |     parser = argparse.ArgumentParser(description=' ')
 67 |     parser.add_argument('-f', nargs='+', help='fasta file')
 68 |     parser.add_argument('-d', nargs='+', help='directory of fasta files')
 69 |     parser.add_argument('-l', help='list the lengths for each sequence (default = not to)', action='store_true')
 70 |     parser.add_argument('-m', help='minimum length fo be inclued', type=int, default=0)
 71 |     parser.add_argument('-n', help='do NOT print the summary at the end', action='store_true')
 72 |     parser.add_argument('-t', help='tab separated output. Fields: [SeqID, # seqs, total bp, shortest, longest, N50, N75, auN]', action='store_true')
 73 |     parser.add_argument('-v', help='more output', action='store_true')
 74 |     args = parser.parse_args()
 75 | 
 76 |     if not args.f and not args.d:
 77 |         sys.stderr.write(f"FATAL: Please specify either -f or -d or use -h for more help\n")
 78 |         sys.exit(1)
 79 | 
 80 |     if args.f:
 81 |         files = args.f
 82 |     else:
 83 |         files = []
 84 | 
 85 |     if args.d:
 86 |         for subdir in args.d:
 87 |             for f in os.listdir(subdir):
 88 |                 files.append(os.path.join(subdir, f))
 89 |                 if args.v:
 90 |                     sys.stderr.write(f"Added {files[-1]}\n")
 91 | 
 92 |     overall = {'number': 0, 'total': 0, 'shortest':1e6, 'longest': 0}
 93 | 
 94 | 
 95 |     for faf in files:
 96 |         if args.v:
 97 |             sys.stderr.write(f"Counting sequences in {faf}\n")
 98 |         fa=read_fasta(faf)
 99 | 
100 |         if len(fa.keys()) == 1 and list(fa.keys())[0] == '':
101 |             sys.stderr.write(f"No sequences found in {faf}\n")
102 |             sys.exit(0)
103 | 
104 |         if args.l:
105 |             for i in fa:
106 |                 print("{}\t{}".format(i, len(fa[i])))
107 |             print()
108 | 
109 |         lensall=[len(fa[i]) for i in fa]
110 |         lens = list(filter(lambda x: x > args.m, lensall))
111 |         lens.sort()
112 |         length=sum(lens)
113 | 
114 |         len_so_far = 0
115 |         n50 = None
116 |         n75 = None
117 |         auN = 0
118 |         for i in lens:
119 |             len_so_far += i
120 |             if not n50 and len_so_far >= length * 0.5:
121 |                 n50 = i
122 |             if not n75 and len_so_far >= length * 0.75:
123 |                 n75 = i
124 |             auN += i**2
125 | 
126 |         auN /= length
127 | 
128 |         if args.n:
129 |             continue
130 | 
131 |         if args.t:
132 |             print("\t".join(map(str, [faf, len(lens), length, lens[0], lens[-1], n50, n75, auN])))
133 |         else:
134 |             print(f"""
135 | File name: {faf}
136 | Number of sequences: {len(lens):,}
137 | Total length: {length:,}
138 | Shortest: {lens[0]:,}
139 | Longest: {lens[-1]:,}
140 | N50: {n50:,}
141 | N75: {n75:,}
142 | auN: {int(auN):,}  """
143 |             )
144 | 
145 | 


--------------------------------------------------------------------------------
/Python/countfastq.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | """
  4 | Count the characters in a fastq file. We summarize the longest and shortest reads
  5 | and the N50 of the data set.
  6 | """
  7 | 
  8 | import os
  9 | import sys
 10 | import gzip
 11 | import argparse
 12 | 
 13 | __author__ = 'Rob Edwards'
 14 | 
 15 | 
 16 | def stream_fastq(fqfile):
 17 |     """Read a fastq file and provide an iterable of the sequence ID, the
 18 |     full header, the sequence, and the quaity scores.
 19 | 
 20 |     Note that the sequence ID is the header up until the first space,
 21 |     while the header is the whole header.
 22 |     """
 23 | 
 24 |     if fqfile.endswith('.gz'):
 25 |         qin = gzip.open(fqfile, 'rt')
 26 |     else:
 27 |         qin = open(fqfile, 'r')
 28 | 
 29 |     linecounter = 0
 30 |     while True:
 31 |         header = qin.readline()
 32 |         linecounter += 1
 33 |         if not header:
 34 |             break
 35 |         if not header.startswith("@"):
 36 |             print(f"The file {fqfile} does not appear to be a four-line fastq file at line {linecounter}", file=sys.stderr)
 37 |             sys.exit(-1)
 38 |         header = header.strip()
 39 |         seqidparts = header.split(' ')
 40 |         seqid = seqidparts[0]
 41 |         seqid = seqid.replace('@', '')
 42 |         seq = qin.readline().strip()
 43 |         linecounter += 1
 44 |         qualheader = qin.readline()
 45 |         if not qualheader.startswith("+"):
 46 |             print(f"The file does not appear to be a four-line fastq file at line {linecounter}", file=sys.stderr)
 47 |             sys.exit(-1)
 48 |         linecounter += 1
 49 |         qualscores = qin.readline().strip()
 50 |         linecounter += 1
 51 |         header = header.replace('@', '', 1)
 52 |         if len(qualscores) != len(seq):
 53 |             print(f"The sequence and qual scores are not the same length at line {linecounter}", file=sys.stderr)
 54 |             sys.exit(-1)
 55 |         yield seqid, header, seq, qualscores
 56 | 
 57 | 
 58 | 
 59 | if __name__ == "__main__":
 60 |     parser = argparse.ArgumentParser(description=' ')
 61 |     parser.add_argument('-f', nargs='+', help='fastq file')
 62 |     parser.add_argument('-d', nargs='+', help='directory of fastq files')
 63 |     parser.add_argument('-t', help='tab separated summmary of name, total len, shortest, longest, n50, n75', action="store_true")
 64 |     parser.add_argument('-s', help="summarize the counts. Useful if you run on a directory", action="store_true")
 65 |     args = parser.parse_args()
 66 | 
 67 |     if not args.f and not args.d:
 68 |         sys.stderr.write(f"FATAL: Please specify either -f or -d or use -h for more help\n")
 69 |         sys.exit(1)
 70 | 
 71 |     if args.f:
 72 |         files = args.f
 73 |     else:
 74 |         files = []
 75 | 
 76 |     if args.d:
 77 |         for subdir in args.d:
 78 |             for f in os.listdir(subdir):
 79 |                 if 'fastq' in f or 'fq' in f:
 80 |                     files.append(os.path.join(subdir, f))
 81 |                 else:
 82 |                     sys.stderr.write(f"Skipped {os.path.join(subdir, f)}. Does not appear to be fastq\n")
 83 | 
 84 |     overall = {'number': 0, 'total': 0, 'shortest':1e6, 'longest': 0}
 85 |     for faf in files:
 86 |         if not os.path.exists(faf):
 87 |             sys.stderr.write(f"FATAL: {faf} not found\n")
 88 |             sys.exit(1)
 89 | 
 90 |         lens = []
 91 |         for (sid, label, seq, qual) in stream_fastq(faf):
 92 |             lens.append(len(seq))
 93 |         lens.sort()
 94 |         length=sum(lens)
 95 | 
 96 |         len_so_far = 0
 97 |         n50 = None
 98 |         n75 = None
 99 |         auN = 0
100 |         for i in lens:
101 |             len_so_far += i
102 |             if not n50 and len_so_far >= length * 0.5:
103 |                 n50 = i
104 |             if not n75 and len_so_far >= length * 0.75:
105 |                 n75 = i
106 |             auN += i**2
107 | 
108 |         auN /= length
109 | 
110 |         if args.t:
111 |             print(f"{faf}\t{len(lens):,}\t{length:,}\t{lens[0]:,}\t" \
112 |                   + f"{lens[-1]:,}\t{n50:,}\t{n75:,}\t{int(auN):,}")
113 |         else:
114 |             print(f"""
115 | File name: {faf}
116 | Number of sequences: {len(lens):,}
117 | Total length: {length:,}
118 | Shortest: {lens[0]:,}
119 | Longest: {lens[-1]:,}
120 | N50: {n50:,}
121 | N75: {n75:,}
122 | auN: {int(auN):,}  """
123 |             )
124 |         overall['number'] += len(lens)
125 |         overall['total']  += length
126 |         if lens[0] < overall['shortest']:
127 |             overall['shortest'] = lens[0]
128 |         if lens[-1] > overall['longest']:
129 |             overall['longest'] = lens[-1]
130 | 
131 | if args.s:
132 |     print(f"""
133 | 
134 | OVERALL SUMMARY
135 | Number of sequences: {overall['number']:,}
136 | Total length: {overall['total']:,}
137 | Shortest: {overall['shortest']:,}
138 | Longest: {overall['longest']:,}
139 | """
140 |       )
141 | 
142 | 
143 | 
144 | 


--------------------------------------------------------------------------------
/Python/metadata.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:6d7ef9f6f925d7be0fabf5cfc536d57c99bbd99324f4936e023fe35ba12fe3af
3 | size 1519
4 | 


--------------------------------------------------------------------------------
/Python/phylum.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:5c78c5c78fe6c20efb0a346d3abd280efd9f227f91a80df40d278de59d5c78fb
3 | size 21627
4 | 


--------------------------------------------------------------------------------
/Python/requirements.txt:
--------------------------------------------------------------------------------
1 | biopython
2 | 
3 | 


--------------------------------------------------------------------------------
/Python/tikkala.gbk.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:be06c5ec2db9e69a365244f4f662f3d5c5872379d74b96695190e3976fc020dd
3 | size 68229
4 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | [![Edwards Lab](https://img.shields.io/badge/Bioinformatics-EdwardsLab-03A9F4)](https://edwards.flinders.edu.au)
 2 | [![DOI](https://www.zenodo.org/badge/146160006.svg)](https://www.zenodo.org/badge/latestdoi/146160006)
 3 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
 4 | ![GitHub language count](https://img.shields.io/github/languages/count/linsalrob/ComputationalGenomicsManual)
 5 | 
 6 | 
 7 | # ComputationalGenomicsManual
 8 | 
 9 | Rob's manual for the computational genomics and bioinformatics class
10 | 
11 | # About this manual
12 | 
13 | Rob, Liz Dinsdale, Tom Jeffries, Bruno Gomez-Gil, Jim Mitchell, and several other colleagues and friends have been teaching genomics and metagenomics for a long time. They have written this manual over the course of several years, and in a variety of formats. Rob moved it to markdown using GitHub in Fall 2018 as part of his computational genomics class.
14 | 
15 | You can view this [manual online](https://linsalrob.github.io/ComputationalGenomicsManual/)
16 | 
17 | # Companion Videos
18 | 
19 | Companion videos that accompany this class are available on YouTube in [Rob's YouTube Playlist](https://www.youtube.com/playlist?list=PLpPXw4zFa0uLMHwSZ7DMeLGjIUgo1IBbn).
20 | 
21 | # Chapter Index
22 | 
23 | Chapter | Contents
24 | --- | --- | 
25 | 1\. | [Linux](Linux/)
26 | 2\. | [Conda](Conda/)
27 | 3\. | [Python](Python/)
28 | 4\. | [Snakemake](Snakemake)
29 | 5\. | [Sequencing Overview](Sequencing/)
30 | 6\. | [Sequence File Formats](SequenceFileFormats/)
31 | 7\. | [Sequence Quality Control](SequenceQC/)
32 | 8\. | [Databases](Databases/)
33 | 8a. | - [NCBI Edirect](Databases/NCBI_Edirect.md)
34 | 8b. | - [NCBI SRA](Databases/SRA.md)
35 | 9\. | [Genome Sequencing Overview](GenomeSequencingOverview)
36 | 10\. | [Sequence Assembly](SequenceAssembly)
37 | 11\. | [ORF Calling](ORFCalling/)
38 | 12\. | [tRNA and rRNA identification](tRNA_rRNA/)
39 | 13\. | [Annotation Pipelines](AnnotationPipelines/)
40 | 14\. | [Metagenomics](Metagenomics/)
41 | 15\. | - [Example Data Sets](Metagenomics/ExampleDataSets.md)
42 | 16\. | [Cross Assembly](CrossAssembly/)
43 | 16a. | - [Metabat](CrossAssembly/Metabat.md)
44 | 16b. | - [CCOM](CrossAssembly/CCOM.md)
45 | 17\. | [16S sequencing](16S/)
46 | 18\. | [Host removal](Deconseq/)
47 | 19\. | [FOCUS](FOCUS/)
48 | 20\. | [Kraken](Kraken2)
49 | 21\. | [SUPER-FOCUS](SUPER-FOCUS/)
50 | 22\. | [GenomePeek](GenomePeek/)
51 | 23\. | [RTMg](RTMg/)
52 | 24\. | [OrfM and the SEED](Metagenomics/AnnotatingOrfMSeed.md)
53 | 25\. | [ANVI'O](ANVIO/)
54 | 26\. | [CheckM](CheckM/)
55 | 
56 | 
57 | # Workshops.
58 | 
59 | We are using this content in a [variety of workshops](Workshops/)
60 | 
61 | 
62 | # Assignments. 
63 | 
64 | Solutions are still not shown, but you can work through some of these
65 | 
66 | * [NCBI EDirect](Assignments/NCBIEDirectAssignment) is to familiarize yourself with NCBI EDirect.
67 | * [Genomics Assignment](Assignments/GenomicsAssignment/) is to analyze complete genomes from *Klebsiella*.
68 | * [Metagenomics Assignment](Assignments/MetagenomicsAssignment) is to analyze some metagenomics data and describe the organisms that you find there.
69 | 
70 | 
71 | # Datasets
72 | 
73 | We have several different [datasets](Datasets/) available for you to use to try the course work out. There are both 16S and random metagenomes, and links to genomics data.
74 | 
75 | 
76 | # PDFs
77 | 
78 | Note: The PDFs are automatically created from the markdown, and lose some of the images and links. You should probably use the HTML version most of the time.
79 | 
80 | # About Copyright Information
81 | 
82 | Some of the images used in this manual are currently copyrighted by other people. As noted above, Rob and friends wrote this manual over many years and added the images and cartoons to lighten the manual. We are in the process of identifying the copyright holders and/or identifying images that are not copyrighted. If your rights have been infringed upon, if you would like to provide an indemnification, or if you would like to provide a non-copyrighted image, please contact Rob.
83 | 
84 | # Copyright
85 | 
86 | This manual is Copyright Robert A. Edwards. 2018.
87 | 
88 | # Citation
89 | 
90 | If you wish to cite this manual, please cite: *Edwards, R. 2018. Computational Genomics. https://linsalrob.github.io/ComputationalGenomicsManual/. Accessed [today's date]* DOI: 10.5281/zenodo.7883375
91 | 
92 | # References
93 | 
94 | We have an [extensive list of references](References/) available, but if you find something missing that we should have cited (a) we're sorry, we tried to remember all of them and (b) please email Rob or provide a pull request and we'll add it.
95 | 


--------------------------------------------------------------------------------
/RTMg/RTMg.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/RTMg/RTMg.pdf


--------------------------------------------------------------------------------
/RTMg/images/sens.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/RTMg/images/sens.png


--------------------------------------------------------------------------------
/References/References.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/References/References.pdf


--------------------------------------------------------------------------------
/SUPER-FOCUS/README.md:
--------------------------------------------------------------------------------
 1 | # Annotating the function in the bins with SUPER-FOCUS
 2 | 
 3 | 
 4 | ![](images/sf-logo.png "Superfocus logo")
 5 | Subsystems Profile by database reduction using FOCUS (SUPER-FOCUS) combines the taxonomic profiling using FOCUS, which is rapid, and the functional profiling using *k*-mers or faster searches than blast using alternate algorithms including [diamond](https://github.com/bbuchfink/diamond) or [RAPsearch](http://omics.informatics.indiana.edu/mg/RAPSearch2/). 
 6 | 
 7 | 
 8 | SUPER-FOCUS input is the same as FOCUS &mdash; you just need a folder with all the sequences from your metagenome. You can have one or more sequences in the folder, and they can be reads, or [metagenome assembled genomes](../CrossAssembly). 
 9 | 
10 | First download the database by running 
11 | 
12 | ```bash
13 | superfocus__downloadDB -a diamond
14 | ```
15 | to download the database for the diamond search algorithm (which is the fastest, but also requires the most memory).
16 | 
17 | Then to run superfocus, you use the command (e.g. for Algae_MetaBatBins) 
18 | 
19 | ```bash
20 | superfocus.py -q Algae_MetaBatBins -m 1 -db DB_90 -dir Algae_MetaBatBins_Superfocus_results
21 | ``` 
22 | 
23 | The SUPER-FOCUS results for all the bins are saved in a folder called `SUPER-FOCUS` and are self explanatory!
24 | 
25 | 


--------------------------------------------------------------------------------
/SUPER-FOCUS/SUPER-FOCUS.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/SUPER-FOCUS/SUPER-FOCUS.pdf


--------------------------------------------------------------------------------
/SUPER-FOCUS/images/sf-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/SUPER-FOCUS/images/sf-logo.png


--------------------------------------------------------------------------------
/SequenceAssembly/N50.md:
--------------------------------------------------------------------------------
 1 | # N<sub>50</sub> and N<sub>75</sub>
 2 | 
 3 | Two of the essential metrics for assembly are N<sub>50</sub> and N<sub>75</sub>. Both of these are a measure of how long the contigs are. The idea is that you order the contigs from shortest to longest, and find the length of the contig that contains half (for N<sub>50</sub>) or three quarters (for N<sub>75</sub>) of the data. If you have a more complete assembly the numbers should be larger, while with shorter assemblies, these numbers will be less.
 4 | 
 5 | There is a script called `countfasta.py` that we have provided that takes a single argument and counts the number of fasta characters in the file. (There is a similar metric, called L<sub>50</sub> that reports the number of contigs shorter than the contig that contains the 50% point in sequence length, but no one uses this!).
 6 | 
 7 | Example usage of `countfasta.py`:
 8 | 
 9 | ```bash
10 | countfasta.py -f AlgaeAssembly/contigs.fasta 
11 | 
12 | Total length: 5426326
13 | Shortest: 57 (NODE_5409_length_57_cov_18.5)
14 | Longest: 47734 (NODE_1_length_47734_cov_9.85858)
15 | 
16 | N50: 1396 (NODE_1006_length_1396_cov_1.72782)
17 | N75: 2183 (NODE_387_length_2183_cov_1.83224)
18 | ```
19 | 
20 | 


--------------------------------------------------------------------------------
/SequenceAssembly/N50.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/SequenceAssembly/N50.pdf


--------------------------------------------------------------------------------
/SequenceAssembly/SequenceAssembly.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/SequenceAssembly/SequenceAssembly.pdf


--------------------------------------------------------------------------------
/SequenceAssembly/images/reads_contigs_scaffolds.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/SequenceAssembly/images/reads_contigs_scaffolds.png


--------------------------------------------------------------------------------
/SequenceFileFormats/INSDC_Features.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/SequenceFileFormats/INSDC_Features.pdf


--------------------------------------------------------------------------------
/SequenceFileFormats/SequenceFileFormats.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/SequenceFileFormats/SequenceFileFormats.pdf


--------------------------------------------------------------------------------
/SequenceFileFormats/images/ascii.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/SequenceFileFormats/images/ascii.png


--------------------------------------------------------------------------------
/SequenceFileFormats/images/crAssphageDNA.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/SequenceFileFormats/images/crAssphageDNA.png


--------------------------------------------------------------------------------
/SequenceFileFormats/images/crAssphageQuality.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/SequenceFileFormats/images/crAssphageQuality.png


--------------------------------------------------------------------------------
/SequenceFileFormats/images/fastq.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/SequenceFileFormats/images/fastq.png


--------------------------------------------------------------------------------
/SequenceFileFormats/images/prinseq.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/SequenceFileFormats/images/prinseq.png


--------------------------------------------------------------------------------
/SequenceQC/IlluminaAdapters.fa:
--------------------------------------------------------------------------------
 1 | >Nextera_Mate_left [Also Nextera Read 1 and2 Adapter, and AmpliSeq for Illumina Panels]
 2 | CTGTCTCTTATACACATCT
 3 | >Nextera_Mate_right
 4 | AGATGTGTATAAGAGACAG
 5 | >Nextera_Transposase_R1 [Also Nextera Amplisq I5r]
 6 | TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG
 7 | >Nextera_Transposase_R2 [Also Ampliseq I7r]
 8 | GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG
 9 | >IDT_for_Illumina_Nextera_I7l [Also Nextera PCR I7l, Ampliseq I7l]
10 | CAAGCAGAAGACGGCATACGAGAT
11 | >IDT_for_Illumina_Nextera_I7r [Also Nextera PCR I7r]
12 | GTCTCGTGGGCTCGG
13 | >IDT_for_Illumina_Nextera_I5l [Also Nextera PCR I5l, TruSeq UDI I5l, TruSeq CDI I5l, TruSeq Universal Adapter]
14 | AATGATACGGCGACCACCGAGATCTACAC
15 | >IDT_for_Illumina_Nextera_I5r [Also Nextera PCR I5r]
16 | TCGTCGGCAGCGTC
17 | >TruSeq_R1 [Also TruSeq CDI R1]
18 | AGATCGGAAGAGCACACGTCTGAACTCCAGTCA
19 | >TruSeq_R2 [Also TruSeq CDI R2]
20 | AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT
21 | >TruSeq_I7l [Also TruSeq UDI I7l, TruSeq CDI I7l]
22 | GATCGGAAGAGCACACGTCTGAACTCCAGTCAC
23 | >TruSeq_I7r [Also TruSeq UDI I7r. TruSeq CDI I7r]
24 | ATCTCGTATGCCGTCTTCTGCTTG
25 | >TruSeq_I5r [Also TruSeq UDI I5r, TruSeq CDI I5r]
26 | ACACTCTTTCCCTACACGACGCTCTTCCGATCT
27 | >TruSeq_Small_RNA
28 | TGGAATTCTCGGGTGCCAAGG
29 | 


--------------------------------------------------------------------------------
/SequenceQC/SequenceQC.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/SequenceQC/SequenceQC.pdf


--------------------------------------------------------------------------------
/SequenceQC/images/BadLength.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/SequenceQC/images/BadLength.png


--------------------------------------------------------------------------------
/SequenceQC/images/GoodLength.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/SequenceQC/images/GoodLength.png


--------------------------------------------------------------------------------
/SequenceQC/images/IonTorrentQual.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/SequenceQC/images/IonTorrentQual.gif


--------------------------------------------------------------------------------
/Sequencing/Sequencing.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Sequencing/Sequencing.pdf


--------------------------------------------------------------------------------
/Sequencing/images/GrowthSRA_SeqCost.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Sequencing/images/GrowthSRA_SeqCost.png


--------------------------------------------------------------------------------
/Sequencing/images/SangerCrassphage.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Sequencing/images/SangerCrassphage.png


--------------------------------------------------------------------------------
/Sequencing/images/SangerCrassphageError.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Sequencing/images/SangerCrassphageError.png


--------------------------------------------------------------------------------
/Sequencing/images/image1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Sequencing/images/image1.png


--------------------------------------------------------------------------------
/Sequencing/images/image2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Sequencing/images/image2.png


--------------------------------------------------------------------------------
/Sequencing/images/image3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Sequencing/images/image3.png


--------------------------------------------------------------------------------
/Sequencing/images/image4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Sequencing/images/image4.png


--------------------------------------------------------------------------------
/Sequencing/images/image5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Sequencing/images/image5.png


--------------------------------------------------------------------------------
/Sequencing/images/image6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Sequencing/images/image6.png


--------------------------------------------------------------------------------
/Sequencing/images/sanger_seq.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Sequencing/images/sanger_seq.png


--------------------------------------------------------------------------------
/Sequencing/images/tagmentation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Sequencing/images/tagmentation.png


--------------------------------------------------------------------------------
/Snakemake/README.md:
--------------------------------------------------------------------------------
  1 | # Using snakemake
  2 | 
  3 | Snakemake gets its inspiration from the program _Make_.
  4 | _Make_ is an old program that was originally designed to compile and install software.
  5 | It was appropriated by bioinformaticians because the program is ideal for writing pipelines.
  6 | 
  7 | In other examples, we started with two fastq files, removed adapter sequences, mapped them to the human genome, and then separated out the human and not human sequences.
  8 | 
  9 | We can combine all of that into a single `snakemake` file, and it will do all of the steps for us.
 10 | 
 11 | [snakemake](https://snakemake.readthedocs.io/en/stable/) is a way to write reproducible code. There are lots of [tutorials about snakemake](https://www.google.com/search?q=snakemake+tutorial) and we have a lot of [Snakemake](https://edwards.flinders.edu.au/?s=snakemake) tutorials on the Edwards' lab website.
 12 | 
 13 | ### The dataset
 14 | 
 15 | You can use any dataset you like for this step, but for the example, we'll use the [CF data](../Datasets/CF)
 16 | 
 17 | ## Writing the snakefile
 18 | 
 19 | 
 20 | You can use any text editor to write your `snakefile`. I recommend `nano` on your linux machine:
 21 | 
 22 | ```bash
 23 | nano
 24 | ```
 25 | 
 26 | The main commands are shown at the bottom of the screen. In these commands, the `^` means press the `ctrl` key.
 27 | 
 28 | ```
 29 | ^G Get Help      ^O Write Out     ^W Where Is      ^K Cut Text      ^J Justify       ^C Cur Pos       M-U Undo         M-A Mark Text    M-] To Bracket   M-Q Previous     ^B Back
 30 | ^X Exit          ^R Read File     ^\ Replace       ^U Paste Text    ^T To Spell      ^_ Go To Line    M-E Redo         M-6 Copy Text    ^Q Where Was     M-W Next         ^F Forward
 31 | ```
 32 | 
 33 | 
 34 | Snakemake uses standard [python](https://www.python.org/) commands, and so if you have used python before you should find it familiar. There are some additions on top of python that you will need to learn.
 35 | 
 36 | Using the same concept as before, you start with your destination, _then_ you describe the journey.
 37 | __By default, Snakemake runs the first rule it encounters in the Snakefile.__
 38 | This is important because we use the first rule--`rule all`-- to declare the pipeline targets.
 39 | There are no outputs or shell command for rule all,
 40 | so the rule just tells Snakemake which files we need: here, the two trimmed fastq files in the `fastp` folder.
 41 | 
 42 | Create a file called `Snakefile` by using `nano`:
 43 | 
 44 | ```
 45 | nano Snakefile
 46 | ```
 47 | 
 48 | and add this text:
 49 | 
 50 | 
 51 | ```python
 52 | 
 53 | rule all:
 54 |     input:
 55 |         "fastp/788707_20180129_S_R1.fastq.gz",
 56 |         "fastp/788707_20180129_S_R2.fastq.gz"
 57 | 
 58 | rule run_fastp:
 59 |     input:
 60 |         r1 = "788707_20180129_S_R1.fastq.gz",
 61 |         r2 = "788707_20180129_S_R2.fastq.gz",
 62 |         ad = "IlluminaAdapters.fa"
 63 |     output:
 64 |         r1 = "fastp/788707_20180129_S_R1.fastq.gz",
 65 |         r2 = "fastp/788707_20180129_S_R2.fastq.gz"
 66 |     shell:
 67 |         """
 68 |         fastp -n 1 -l 100 -i {input.r1} -I {input.r2} -o {output.r1} -O {output.r2} --adapter_fasta {input.ad}
 69 |         """
 70 | ```
 71 | 
 72 | Exit `nano` by pressing `ctrl-x` and then pressing `y` to save the file.
 73 | 
 74 | This is your pipeline and Snakemake will recognise it when you run Snakemake.
 75 | The only thing you need to tell Snakemake at this stage is how many concurrent jobs to run.
 76 | For only 1 job at a time, you would run this:
 77 | 
 78 | ```bash
 79 | snakemake -j 1
 80 | ```
 81 | 
 82 | By default, `snakemake` looks for a file called `Snakefile` but that is not very imaginative, and you probably also want to call it something meaningful so you can find it again. You can use the `-s` flag to snakemake to get it to use a different file.
 83 | 
 84 | I normally name my snakefiles after the pipeline or process, and use a `.smk` file extension, so perhaps you might call this `preprocess.smk`. 
 85 | 
 86 | 
 87 | ```bash 
 88 | snakemake -s preprocess.smk -j 1
 89 | ```
 90 | 
 91 | # adding the minimap command
 92 | 
 93 | Edit your `Snakefile` so that it looks like this. 
 94 | 
 95 | *NOTE Make sure you add the additional line ("788707_20180129.bam.bai") to the `rule all` input!*
 96 | 
 97 | ```
 98 | nano Snakefile
 99 | ```
100 | 
101 | ```python
102 | 
103 | rule all:
104 |     input:
105 |         "fastp/788707_20180129_S_R1.fastq.gz",
106 |         "fastp/788707_20180129_S_R2.fastq.gz",
107 |         "788707_20180129.bam.bai"
108 | 
109 | rule run_fastp:
110 |     input:
111 |         r1 = "788707_20180129_S_R1.fastq.gz",
112 |         r2 = "788707_20180129_S_R2.fastq.gz",
113 |         ad = "IlluminaAdapters.fa"
114 |     output:
115 |         r1 = "fastp/788707_20180129_S_R1.fastq.gz",
116 |         r2 = "fastp/788707_20180129_S_R2.fastq.gz"
117 |     shell:
118 |         """
119 |         fastp -n 1 -l 100 -i {input.r1} -I {input.r2} -o {output.r1} -O {output.r2} --adapter_fasta {input.ad}
120 |         """
121 | 
122 | rule run_minimap:
123 |     input:
124 |         r1 = "fastp/788707_20180129_S_R1.fastq.gz",
125 |         r2 = "fastp/788707_20180129_S_R2.fastq.gz",
126 |         ref = "GCA_000001405.15_GRCh38_no_alt_plus_hs38d1_analysis_set.fna.gz"
127 |     output:
128 |         bam = "788707_20180129.bam"
129 |     shell:
130 |         """
131 |         minimap2 -t 16 --split-prefix=tmp$$ -a -xsr {input.ref} {input.r1} {input.r2} | samtools view -bh | samtools sort -o {output.bam}
132 |         """
133 | 
134 | rule index_bam:
135 |     input:
136 |         bam = "788707_20180129.bam"
137 |     output:
138 |         bai = "788707_20180129.bam.bai"
139 |     shell:
140 |         """
141 |         samtools index {input.bam}
142 |         """
143 | 
144 | ```
145 | 
146 | 
147 | This modified snakefile adds three additional commands:
148 | 
149 | 1. In the `rule all` we added the "788707_20180129.bam.bai" file that is the final output of the indexed bam file
150 | 2. The `rule run_minimap` performs the minimap alignment and creates the bam file
151 | 3. The `rule index_bam` indexes the bam file to make accessing it much faster
152 | 
153 | 
154 | *But note!* In this command, we don't request the creation of the file `788707_20180129.bam`, but because it is required as an input to the command `index_bam`, snakemake knows that it has to make that as an output file. The _only_ way to make that file is to run the rule `run_minimap`, and that requires as input the two `fastq` files in the `fastp` folder, and so that needs the rule `run_fastp` to complete! 
155 | 
156 | That's the beauty of snakemake, now you can always run the same commands.
157 | 
158 | 
159 | 
160 | 
161 | 
162 | 
163 | 
164 | 
165 | 
166 | 
167 | 
168 | 
169 | 


--------------------------------------------------------------------------------
/Snakemake/fastp.snakefile:
--------------------------------------------------------------------------------
 1 | """
 2 | 
 3 | An example snakefile
 4 | 
 5 | You may use this snakefile as you wish, but I am not liable if you do
 6 | use it!
 7 | 
 8 | Rob Edwards, July 2023
 9 | 
10 | """
11 | 
12 | 
13 | import os
14 | import sys
15 | 
16 | 
17 | # set this to whatever the name of your directory
18 | # with the reads is. If you are following along with the
19 | # tutorial, you can leave this as fastq
20 | READDIR = 'fastq'
21 | 
22 | # Note that this example requires an R1 file AND an R2 file
23 | # and that each file should match *_R1.* and *_R2.*
24 | SAMPLES, EXTENSIONS = glob_wildcards(os.path.join(READDIR, '{sample}_R1.{extentions}'))
25 | 
26 | # check there is something to do BEFORE indexing EXTENSIONS: with no matching files
27 | # both lists are empty and EXTENSIONS[0] would raise an IndexError instead of this message
28 | if len(SAMPLES) == 0:
29 |     sys.stderr.write("FATAL: We could not detect any samples at all.\n")
30 |     sys.stderr.write(f"Do you have a directory called {READDIR} with some fastq files in it?\n")
31 |     sys.stderr.write("Do those fastq files have _R1. and _R2.?\n")
32 |     sys.exit(1)  # non-zero exit status so that callers can tell the run failed
33 | 
34 | # just get the first file extension as we don't need to iterate all of them
35 | file_extension = EXTENSIONS[0]
36 | 
37 | # Request the trimmed R1 of every sample, named exactly as run_fastp writes it (same detected extension).
38 | rule all:
39 |     input:
40 |         expand(os.path.join("fastp", "{sample}_R1." + file_extension), sample=SAMPLES)
41 | # Run fastp on one sample's R1/R2 pair with the Illumina adapter fasta; all outputs go to fastp/.
42 | rule run_fastp:
43 |     input:
44 |         r1 = os.path.join(READDIR, "{sample}_R1." + file_extension),
45 |         r2 = os.path.join(READDIR, "{sample}_R2." + file_extension),
46 |         ad = "IlluminaAdapters.fa"
47 |     output:
48 |         r1 = os.path.join("fastp", "{sample}_R1." + file_extension),
49 |         r2 = os.path.join("fastp", "{sample}_R2." + file_extension),
50 |         # per-sample report names: every output of a wildcard rule must contain {sample}
51 |         fp = temporary(os.path.join("fastp", "{sample}.fastp.html")),
52 |         fj = temporary(os.path.join("fastp", "{sample}.fastp.json"))
53 |     shell:
54 |         """
55 |         fastp -n 1 -l 100 -i {input.r1} -I {input.r2} -o {output.r1} -O {output.r2} --adapter_fasta {input.ad} --html {output.fp} --json {output.fj}
56 |         """
57 | 
58 | 


--------------------------------------------------------------------------------
/Snakemake/filter_assemble.snakefile:
--------------------------------------------------------------------------------
 1 | rule all:
 2 |     input:
 3 |         "fastp/788707_20180129_S_R1.fastq.gz",
 4 |         "fastp/788707_20180129_S_R2.fastq.gz",
 5 |         "788707_20180129.bam.bai",
 6 |         "human/788707_20180129_S_R1.fastq.gz",
 7 |         "not_human/788707_20180129_S_R1.fastq.gz", "not_human_assembly"  # directory made by rule assemble; without this target the assembly never runs
 8 | # Run fastp on the raw R1/R2 pair with the adapter fasta, writing the processed reads into fastp/.
 9 | rule run_fastp:
10 |     input:
11 |         r1 = "788707_20180129_S_R1.fastq.gz",
12 |         r2 = "788707_20180129_S_R2.fastq.gz",
13 |         ad = "IlluminaAdapters.fa"
14 |     output:
15 |         r1 = "fastp/788707_20180129_S_R1.fastq.gz",
16 |         r2 = "fastp/788707_20180129_S_R2.fastq.gz"
17 |     shell:
18 |         """
19 |         fastp -n 1 -l 100 -i {input.r1} -I {input.r2} -o {output.r1} -O {output.r2} --adapter_fasta {input.ad}
20 |         """
21 | # Map the fastp reads to the GRCh38 reference with minimap2 (-x sr preset, SAM output via -a) and write a sorted BAM.
22 | rule run_minimap:
23 |     input:
24 |         r1 = "fastp/788707_20180129_S_R1.fastq.gz",
25 |         r2 = "fastp/788707_20180129_S_R2.fastq.gz",
26 |         ref = "GCA_000001405.15_GRCh38_no_alt_plus_hs38d1_analysis_set.fna.gz"
27 |     output:
28 |         bam = "788707_20180129.bam"
29 |     shell:
30 |         """
31 |         minimap2 -t 16 --split-prefix=tmp$$ -a -xsr {input.ref} {input.r1} {input.r2} | samtools view -bh | samtools sort -o {output.bam}
32 |         """
33 | # Index the sorted BAM; the command names no index file, so the declared output relies on samtools' default <bam>.bai name.
34 | rule index_bam:
35 |     input:
36 |         bam = "788707_20180129.bam"
37 |     output:
38 |         bai = "788707_20180129.bam.bai"
39 |     shell:
40 |         """
41 |         samtools index {input.bam}
42 |         """
43 | # Mapped (human) reads: -F 3588 drops unmapped (4), QC-fail (512), duplicate (1024), supplementary (2048); -f 65 / -f 129 select paired first / second mates.
44 | rule human:
45 |     input:
46 |         bam = "788707_20180129.bam"
47 |     output:
48 |         r1 = "human/788707_20180129_S_R1.fastq.gz",
49 |         r2 = "human/788707_20180129_S_R2.fastq.gz"
50 |     shell:
51 |         """
52 |         samtools fastq -F 3588 -f 65 {input.bam} | gzip -c > {output.r1};
53 |         samtools fastq -F 3588 -f 129 {input.bam} | gzip -c > {output.r2};
54 |         """
55 | # Pairs where neither mate mapped: -f 77 / -f 141 = paired (1) + unmapped (4) + mate unmapped (8) + first (64) / second (128); -F 3584 drops QC-fail, duplicate, supplementary.
56 | rule not_human:
57 |     input:
58 |         bam = "788707_20180129.bam"
59 |     output:
60 |         r1 = "not_human/788707_20180129_S_R1.fastq.gz",
61 |         r2 = "not_human/788707_20180129_S_R2.fastq.gz"
62 |     shell:
63 |         """
64 |         samtools fastq -F 3584 -f 77 {input.bam} | gzip -c > {output.r1};
65 |         samtools fastq -F 3584 -f 141 {input.bam} | gzip -c > {output.r2};
66 |         """
67 | 
68 | rule assemble:  # run SPAdes in --meta mode on the not_human read pair
69 |     input:
70 |         r1 = "not_human/788707_20180129_S_R1.fastq.gz",
71 |         r2 = "not_human/788707_20180129_S_R2.fastq.gz"
72 |     output:
73 |         directory("not_human_assembly")
74 |     shell:  # write to {output} so the assembly directory is named in exactly one place
75 |         """
76 |         spades.py --meta -1 {input.r1} -2 {input.r2} -o {output} -t 8
77 |         """
78 | 


--------------------------------------------------------------------------------
/Snakemake/prinseq.snakefile:
--------------------------------------------------------------------------------
 1 | """
 2 | 
 3 | An example snakefile written for use on deepthought.
 4 | 
 5 | You may use this snakefile as you wish, but I am not liable if you do
 6 | use it!
 7 | 
 8 | Rob Edwards, September 2020
 9 | 
10 | """
11 | 
12 | 
13 | import os
14 | import sys
15 | 
16 | 
17 | # set this to whatever the name of your directory
18 | # with the reads is. If you are following along with the
19 | # tutorial, you can leave this as fastq
20 | READDIR = 'fastq'
21 | 
22 | # Note that this example requires an R1 file AND an R2 file
23 | # and that each file should match *_R1.* and *_R2.* 
24 | SAMPLES,EXTENSIONS = glob_wildcards(os.path.join(READDIR, '{sample}_R1.{extentions}'))
25 | 
26 | # just get the first file extension as we don't need to iterate all of them
27 | file_extension = EXTENSIONS[0]
28 | 
29 | 
30 | # just check there is something to actually do!
31 | if len(SAMPLES) == 0:
32 |     sys.stderr.write("FATAL: We could not detect any samples at all.\n")
33 |     sys.stderr.write(f"Do you have a directory called {READDIR} with some fastq files in it?\n")
34 |     sys.stderr.write("Do those fastq files have _R1. and _R2.?\n")
35 |     sys.exit()
36 | 
37 | 
38 | rule all:  # default target: the prinseq "good" R1 file for every sample found in READDIR
39 |     input:
40 |         expand(os.path.join("prinseq", "{sample}_R1.good.fastq"), sample=SAMPLES)
41 | 
42 | rule run_prinseq:  # run prinseq++ on one R1/R2 pair; writes good, single and bad fastq files for each mate
43 |     input:
44 |         r1 = os.path.join(READDIR, "{sample}_R1." + file_extension),
45 |         r2 = os.path.join(READDIR, "{sample}_R2." + file_extension)
46 |     output:
47 |         r1 = os.path.join("prinseq", "{sample}_R1.good.fastq"),
48 |         s1 = os.path.join("prinseq", "{sample}_R1.single.fastq"),
49 |         b1 = os.path.join("prinseq", "{sample}_R1.bad.fastq"),
50 |         r2 = os.path.join("prinseq", "{sample}_R2.good.fastq"),
51 |         s2 = os.path.join("prinseq", "{sample}_R2.single.fastq"),
52 |         b2 = os.path.join("prinseq", "{sample}_R2.bad.fastq")
53 |     shell:  # NOTE(review): -ns_max_n is passed twice below (1, then 5) -- confirm which limit is intended
54 |         """
55 |         prinseq++ -min_len 60 -min_qual_mean 25 -ns_max_n 1 -derep 1 \
56 |             -out_format 0 -trim_tail_left 5 -trim_tail_right 5 \
57 |             -ns_max_n 5  -trim_qual_type min -trim_qual_left 30 \
58 |             -trim_qual_right 30 -trim_qual_window 10 \
59 |             -out_good {output.r1} -out_single {output.s1} -out_bad {output.b1} \
60 |             -out_good2 {output.r2} -out_single2 {output.s2} -out_bad2 {output.b2} \
61 |             -fastq {input.r1} \
62 |             -fastq2 {input.r2};
63 |         """
64 | 
65 | 
66 | 


--------------------------------------------------------------------------------
/UPDATING.md:
--------------------------------------------------------------------------------
 1 | # How to update this manual
 2 | 
 3 | If you wish to contribute to the manual, please contact Rob. He'll welcome all kinds of help! The easiest way to do so is to clone the git repository, and then make your changes in your clone. Once you are done with that, make a pull request here and Rob can merge your changes with his.
 4 | 
 5 | If you are not sure how to do any of that, ask Rob and he'll point you in the right direction to get started.
 6 | 
 7 | 
 8 | # Creating PDFs
 9 | 
10 | In each directory we include PDFs of the current state of the README. These are made with pandoc, and you can recreate the PDF at any time using the command (of course, changing FILE to the appropriate name):
11 | 
12 | ```
13 | pandoc README.md -f markdown --latex-engine=xelatex --columns 100 --smart -s -o FILE.pdf
14 | ```
15 | 
16 | or for all directories:
17 | 
18 | ```
19 | for DIR in $(find -maxdepth 1 -type d | sed -e 's/.\/\..*//; s/^\.$//; s/\.\///'); do
20 | 	echo $DIR
21 | 	cd $DIR
22 | 	for FILE in *.md; do
23 | 		if [ $FILE eq "README.md" ]; then
24 | 			$OUTPUT=$DIR;
25 | 		else
26 | 			$OUTPUT=$(echo $FILE | sed -e 's/.md//');
27 | 		fi
28 | 		echo -e "\tOUTPUT FILE FOR $FILE IS $OUTPUT";
29 | 
30 | 		if [ ! -e $OUTPUT.pdf ] || [ "$FILE" -nt "$OUTPUT.pdf" ]; then
31 | 			echo -e  "\nCreating PDF of $FILE in $DIR";
32 | 			#pandoc $FILE -f markdown --latex-engine=xelatex --columns 100 --smart -s -o $OUTPUT.pdf
33 | 		fi
34 | 	done
35 | 	cd ../
36 | done
37 | ```
38 | 
39 | ```bash
40 | for DIR in $(find -maxdepth 1 -type d | sed -e 's/.\/\..*//; s/^\.$//; s/\.\///'); do echo $DIR; cd $DIR; if [ ! -e $DIR.pdf ] || [ "README.md" -nt "$DIR.pdf" ]; then echo -e "\tCreating PDF in $DIR"; pandoc README.md -f markdown --latex-engine=xelatex --columns 100 --smart -s -o $DIR.pdf; fi; cd ../; done
41 | ```
42 | 


--------------------------------------------------------------------------------
/UPDATING.sh:
--------------------------------------------------------------------------------
 1 | for DIR in $(find -maxdepth 1 -type d | sed -e 's/.\/\..*//; s/^\.$//; s/\.\///'); do
 2 | 	echo $DIR
 3 | 	cd $DIR
 4 | 	for FILE in *.md; do
 5 | 		if [ $FILE == "README.md" ]; then
 6 | 			OUTPUT=$DIR;
 7 | 		else
 8 | 			OUTPUT=$(echo $FILE | sed -e 's/.md//');
 9 | 		fi
10 | 
11 | 		if [ ! -e $OUTPUT.pdf ] || [ "$FILE" -nt "$OUTPUT.pdf" ]; then
12 | 			echo -e  "\tCreating PDF of $FILE as $OUTPUT.pdf in $DIR";
13 | 			pandoc $FILE -f markdown --latex-engine=xelatex --columns 100 --smart -s -o $OUTPUT.pdf
14 | 		fi
15 | 	done
16 | 	cd ../
17 | done
18 | 


--------------------------------------------------------------------------------
/UPGRADE.md:
--------------------------------------------------------------------------------
 1 | # Upgrading Images
 2 | 
 3 | We strive to keep the SDSU Computational Genomics Image up to date, and sometimes that means updating the system software, and sometimes fixing installation bugs and issues. From time to time you may need to upgrade the image.
 4 | 
 5 | There are a couple of steps that can make life easier for you!
 6 | 
 7 | If you have a running image with some data on it, leave it running. Now, launch a new instance of the latest version of our image. You now have two running instances, an old instance with your data and a shiny new instance with no data on it.
 8 | 
 9 | We're going to copy the data across, but before we do that we need to move the private [ssh key](Linux#public-and-private-ssh-keys) onto the server (but don't worry, because we're going to delete the server shortly).
10 | 
11 | scp your `pem` file from your laptop to the server.
12 | 
13 | ```bash
14 | scp -i id_rsa.pem ~/.ssh/id_rsa.pem ec2-user@xxx.xxx.xxx.xxx:~/.ssh/
15 | ```
16 | _Note_ You need to change ~/.ssh/id_rsa.pem to the location of your `pem` file that you acknowledge that you have when you start an AWS instance. Also, you need to change `xxx.xxx.xxx.xxx` to the IP of the older server that still has your data on it.
17 | 
18 | Next, login to the older server
19 | 
20 | ```bash
21 | ssh xxx.xxx.xxx.xxx
22 | ```
23 | 
24 | and then scp the data to the new server. Note that you need to tell ssh where your secret `pem` file is located:
25 | 
26 | ```bash
27 | scp -i ~/.ssh/file.pem -r * ec2-user@yyy.yyy.yyy.yyy:
28 | ```
29 | _Note_ change `yyy.yyy.yyy.yyy` to the ip address of the new server. Also, don't forget the colon on the end of the line!
30 | 
31 | This will ask you if you want to accept the identity of the new server. Go ahead and type `yes`.
32 | 
33 | If you login to the new server, you should see all your data there! If not, take a look at the old server and see what you did wrong ... did you forget the `:` on the end of the line?
34 | 
35 | Once you have all the data on the new server, you can log back into the AWS console. Select the older server and choose `Actions --> Instance State --> Terminate`. That will delete it, and everything including your private key.
36 | 


--------------------------------------------------------------------------------
/Workshops/COMBINE_QLD_2024_binchicken.md:
--------------------------------------------------------------------------------
 1 | 
 2 | ## Hands-on with Bin Chicken
 3 | 
 4 | ### Bin Chicken installation and setup
 5 | 
 6 | See https://aroneys.github.io/binchicken for more details.
 7 | 
 8 | ```bash
 9 | mamba create -n binchicken -c bioconda -c conda-forge 'binchicken>=0.12.5'
10 | conda activate binchicken
11 | binchicken build \
12 |   --conda-prefix /storage/data/.conda \
13 |   --singlem-metapackage /storage/data/metapackage \
14 |   --checkm2-db /storage/data/checkm2
15 | ```
16 | 
17 | ### Single-sample assembly with multi-sample binning
18 | 
19 | Start by running `binchicken single` to prepare the data to assemble each sample individually.
20 | 
21 | ```bash
22 | binchicken single \
23 |     --forward 788707_20171213_S_R1.fastq.gz 788707_20180129_S_R1.fastq.gz 788707_20180313_S_R1.fastq.gz 788707_20181126_S_R1.fastq.gz \
24 |     --reverse 788707_20171213_S_R2.fastq.gz 788707_20180129_S_R2.fastq.gz 788707_20180313_S_R2.fastq.gz 788707_20181126_S_R2.fastq.gz \
25 |     --output single_assembly
26 | ```
27 | 
28 | The suggested assemblies with their respective binning samples can be found at `single_assembly/coassemble/target/elusive_clusters.tsv`.
29 | In this case, only two of the samples are considered likely to recover genomes. These samples are 788707_20180313_S and 788707_20180129_S.
30 | The other samples are probably too small (they are heavily subsampled) to recover genomes.
31 | 
32 | The actual assembly and binning can be run by adding `--run-aviary`.
33 | Note that with 1 core, the assemblies will take ~30 minutes each.
34 | 
35 | ```bash
36 | binchicken single \
37 |     --forward 788707_20171213_S_R1.fastq.gz 788707_20180129_S_R1.fastq.gz 788707_20180313_S_R1.fastq.gz 788707_20181126_S_R1.fastq.gz \
38 |     --reverse 788707_20171213_S_R2.fastq.gz 788707_20180129_S_R2.fastq.gz 788707_20180313_S_R2.fastq.gz 788707_20181126_S_R2.fastq.gz \
39 |     --output single_assembly --run-aviary --cores 5
40 | ```
41 | 
42 | The assembly and binning for each sample is found at `single_assembly/coassemble/coassemble/`.
43 | Each sample should have a folder containing `assemble` for the assembly and `recover` for the binning.
44 | The bins for each sample are found in `recover/bins`, with genome info at `recover/bins/bin_info.tsv`.
45 | 
46 | The recovered bins are likely only ~40-60% complete, with fairly high contamination.
47 | This is probably due to the small sample size, but the genomes could still be analysed further with e.g. GTDB-Tk to find out their taxonomy.
48 | 
49 | ### Coassembly with multi-sample binning
50 | 
51 | Now that we have run single-sample assembly for the decent samples, we can run coassembly across the dataset.
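52 | Because we have not yet added `--run-aviary`, this first command only suggests which samples to coassemble; nothing is assembled yet.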
53 | 
54 | ```bash
55 | binchicken coassemble \
56 |     --forward 788707_20171213_S_R1.fastq.gz 788707_20180129_S_R1.fastq.gz 788707_20180313_S_R1.fastq.gz 788707_20181126_S_R1.fastq.gz \
57 |     --reverse 788707_20171213_S_R2.fastq.gz 788707_20180129_S_R2.fastq.gz 788707_20180313_S_R2.fastq.gz 788707_20181126_S_R2.fastq.gz \
58 |     --output coassembly --max-coassembly-samples 5
59 | ```
60 | 
61 | The suggested coassemblies with their respective binning samples can be found at `coassembly/coassemble/target/elusive_clusters.tsv`.
62 | Since they share single-copy marker genes, the samples 788707_20180129_S and 788707_20180313_S are suggested for coassembly.
63 | 
64 | We can run the actual coassembly and binning as before by adding `--run-aviary`.
65 | Note that with 1 core, the coassembly will take ~1 hour.
66 | 
67 | ```bash
68 | binchicken coassemble \
69 |     --forward 788707_20171213_S_R1.fastq.gz 788707_20180129_S_R1.fastq.gz 788707_20180313_S_R1.fastq.gz 788707_20181126_S_R1.fastq.gz \
70 |     --reverse 788707_20171213_S_R2.fastq.gz 788707_20180129_S_R2.fastq.gz 788707_20180313_S_R2.fastq.gz 788707_20181126_S_R2.fastq.gz \
71 |     --output coassembly --max-coassembly-samples 5 --run-aviary --cores 5
72 | ```
73 | 
74 | ## References
75 | 
76 | - Alneberg, J., Bjarnason, B.S., de Bruijn, I., Schirmer, M., Quick, J., Ijaz, U.Z., Lahti, L., Loman, N.J., Andersson, A.F., Quince, C., 2014. Binning metagenomic contigs by coverage and composition. Nat. Methods 11, 1144–1146. https://doi.org/10.1038/nmeth.3103
77 | - Ayling, M., Clark, M.D., Leggett, R.M., 2020. New approaches for metagenome assembly with short reads. Brief. Bioinform. 21, 584–594. https://doi.org/10.1093/bib/bbz020
78 | - Kang, D.D., Froula, J., Egan, R., Wang, Z., 2015. MetaBAT, an efficient tool for accurately reconstructing single genomes from complex microbial communities. PeerJ. https://doi.org/10.7717/peerj.1165
79 | - Kang, D.D., Li, F., Kirton, E., Thomas, A., Egan, R., An, H., Wang, Z., 2019. MetaBAT 2: an adaptive binning algorithm for robust and efficient genome reconstruction from metagenome assemblies. PeerJ 7, e7359. https://doi.org/10.7717/peerj.7359
80 | - Li, D., Liu, C.-M., Luo, R., Sadakane, K., Lam, T.-W., 2015. MEGAHIT: an ultra-fast single-node solution for large and complex metagenomics assembly via succinct de Bruijn graph. Bioinformatics 31, 1674–1676. https://doi.org/10.1093/bioinformatics/btv033
81 | - Mallawaarachchi, V., Wickramarachchi, A., Xue, H., Papudeshi, B., Grigson, S.R., Bouras, G., Prahl, R.E., Kaphle, A., Verich, A., Talamantes-Becerra, B., Dinsdale, E.A., Edwards, R.A., 2024. Solving genomic puzzles: computational methods for metagenomic binning. Brief. Bioinform. 25, bbae372. https://doi.org/10.1093/bib/bbae372
82 | - Nissen, J.N., Johansen, J., Allesøe, R.L., Sønderby, C.K., Armenteros, J.J.A., Grønbech, C.H., Jensen, L.J., Nielsen, H.B., Petersen, T.N., Winther, O., Rasmussen, S., 2021. Improved metagenome binning and assembly using deep variational autoencoders. Nat. Biotechnol. 39, 555–560. https://doi.org/10.1038/s41587-020-00777-4
83 | - Nurk, S., Meleshko, D., Korobeynikov, A., Pevzner, P.A., 2017. metaSPAdes: a new versatile metagenomic assembler. Genome Res. 27, 824–834. https://doi.org/10.1101/gr.213959.116
84 | - Pan, S., Zhao, X.-M., Coelho, L.P., 2023. SemiBin2: self-supervised contrastive learning leads to better MAGs for short- and long-read sequencing. Bioinformatics 39, i21–i29. https://doi.org/10.1093/bioinformatics/btad209
85 | - Pan, S., Zhu, C., Zhao, X.-M., Coelho, L.P., 2022. A deep siamese neural network improves metagenome-assembled genomes in microbiome datasets across different environments. Nat. Commun. 13, 2326. https://doi.org/10.1038/s41467-022-29843-y
86 | - Sieber, C.M.K., Probst, A.J., Sharrar, A., Thomas, B.C., Hess, M., Tringe, S.G., Banfield, J.F., 2018. Recovery of genomes from metagenomes via a dereplication, aggregation and scoring strategy. Nat. Microbiol. 3, 836–843. https://doi.org/10/gfwwfg
87 | - Wang, Z., You, R., Han, H., Liu, W., Sun, F., Zhu, S., 2024. Effective binning of metagenomic contigs using contrastive multi-view representation learning. Nat. Commun. 15, 585. https://doi.org/10.1038/s41467-023-44290-z
88 | - Wu, Y.-W., Simmons, B.A., Singer, S.W., 2016. MaxBin 2.0: an automated binning algorithm to recover genomes from multiple metagenomic datasets. Bioinforma. Oxf. Engl. 32, 605–607. https://doi.org/10.1093/bioinformatics/btv638
89 | 


--------------------------------------------------------------------------------
/Workshops/README.md:
--------------------------------------------------------------------------------
1 | # Workshops
2 | 
3 | We teach a variety of workshops from one to a few days covering this material. Here are the instructions for some of them:
4 | 
5 | * [SAGC 2021](SAGC2021.md)
6 | * [INRB Kinshasa 2023](INRB2023.md)
7 | * [COMBINE Workshop 2024 QLD](COMBINE_QLD_2024.md)
8 | * [COMBINE Workshop 2024 WA](COMBINE_WA_2024.md)
9 | 


--------------------------------------------------------------------------------
/Workshops/images/bandage.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Workshops/images/bandage.png


--------------------------------------------------------------------------------
/Workshops/images/k119_21323.allsamples.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Workshops/images/k119_21323.allsamples.png


--------------------------------------------------------------------------------
/Workshops/images/k119_21323.threesamples.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Workshops/images/k119_21323.threesamples.png


--------------------------------------------------------------------------------
/Workshops/images/megahit_bins.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Workshops/images/megahit_bins.png


--------------------------------------------------------------------------------
/Workshops/images/metagenomics_map_uq_2024.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Workshops/images/metagenomics_map_uq_2024.png


--------------------------------------------------------------------------------
/Workshops/makeing_machines.md:
--------------------------------------------------------------------------------
  1 | # Making machines for the workshops
  2 | 
  3 | We usually use [Google Cloud](https://console.google.com), [AWS](https://us-west-2.signin.aws.amazon.com/) or [Nectar](https://dashboard.rc.nectar.org.au/)
  4 | 
  5 | - When booting a new machine, make sure that you allow SSH access in the security groups. It needs a group that provides port 22 access to all IP addresses (0.0.0.0/0)
  6 | - Make sure you add additional space for user accounts, usually as an external volume:
  7 |     - mount the volume in the web server (e.g. as `/dev/vdb1`)
  8 |     - use `fdisk /dev/vdb` to make a new partition
  9 |     - use `mkfs.ext4 /dev/vdb1` to format that
 10 |     - mount it on `/storage`: `mkdir /storage && mount /dev/vdb1 /storage`
 11 | - make user and data directories: `mkdir /storage/users/ /storage/data`
 12 | - set the ubuntu password so you can enable passwords: `sudo passwd ubuntu`
 13 | - enable passwords:
 14 | 
 15 | ```
 16 | sudo vi /etc/ssh/sshd_config
 17 | # Change PasswordAuthentication no to yes.
 18 | # Save and exit
 19 | sudo /etc/init.d/sshd reload
 20 | 
 21 | # for debian use
 22 | service sshd reload
 23 | 
 24 | # while you are at it:
 25 | 
 26 | apt update && apt install build-essential && apt -y dist-upgrade 
 27 | 
 28 | ```
 29 | 
 30 | 
 31 | 
 32 | 
 33 | 
 34 | - create the users with [create_newusers.py](https://raw.githubusercontent.com/linsalrob/EdwardsLab/master/bin/create_newusers.py). Note that this version sets their home directory to `/storage/users/$USER`
 35 | 
 36 | ```
 37 | python ~/GitHubs/EdwardsLab/bin/create_newusers.py -m storage/users -n 200 -s 5
 38 | ```
 39 | 
 40 | - copy these files across to the ubuntu machine and use `newusers` to setup the accounts:
 41 | 
 42 | ```
 43 | scp -i ~/.ssh/xxxxxxxxxxxxx_rsa users.tsv accounts.tsv ubuntu@aaa.bbb.ccc.ddd:
 44 | # login to the host
 45 | sudo newusers users.tsv
 46 | ```
 47 | 
 48 | - check that you can log in with e.g. user number 100.
 49 | 
 50 | 
 51 | # Installing pony
 52 | 
 53 | We are trying [pony linux](https://github.com/NCGAS/PonyLinux/blob/master/installation_instructions.md) 
 54 | 
 55 | ## install software
 56 | 
 57 | - install git/texinfo: `apt-get update && apt-get install -y git texinfo`
 58 | - install ponysay: 
 59 | 
 60 | ```
 61 | cd /root
 62 | git clone https://github.com/erkin/ponysay.git
 63 | cd ponysay/
 64 | python3 setup.py --freedom=partial install
 65 | ```
 66 | 
 67 | Now install PonyLinux somewhere everyone can use it:
 68 | 
 69 | ```
 70 | cd /storage
 71 | mkdir git
 72 | cd git
 73 | git clone https://github.com/NCGAS/PonyLinux.git
 74 | ```
 75 | 
 76 | Next, link this to each users home directory:
 77 | 
 78 | ```
 79 | cd /storage/users/
 80 | for USER in user*; do ln -s /storage/git/PonyLinux $USER/PonyLinux; chown $USER:$USER $USER/PonyLinux; done 
 81 | ```
 82 | 
 83 | Now when they log in, they can `cd PonyLinux; ./PonyLinux.sh`
 84 | 
 85 | 
 86 | # Install bioconda
 87 | 
 88 | Download miniforge:
 89 | 
 90 | 
 91 | ```
 92 | curl -LO https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh
 93 | bash  Miniforge3-Linux-x86_64.sh
 94 | ```
 95 | 
 96 | 
 97 | Make a ~/.bash_profile file
 98 | 
 99 | ```
100 | vi bash_profile
101 | 
102 | ```
103 | 
104 | Paste this text:
105 | ```
106 | if [ -f ~/.bashrc ]; then    
107 | 	. ~/.bashrc
108 | fi
109 | ```
110 | 
111 | Then copy to all users:
112 | 
113 | ```
114 | for USER in user0*; do cp bash_profile $USER/.bash_profile; chown $USER:$USER $USER/.bash_profile; done
115 | ```
116 | 
117 | 
118 | # Install binchicken and download databases
119 | 
120 | 
121 | ```
122 | mamba create -n binchicken -y -c bioconda -c conda-forge "binchicken>=0.12.5"
123 | 
124 | mamba activate binchicken
125 | binchicken build  --conda-prefix miniforge3  --singlem-metapackage metapackage --checkm2-db checkm2 --download-databases
126 | 
127 | ```
128 | 
129 | 
130 | 
131 | 
132 | 
133 | 
134 | 
135 | 
136 | 
137 | 
138 | 


--------------------------------------------------------------------------------
/Workshops/snakemake/README.md:
--------------------------------------------------------------------------------
1 | # Snakemake scripts
2 | 
3 | You should use these to run everything in one go, especially if you use a cluster!
4 | 
5 | - [process_metagenomesSAGC_conda.snakefile](process_metagenomesSAGC_conda.snakefile) A snakefile that uses conda (add the `--use-conda` flag) and installs the software you need. You will also need to download and extract [envs.zip](envs.zip), the conda environments.
6 | - [process_metagenomesSAGC.snakefile](process_metagenomesSAGC.snakefile) A snakefile that assumes you have everything installed. This may not work!
7 | 


--------------------------------------------------------------------------------
/Workshops/snakemake/envs.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/Workshops/snakemake/envs.zip


--------------------------------------------------------------------------------
/Workshops/snakemake/envs/focus.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 |     - bioconda
3 | dependencies:
4 |     - focus
5 | 


--------------------------------------------------------------------------------
/Workshops/snakemake/envs/kraken.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 |     - bioconda
3 | dependencies:
4 |     - kraken2
5 | 


--------------------------------------------------------------------------------
/Workshops/snakemake/envs/megahit.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 |     - bioconda
3 | dependencies:
4 |     - megahit
5 | 


--------------------------------------------------------------------------------
/Workshops/snakemake/envs/minimap.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 |     - bioconda
3 | dependencies:
4 |     - minimap2
5 |     - samtools
6 | 


--------------------------------------------------------------------------------
/Workshops/snakemake/envs/prinseq.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 |     - conda-forge
3 |     - bioconda
4 | dependencies:
5 |     - prinseq-plus-plus
6 | 


--------------------------------------------------------------------------------
/Workshops/snakemake/envs/samtools.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 |     - bioconda
3 | dependencies:
4 |     - samtools
5 | 


--------------------------------------------------------------------------------
/Workshops/snakemake/envs/superfocus.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 |     - bioconda
3 | dependencies:
4 |     - super-focus
5 | 


--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-slate


--------------------------------------------------------------------------------
/tRNA_rRNA/README.md:
--------------------------------------------------------------------------------
 1 | # Finding tRNA genes
 2 | 
 3 | For finding tRNA genes we recommend [tRNA Scan](http://eddylab.org/software.html). This uses a stochastic context free grammar to identify the tRNA genes. By default, tRNA scan tries to find eukaryotic tRNA genes, but you can easily change the specification to bacterial tRNA genes using the -B flag:
 4 | 
 5 | ```bash
 6 | tRNAscan-SE -B -o trna.out assembly/scaffolds.fasta
 7 | ```
 8 | 
 9 | You should hopefully find one tRNA gene per codon (at least) in your genome!
10 | 
11 | _Note_ that the Lowe lab at UCSC has an [online tRNA Scan server](http://lowelab.ucsc.edu/tRNAscan-SE/), although you may be restricted by its upload limits.
12 | 
13 | _Note 2:_ You will need at least version 5 of the SDSU Computational Genomics Image to run tRNAscan-SE. You may need to [upgrade your image](../UPGRADE.md).
14 | 
15 | # Finding rRNA genes
16 | 
17 | For rRNA genes, we recommend [barrnap](https://github.com/tseemann/barrnap) for ribosomal RNA predictions.
18 | 
19 | barrnap has pre-built hidden Markov models for ribosomal RNA genes from Bacteria, Archaea, or Eukarya, and uses `nhmmer`, part of the [HMMER 3.1](http://hmmer.org/) suite, to compare your sequences to the prebuilt hidden Markov model profiles.
20 | 
21 | barrnap is already installed on the AWS image and you can easily run it with this command:
22 | 
23 | ```bash
24 | barrnap assembly/scaffolds.fasta
25 | ```
26 | 
27 | The output will include some information about the program, and then the locations of the ribosomal RNA genes that have been identified. Recall that bacteria should contain 5S, 16S, and 23S rRNA genes. One of the problems that you are likely to run into is that the ribosomal RNA genes are so similar that the assembly algorithms tend to break at rRNA genes. Recall, from [sequence assembly](../SequenceAssembly) that assembly algorithms have a big problem with highly repetitive sequences.
28 | 
29 | If you are assembling a genome, you should pull out the sequences that map to the ribosomal RNA genes and assemble them separately, and then figure out the organization of the genome using a biological method like PCR or a long range sequencing method like PacBio or Nanopore.


--------------------------------------------------------------------------------
/tRNA_rRNA/tRNA_rRNA.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/b8fe88e86e1fcbaeea988ec583a2d7d7c16a564a/tRNA_rRNA/tRNA_rRNA.pdf


--------------------------------------------------------------------------------