├── .gitattributes
├── .gitignore
├── .gitmodules
├── ANI
    ├── C
    │   ├── Version1.1
    │   │   ├── Libraries
    │   │   │   ├── FNACharactersLib.c
    │   │   │   ├── FNACharactersLib.h
    │   │   │   ├── levenshteinDistanceLib.c
    │   │   │   ├── levenshteinDistanceLib.h
    │   │   │   ├── numberOfLinesLib.c
    │   │   │   └── numberOfLinesLib.h
    │   │   ├── Main
    │   │   │   └── main.c
    │   │   └── Tests
    │   │   │   ├── ANITestCase
    │   │   │       └── main.c
    │   │   │   ├── TESTS.txt
    │   │   │   ├── fnaFiles
    │   │   │       ├── 5mers.fna
    │   │   │       ├── 5mers2.fna
    │   │   │       ├── NC_021215.fna
    │   │   │       └── NC_022886.fna
    │   │   │   └── kmersTestCase
    │   │   │       └── main.c
    │   ├── Version1.2
    │   │   ├── Libraries
    │   │   │   ├── FNACharactersLib.c
    │   │   │   ├── FNACharactersLib.h
    │   │   │   ├── levenshteinDistanceLib.c
    │   │   │   ├── levenshteinDistanceLib.h
    │   │   │   ├── numberOfLinesLib.c
    │   │   │   └── numberOfLinesLib.h
    │   │   ├── Main
    │   │   │   └── main.c
    │   │   └── Tests
    │   │   │   └── TESTS.txt
    │   └── Version1
    │   │   ├── Libraries
    │   │       ├── FNACharactersLib
    │   │       │   ├── FNACharactersLib.c
    │   │       │   └── FNACharactersLib.h
    │   │       ├── levenshteinDistanceLib
    │   │       │   ├── levenshteinDistanceLib.c
    │   │       │   └── levenshteinDistanceLib.h
    │   │       ├── numberOfLinesLib
    │   │       │   ├── numberOfLinesLib.c
    │   │       │   └── numberOfLinesLib.h
    │   │       └── queryKmersLib
    │   │       │   ├── queryKmersLib.c
    │   │       │   └── queryKmersLib.h
    │   │   ├── Main
    │   │       ├── README
    │   │       └── main.c
    │   │   └── Tests
    │   │       ├── ANI
    │   │           ├── 5mers.fna
    │   │           ├── 5mers2.fna
    │   │           └── main.c
    │   │       ├── TESTS
    │   │       └── kmers
    │   │           └── 5mers
    │   │               ├── 5mers.fna
    │   │               └── main.c
    └── Python
    │   └── Version1.2
    │       └── GUI
    │           └── Main
    │               ├── README
    │               └── aniGUI.py
├── Adapters
    └── IlluminaAdapters.fa
├── AlphaFold
    ├── best_scores.py
    └── ranking_debug.json
├── AustralianMetagenomes
    ├── README.md
    └── australian_metagenomes.txt
├── AuthorInformation
    ├── author.py
    └── parse_addresses.py
├── Bangers
    ├── JQ995537.faa.gz
    ├── README.md
    ├── amino_acids.txt
    ├── bangers.c
    ├── kseq.h
    └── test.out
├── CF
    ├── count_coverage.py
    └── filter_metagenomes.py
├── CommunityAssembly
    ├── README.md
    ├── count.py
    └── distribution.py
├── ENA
    └── parse_ENA_xml.py
├── Flinders
    ├── __init_.py
    ├── alignment_score.py
    ├── cd-hit-cluster-sizes.py
    ├── cd-hit-to-clusters.py
    ├── kmer_sim.py
    ├── plot_scores_3d.py
    ├── score_independently.py
    └── substitution_rules.py
├── GregFrederickson
    ├── parse_xls_to_taxonomy.py
    └── patric_to_tax.py
├── Jody
    └── blast2subsys.py
├── JohnMolkili
    ├── correlate_contigs.py
    ├── count_reads_to_contigs.py
    ├── join.py
    ├── separate_contig_by_group.py
    └── sequence_coverage.py
├── LICENSE
├── LizMetagenomes
    ├── README.md
    ├── envs
    │   ├── focus.yaml
    │   ├── prinseq.yaml
    │   └── superfocus.yaml
    ├── process_shark_metagenomes.snakefile
    └── read_destiny.py
├── ModelSEED
    ├── json_keys.py
    ├── json_keys_keys.py
    ├── json_list_keys.py
    └── parse_biochemistry.py
├── PAF
    └── summarize_hits.py
├── PythonClass
    ├── parse_genbank.py
    ├── plot_blast.py
    ├── random_sequence.py
    └── sequence.gb
├── RAST
    ├── RAST-alljobs.pl
    ├── RAST-jobs.pl
    ├── RAST-retrieve-jobs.pl
    ├── RAST-status.pl
    ├── RAST-submit-jobs.pl
    └── make_assigned_functions.pl
├── README.md
├── ViralBioinformaticsTools
    ├── README.md
    ├── git_hub_dates.tsv.gz
    ├── github_urls
    ├── proj_start_stop.tsv.gz
    └── viral_bioinformatics_tools.tsv.gz
├── VirusDiscoveryProject
    └── DataSelection
    │   ├── README.md
    │   ├── datasets.ipynb
    │   ├── phage_size_selection.txt
    │   ├── random_selection.txt
    │   ├── size_selection.txt
    │   └── wgs_datasets.tsv.gz
├── annotations
    ├── bacterial_pathogens.py
    └── singlem_reads_to_contigs.py
├── assembly
    ├── README.md
    ├── assemble.snakefile
    ├── mummerplot.snakefile
    ├── mummerplot.yaml
    ├── mummerplot_barcoded.snakefile
    ├── nanopore_assembly_barcoded.snakefile
    ├── nanopore_phage_assembly.snakefile
    ├── nanopore_phage_assembly_simple.snakefile
    └── nanopore_phage_assembly_simple_nohost.snakefile
├── bam
    ├── assign_mapped_reads.py
    ├── bam2fasta.py
    ├── bam2fastq.py
    ├── bam2fastq_paired.py
    ├── bam2reads.py
    ├── count_bam_hits.py
    ├── coverage_average.py
    ├── coverage_depth.py
    ├── fastq_not_in_bam.py
    ├── fastq_pairs.py
    ├── kurtosis.py
    ├── list_reads.py
    ├── read_differences.py
    └── samtools2table.pl
├── bin
    ├── Makefile
    ├── NSF_bibtex_by_year.py
    ├── NSF_conflicts.py
    ├── all_4mers.py
    ├── average_quality_scores.pl
    ├── blast2seq.py
    ├── cd-hit2fasta.py
    ├── checkR1R2.sh
    ├── check_fasta.py
    ├── cif2pdb.py
    ├── clustering.py
    ├── correlation_clustering.py
    ├── correlation_clusters_to_fasta.py
    ├── correlations.py
    ├── count_fasta.c
    ├── count_fastq.c
    ├── countfasta.py
    ├── countfastq.py
    ├── countgenbank.py
    ├── countgfa.py
    ├── cpgs.py
    ├── crAss_contig_correlations.py
    ├── crc64.py
    ├── create_newusers.py
    ├── distances.py
    ├── download_sra_lists.sh
    ├── dump_all_tables.py
    ├── embl_export.py
    ├── environment_violin_lot.py
    ├── expedition_xml2csv.py
    ├── extract.py
    ├── extract_fasta_sequence.pl
    ├── factorial.py
    ├── fake_fastq.py
    ├── fasta2sequence.pl
    ├── fasta_split.c
    ├── fastapercent.pl
    ├── fastg2gfa.c
    ├── fastq2fasta.c
    ├── fastq2fasta.cpp
    ├── fastq2fasta.py
    ├── fastq_average_qual.cpp
    ├── fastq_avqual.c
    ├── filter_fasta_length.py
    ├── filter_fastq.py
    ├── filter_seq_by_length.py
    ├── genbank2fasta.pl
    ├── genbank2flatfile.pl
    ├── genbank2fna.pl
    ├── genbank2sequences.py
    ├── genbank_count_motifs.py
    ├── genbank_list_features.py
    ├── genbanktable2fasta.pl
    ├── get_genbank.pl
    ├── get_genbank_batch.pl
    ├── get_genbank_batch_proteins.pl
    ├── get_lastlogs.sh
    ├── get_wgs_eutils.pl
    ├── getopt.cpp
    ├── gfa2fasta.sh
    ├── greedy_clustering.py
    ├── index_to_contig.py
    ├── joinlists.pl
    ├── json_validator.py
    ├── jsonl2tsv.py
    ├── kseq.h
    ├── latlon2km.py
    ├── longest_contig.py
    ├── merge_last_logs.py
    ├── merge_pdf.py
    ├── pair_fastq_bloom.py
    ├── pair_fastq_fast.py
    ├── pair_fastq_files.py
    ├── pair_fastq_lowmem.py
    ├── pairwise_percent_ids.py
    ├── parse_websites.py
    ├── parsebz2xml.py
    ├── pdb2fa.py
    ├── plot_pairwise_percents.py
    ├── print_taxonomy.py
    ├── rc.pl
    ├── rename_fasta.py
    ├── renumber_fasta.pl
    ├── renumber_fasta.py
    ├── renumber_merge_fasta.py
    ├── resample.py
    ├── riddler.py
    ├── samtools.pl
    ├── separate_multigenbank.py
    ├── separatemultifasta.pl
    ├── separatemultifasta.py
    ├── sge_summary.pl
    ├── sort_fasta_by_len.pl
    ├── sort_fasta_by_len_lengths_only.pl
    ├── stream_fasta.py
    ├── test.py
    ├── transpose.py
    ├── update_blastdb.sh
    ├── xml2csv.py
    ├── xml_print_all_attributes.py
    └── zotkill.pl
├── blast
    ├── blast2taxonomy.py
    ├── blast_to_network.py
    ├── blast_to_sequences.py
    ├── filter_fastq_by_blast.py
    ├── plot_blast.py
    ├── simple_blast_plot.py
    └── summarize_blast.py
├── bwt
    └── generate_table.py
├── cartopy
    ├── crAssphage_cophenetic.py
    ├── crAssphage_distance.py
    ├── crAssphage_ete.py
    └── example.py
├── cluster
    ├── split_blast_queries_edwards.pl
    ├── split_blast_queries_edwards_blastplus.pl
    └── submit2cluster_edwards
├── concoct
    ├── concoct_bins_to_reads.py
    └── concoct_csv_to_fasta.py
├── covid19
    ├── README.md
    ├── nCoV-BarGraph.py
    └── nCoV-Viz.py
├── cpp
    ├── fastq
    │   ├── fastq.cbp
    │   ├── fastq2fasta.cpp
    │   ├── include
    │   │   └── stream_fastq.h
    │   ├── main.cpp
    │   └── src
    │   │   └── stream_fastq.cpp
    ├── two-bit-optimized.c
    └── two-bit.cpp
├── crAssphage
    ├── NCBI_SRA_Submission.py
    ├── NCBI_add_biosample_to_tsv.py
    ├── NCBI_submission.py
    ├── README.md
    ├── average_seq_dist.py
    ├── check_duplicates.py
    ├── check_gp.py
    ├── collapse_bam_variants.py
    ├── collectors_curve.py
    ├── compare2sra_all.sh
    ├── countries.py
    ├── coverage_heatmap.py
    ├── coverage_heatmap_orfs.py
    ├── dnadist2anova.py
    ├── extract_genotypes.py
    ├── extract_pcr_reads_from_fq.py
    ├── extract_pcr_regions.py
    ├── fastq2crassphage.sh
    ├── kmer_table.py
    ├── mutation_freqs.py
    ├── pcr_fastq_coverage.py
    ├── phylip2clustal.py
    ├── plot_contig_sizes.py
    ├── plot_coverage_ABC.py
    ├── plot_genotypes.py
    ├── print_ondrej_pcr_regions.py
    ├── runs_that_match.py
    ├── snp_frequency.py
    ├── tom_jeffries_data.py
    └── transpose_and_join.pl.py
├── deconvolute_minion_reads
    ├── README.md
    ├── fastq
    │   ├── __init__.py
    │   └── sequences.py
    └── split_fastq.py
├── django
    └── django_notes.md
├── dna
    └── randomise_dna.py
├── email
    └── extract_email_from_pst.py
├── fasta
    ├── extract_sequence.py
    ├── fasta_qual_to_fastq.py
    ├── length_filter.py
    ├── lengths.py
    ├── reservoir_sample_fasta.py
    ├── sequence_len_distributions.py
    ├── split_contigs.py
    ├── split_fasta_r1r2.py
    ├── subsample_fasta.py
    └── test.fasta
├── fastq
    ├── README.md
    ├── average_quality.py
    ├── change_fastq_pair_symbol.c
    ├── compare_directory_fastq_counts.py
    ├── deduplicate_fastq.py
    ├── filter_fastq.py
    ├── filter_fastq_length.c
    ├── index_fastq.py
    ├── index_in_fastq.py
    ├── percent_quality.py
    ├── predict_primers.py
    ├── print_fastq.c
    ├── random_split_paired_fastq.py
    ├── randomly_sample_fastq.py
    ├── randomly_sample_fastq_to_dir.py
    ├── split_by_tags.py
    ├── split_fastq_files.py
    ├── split_fastq_sequences.py
    ├── test.fastq.gz
    ├── trim_fastq.py
    └── trim_primers.py
├── fifo
    └── README.md
├── gfa
    └── find_complete_circles.py
├── github
    └── get_repo_dates.py
├── h5py
    ├── files_to_h5.py
    ├── files_to_h5_2d.py
    ├── matrix_to_h5.py
    ├── read_h5.ipynb
    ├── test_data.py
    └── tmp.h5
├── hecatomb
    └── track_sequences.py
├── hmms
    └── run_hmmer.py
├── include
    └── kseq.h
├── isolation_sources
    ├── README.md
    └── genera-environment.py
├── jplacer
    ├── README.md
    ├── README2.md
    ├── add_metadata_to_matrix.py
    ├── color_based_on_fastq.py
    ├── count_metagenomes.py
    ├── create_colorstrip.py
    ├── create_multibar.py
    ├── explore_tree.py
    ├── fastq2color_strip.py
    ├── fastq2ids.py
    ├── generate_color_strip.py
    ├── parse_jplacer.py
    ├── parse_rename_write.py
    ├── rename_tree.py
    ├── test_taxonomy.py
    └── tree_to_cophenetic_matrix.py
├── jupyter
    ├── Bacteroides_prophage_lengths.json
    ├── Emma_subsystems
    │   ├── EagleRay_level1.tsv.gz
    │   ├── EagleRay_level1_norm_all.tsv.gz
    │   ├── EagleRay_level1_norm_ss.tsv.gz
    │   ├── NorfolkWater_level1.tsv.gz
    │   ├── NorfolkWater_level1_norm_all.tsv.gz
    │   ├── NorfolkWater_level1_norm_ss.tsv.gz
    │   ├── PortJackson_level1.tsv.gz
    │   ├── PortJackson_level1_norm_all.tsv.gz
    │   ├── PortJackson_level1_norm_ss.tsv.gz
    │   ├── TigerSharks_level1.tsv.gz
    │   ├── TigerSharks_level1_norm_all.tsv.gz
    │   ├── TigerSharks_level1_norm_ss.tsv.gz
    │   └── eagle_ray_types
    │   │   ├── level1_norm_all.tsv.gz
    │   │   ├── level1_norm_ss.tsv.gz
    │   │   ├── level1_raw.tsv.gz
    │   │   ├── level2_norm_all.tsv.gz
    │   │   ├── level2_norm_ss.tsv.gz
    │   │   ├── level2_raw.tsv.gz
    │   │   ├── ss_typed_norm_all.tsv.gz
    │   │   ├── ss_typed_norm_ss.tsv.gz
    │   │   └── ss_typed_raw.tsv.gz
    ├── IBD_Data_PCAs.ipynb
    ├── Lactobacillus_prophage_lengths.json
    ├── Lactobacillus_prophage_lengths2.json
    ├── SarahHeatmaps.ipynb
    ├── bacteroides_prophages.png
    ├── circles.ipynb
    ├── class
    │   └── phylum.tsv
    ├── data
    │   ├── get_headers.pl
    │   ├── rockart_subsystems
    │   │   ├── all_norm_all.tsv
    │   │   ├── all_norm_ss.tsv
    │   │   ├── all_raw.tsv
    │   │   ├── class_norm_all.tsv
    │   │   ├── class_norm_ss.tsv
    │   │   ├── class_raw.tsv
    │   │   ├── level1_idx.tsv
    │   │   ├── level1_norm_all.tsv
    │   │   ├── level1_norm_ss.r.tsv
    │   │   ├── level1_norm_ss.tsv
    │   │   ├── level1_raw.tsv
    │   │   ├── level2_idx.tsv
    │   │   ├── level2_norm_all.tsv
    │   │   ├── level2_norm_ss.r.tsv
    │   │   ├── level2_norm_ss.tsv
    │   │   ├── level2_raw.tsv
    │   │   ├── subsystems_norm_all.tsv
    │   │   ├── subsystems_norm_ss.r.tsv
    │   │   ├── subsystems_norm_ss.tsv
    │   │   ├── subsystems_norm_ss_idx.tsv
    │   │   └── subsystems_raw.tsv
    │   └── rockart_taxonomy
    │   │   ├── all_levels.tsv
    │   │   ├── class.tsv
    │   │   ├── family.tsv
    │   │   ├── genus.tsv
    │   │   ├── order.tsv
    │   │   ├── phylum.idx
    │   │   ├── phylum.r.tsv
    │   │   ├── phylum.sample.idx
    │   │   ├── phylum.sample.tsv
    │   │   ├── phylum.tsv
    │   │   ├── species.tsv
    │   │   └── superkingdom.tsv
    ├── example2.tsv
    ├── gfa_to_fasta.ipynb
    ├── heatmap.ipynb
    ├── histogram.ipynb
    ├── jess_countries.ipynb
    ├── jess_pca.ipynb
    ├── lactobacillus_prophages.png
    ├── liz_spreadsheets.ipynb
    ├── merged.ipynb
    ├── pca.ipynb
    ├── phyloseq2pandas.ipynb
    ├── rds_to_py.ipynb
    ├── reduced_protein_alphabet.ipynb
    ├── sarah_data
    │   ├── mmseqs_taxonomy
    │   │   ├── all_levels.tsv.gz
    │   │   ├── all_levels_renamed.tsv.gz
    │   │   ├── animation
    │   │   │   ├── img_10.png
    │   │   │   ├── img_11.png
    │   │   │   ├── img_12.png
    │   │   │   ├── img_13.png
    │   │   │   ├── img_14.png
    │   │   │   ├── img_15.png
    │   │   │   ├── img_16.png
    │   │   │   ├── img_2.png
    │   │   │   ├── img_3.png
    │   │   │   ├── img_4.png
    │   │   │   ├── img_5.png
    │   │   │   ├── img_6.png
    │   │   │   ├── img_7.png
    │   │   │   ├── img_8.png
    │   │   │   ├── img_9.png
    │   │   │   └── taxonomy.gif
    │   │   ├── class.tsv.gz
    │   │   ├── class_renamed.tsv.gz
    │   │   ├── family.tsv.gz
    │   │   ├── family_renamed.tsv.gz
    │   │   ├── genus.tsv.gz
    │   │   ├── genus_renamed.tsv.gz
    │   │   ├── order.tsv.gz
    │   │   ├── order_renamed.tsv.gz
    │   │   ├── pca_by_approach.png
    │   │   ├── pca_by_filter.png
    │   │   ├── pca_by_method.png
    │   │   ├── pca_by_replicate.png
    │   │   ├── pca_by_sample.png
    │   │   ├── pca_combined_replicates.png
    │   │   ├── phylum.tsv.gz
    │   │   ├── phylum_renamed.tsv.gz
    │   │   ├── species.tsv.gz
    │   │   ├── species_renamed.tsv.gz
    │   │   ├── superkingdom.tsv.gz
    │   │   └── superkingdom_renamed.tsv.gz
    │   ├── sarah_subsystems_pca.ipynb
    │   ├── sarah_taxonomy_pca.ipynb
    │   └── subsystems
    │   │   ├── README.md
    │   │   ├── all_norm_all.tsv.gz
    │   │   ├── all_norm_all_renamed.tsv.gz
    │   │   ├── all_norm_ss.tsv.gz
    │   │   ├── all_norm_ss_renamed.tsv.gz
    │   │   ├── all_raw.tsv.gz
    │   │   ├── all_raw_renamed.tsv.gz
    │   │   ├── animation
    │   │       ├── img_10.png
    │   │       ├── img_11.png
    │   │       ├── img_12.png
    │   │       ├── img_13.png
    │   │       ├── img_14.png
    │   │       ├── img_15.png
    │   │       ├── img_16.png
    │   │       ├── img_2.png
    │   │       ├── img_3.png
    │   │       ├── img_4.png
    │   │       ├── img_5.png
    │   │       ├── img_6.png
    │   │       ├── img_7.png
    │   │       ├── img_8.png
    │   │       ├── img_9.png
    │   │       └── subsystems.gif
    │   │   ├── class_norm_all.tsv.gz
    │   │   ├── class_norm_all_renamed.tsv.gz
    │   │   ├── class_norm_ss.tsv.gz
    │   │   ├── class_norm_ss_renamed.tsv.gz
    │   │   ├── class_raw.tsv.gz
    │   │   ├── class_raw_renamed.tsv.gz
    │   │   ├── level1_norm_all.tsv.gz
    │   │   ├── level1_norm_all_renamed.tsv.gz
    │   │   ├── level1_norm_ss.tsv.gz
    │   │   ├── level1_norm_ss_renamed.tsv.gz
    │   │   ├── level1_raw.tsv.gz
    │   │   ├── level1_raw_renamed.tsv.gz
    │   │   ├── level2.png
    │   │   ├── level2_norm_all.tsv.gz
    │   │   ├── level2_norm_all_renamed.tsv.gz
    │   │   ├── level2_norm_ss.tsv.gz
    │   │   ├── level2_norm_ss_renamed.tsv.gz
    │   │   ├── level2_raw.tsv.gz
    │   │   ├── level2_raw_renamed.tsv.gz
    │   │   ├── normalised_subsystem_level2.png
    │   │   ├── pca_by_approach.png
    │   │   ├── pca_by_filter.png
    │   │   ├── pca_by_method.png
    │   │   ├── pca_by_replicate.png
    │   │   ├── pca_by_sample.png
    │   │   ├── pca_combined_replicates.png
    │   │   ├── subsystems_norm_all.tsv.gz
    │   │   ├── subsystems_norm_all_renamed.tsv.gz
    │   │   ├── subsystems_norm_ss.tsv.gz
    │   │   ├── subsystems_norm_ss_renamed.tsv.gz
    │   │   ├── subsystems_raw.tsv.gz
    │   │   └── subsystems_raw_renamed.tsv.gz
    ├── subsystems_data.tsv.gz
    ├── subsystems_data_all.tsv.gz
    ├── taxonomy_data.tsv.gz
    ├── taxonomy_data_all.tsv.gz
    └── test2.ipynb
├── kbase
    ├── json_to_model.py
    └── parse_json.py
├── kmers
    ├── check_strand.py
    ├── compare_kmers_genbank.py
    ├── count_kmers.py
    ├── count_kmers_genbank.py
    ├── count_kmers_ordered_genbank.py
    ├── count_leading_kmers.py
    ├── count_trailing_kmers.py
    ├── find_kmers.py
    ├── hashcode.py
    ├── kmer_entropy.py
    ├── kmer_entropy3.3.py
    ├── kmer_entropy_sa.py
    ├── kmer_union_intersection.py
    ├── mash_env.py
    ├── plot_kmer_evenness.py
    └── reassemble_kmers.py
├── kyle
    └── kmerbias.py
├── manipulate_genomes
    ├── README.md
    ├── filter_from_blast.py
    ├── rotate_phage.py
    ├── trim_fasta.py
    └── upstream_regions.py
├── matplotlib graphs
    ├── 16S_smooth.py
    ├── 3d_scatter_plot.py
    ├── KernelDensityEstimator.ipynb
    ├── KernelRegression.ipynb
    ├── kde.py
    ├── plot_16S_coverage.py
    ├── plot_16S_coverage_kde.py
    ├── plot_16S_coverage_kernelregression.py
    ├── plot_16s_coverage_all.py
    ├── show_fig.py
    └── xy_scatter.py
├── mmseqs
    ├── dummy_database.sqlite
    └── easy_taxonomy_to_function.py
├── mongodb
    ├── find_biomasses.py
    ├── load_models.py
    ├── print_keys.py
    ├── search_mongo.py
    ├── simple_find.py
    └── simple_load_models.py
├── mummer
    ├── reverse_complement_fasta.py
    └── six_mers.py
├── nanopore
    └── split_fastq_by_barcode.py
├── ncbi
    ├── accession2taxonomy.py
    ├── all_taxid_by_rank.py
    ├── blast2taxonmy.py
    ├── blast2taxonomy_col.py
    ├── blast2taxonomy_sqlite.py
    ├── combine_gbff_fna.py
    ├── datasets
    │   ├── genome_information.py
    │   └── one_genome_information.py
    ├── filter_uniprot50_by_taxonomy.py
    ├── filter_uniprot50_precalculated.py
    ├── genbank_phages_via_ftp.py
    ├── get_protein_sequence.py
    ├── get_wgs_eutils.pl
    ├── mikes_taxonomy.py
    ├── name2animal_plant.py
    ├── parse_genbank.py
    ├── parse_sra.py
    ├── patric_add_taxonomy.py
    ├── phage_longest_gene.py
    ├── product_protein_seq.py
    ├── pubmed_to_csv.py
    ├── tax2spreadsheet.py
    ├── tax2spreadsheetdb.py
    ├── taxonomy.py
    ├── taxonomy_database.py
    ├── taxonomy_database_add_to_tsv.py
    ├── taxonomy_database_table.py
    ├── taxonomy_phylum_kingdom.py
    └── taxonomy_to_kingdom.py
├── patric
    └── parse_gto.py
├── percent_pairwise_identity
    ├── RecA_uniprot.faa.gz
    ├── RecA_uniprot_cdhit.aln
    ├── RecA_uniprot_cdhit.faa
    ├── average_pairwise.pl
    ├── identical_percent_ids.json.gz
    ├── list2matrix.pl
    ├── min_percent_counts.py
    ├── min_percent_counts_only.py
    ├── min_percent_counts_subsample.py
    ├── needleman_wunsch-0.3.5
    │   ├── LICENSE
    │   ├── Makefile
    │   ├── README
    │   ├── README.md
    │   ├── libs
    │   │   ├── alignment_scoring
    │   │   │   ├── alignment.c
    │   │   │   ├── alignment.h
    │   │   │   ├── alignment_scoring.c
    │   │   │   ├── alignment_scoring.h
    │   │   │   ├── alignment_scoring_load.c
    │   │   │   └── alignment_scoring_load.h
    │   │   ├── bioinf
    │   │   │   ├── bioinf.c
    │   │   │   └── bioinf.h
    │   │   ├── string_buffer
    │   │   │   ├── string_buffer.c
    │   │   │   └── string_buffer.h
    │   │   └── utility_lib
    │   │   │   ├── utility_lib.c
    │   │   │   └── utility_lib.h
    │   ├── needleman_wunsch.c
    │   ├── needleman_wunsch.h
    │   ├── nw_cmdline.c
    │   ├── seq1.fna
    │   ├── seq2.fna
    │   └── uthash.h
    ├── pairwise_percent_ids.py
    ├── permute_fasta.py
    ├── plot_pairwise_percents.py
    ├── strain_taxonomy.txt.gz
    └── taxon_focus2.csv
├── perl
    ├── Clustal.pm
    ├── MinSeed.pm
    ├── OGD.pm
    ├── ParseTree.pm
    ├── RAEProtein.pm
    ├── RepeatFinder.pm
    ├── Rob.pm
    ├── SGE-0.02
    │   ├── Changes
    │   ├── Control
    │   │   ├── CVS
    │   │   │   ├── Entries
    │   │   │   ├── Repository
    │   │   │   └── Root
    │   │   ├── Control.pm
    │   │   ├── MANIFEST
    │   │   └── Makefile.PL
    │   ├── Copying
    │   ├── MANIFEST
    │   ├── Makefile.PL
    │   ├── README
    │   ├── Run
    │   │   ├── CVS
    │   │   │   ├── Entries
    │   │   │   ├── Repository
    │   │   │   └── Root
    │   │   ├── MANIFEST
    │   │   ├── Makefile.PL
    │   │   └── Run.pm
    │   ├── SGE.pm
    │   ├── Status
    │   │   ├── CVS
    │   │   │   ├── Entries
    │   │   │   ├── Repository
    │   │   │   └── Root
    │   │   ├── MANIFEST
    │   │   ├── Makefile.PL
    │   │   └── Status.pm
    │   ├── examples
    │   │   ├── CVS
    │   │   │   ├── Entries
    │   │   │   ├── Repository
    │   │   │   └── Root
    │   │   ├── README
    │   │   ├── submit2cluster.pl
    │   │   └── test.pl
    │   └── t
    │   │   ├── CVS
    │   │       ├── Entries
    │   │       ├── Repository
    │   │       └── Root
    │   │   └── SGE.t
    ├── Teragrid-0.02
    │   ├── Changes
    │   ├── Control
    │   │   ├── CVS
    │   │   │   ├── Entries
    │   │   │   ├── Repository
    │   │   │   └── Root
    │   │   ├── Control.pm
    │   │   ├── MANIFEST
    │   │   └── Makefile.PL
    │   ├── Copying
    │   ├── Jobs
    │   │   ├── CVS
    │   │   │   ├── Entries
    │   │   │   ├── Repository
    │   │   │   └── Root
    │   │   ├── Jobs.pm
    │   │   ├── MANIFEST
    │   │   └── Makefile.PL
    │   ├── LSGW.pm
    │   ├── MANIFEST
    │   ├── Makefile.PL
    │   ├── README
    │   └── examples
    │   │   ├── CVS
    │   │       ├── Entries
    │   │       ├── Repository
    │   │       └── Root
    │   │   ├── all_jobs.pl
    │   │   ├── blast.pl
    │   │   ├── job.pl
    │   │   └── job_data.pl
    ├── alignment
    │   ├── __init__.py
    │   ├── dna_alignment.py
    │   ├── edit_distance.py
    │   ├── gapped_alignment.py
    │   ├── gapped_alignment2.py
    │   ├── local_alignment.py
    │   └── matrices.py
    └── raeseqlib.pm
├── phage
    ├── comapre_gpdb_mgv.py
    ├── envs
    │   └── phispy.yaml
    ├── genbank_has_phage.py
    ├── is_phage_function.py
    ├── metagenomes
    │   ├── contig_mv_samples.txt.gz
    │   ├── contigs_gokushovirus.blastn.gz
    │   ├── count_contigs.pl
    │   ├── count_contigs2.pl
    │   ├── count_phables.pl
    │   ├── crass_contigs.tsv.gz
    │   ├── crassphage_1percent.png
    │   ├── crassus.ipynb
    │   ├── crassus_results.tsv.gz
    │   ├── find_pb_segment2.pl
    │   ├── freezer.txt.gz
    │   ├── hist.png
    │   ├── hist_1percent.png
    │   ├── ibd_16s
    │   │   ├── RC2_16S_IBD_OTU.tsv.gz
    │   │   ├── RC2_16S_IBD_metadata.tsv.gz
    │   │   └── RC2_16S_IBD_taxadata.tsv.gz
    │   ├── join_vir.pl
    │   ├── limit_contigs.pl
    │   ├── microviridae.ipynb
    │   ├── microviridae.png
    │   ├── microviridae58782.png
    │   ├── microviridae_correlations.tsv.gz
    │   ├── microvirus_contig_count_table.tsv.gz
    │   ├── microvirus_contigs.pl
    │   ├── most_abundant.pl
    │   ├── most_abundant.txt.gz
    │   ├── mv_gen_cont_samples.txt.gz
    │   ├── mv_samples.txt.gz
    │   ├── mv_sequences.txt.gz
    │   ├── our_crassphage.tsv.gz
    │   ├── pb199070.png
    │   ├── pb199070_2.png
    │   ├── pb199070_both.png
    │   ├── pb58328.png
    │   ├── phables_mv_samples.txt.gz
    │   ├── pharokka_top_hits_mash_inphared.nonone.tsv.gz
    │   ├── pharokka_top_hits_mash_inphared.tsv.gz
    │   ├── picobirnaviridae edited.png
    │   ├── picobirnaviridae.ipynb
    │   ├── picobirnaviridae_contig_count_table.tsv.gz
    │   ├── picobirnaviridae_contigs.pl
    │   ├── picobirnaviridae_correlations.tsv.gz
    │   ├── picobirnaviridae_rdrp.tblastn.gz
    │   ├── rdrp.tblastn.tsv.gz
    │   ├── rdrp_contig_count_table.tsv.gz
    │   ├── rdrp_contigs.pl
    │   ├── rdrp_wehave.tsv.gz
    │   ├── sampleSeqCounts.tsv.gz
    │   ├── sample_genome_read_counts.tsv.gz
    │   ├── virus_contig_annotations.tsv.gz
    │   └── virus_contig_annotations_samples.tsv.gz
    ├── phage_functions.py
    ├── phage_functions_gbk.py
    ├── phage_quality.snakefile
    ├── phage_quality_assessment_scripts
    │   ├── av_protein_lengths.py
    │   ├── check_phage_functions.py
    │   ├── coding_vs_noncoding.py
    │   ├── count_adjacent_orfs.py
    │   └── phage_quality_cluster.snakefile
    ├── phage_quality_config.yaml
    ├── phispy_download_conda.snakefile
    ├── phispy_vogs_download.snakefile
    ├── phispy_vogs_download_submit.sh
    ├── plot_phage_tsne.py
    ├── prophage_from_genbank.py
    ├── read_ends.py
    ├── remove_prophages.py
    ├── remove_prophages_sequences.py
    ├── separate_prophages.py
    ├── separate_prophages_coordinates.py
    ├── submit_phispy_vogs_download.sh
    └── write_prophages.py
├── phage_clustering
    ├── bit_score.py
    └── bit_score_by_len.py
├── phage_protein_blast_genera
    ├── README.md
    ├── blast_tax_to_genera.py
    ├── genera_per_phage_protein.py
    ├── num_best_hits.py
    ├── num_prots_vs_taxa.py
    ├── phage_host_location.txt
    ├── plot_best_hits.py
    └── tax_violin_plots.py
├── primers
    ├── count_primers.pl
    ├── find_adapter_sequences.pl
    ├── find_dangling_adapters.py
    └── match_primers.py
├── process_EK_metagenomes
    ├── IlluminaAdapters.fa
    ├── README.md
    ├── bin
    │   ├── compress.slurm
    │   ├── merge_counts.pl
    │   ├── normalize_data.pl
    │   └── sankey_matic.pl
    ├── count_fastq.slurm
    ├── count_mmseqs.pl
    ├── count_mmseqs.slurm
    ├── count_sharks.slurm
    ├── count_subsystems.pl
    ├── count_subsystems.slurm
    ├── fastp.slurm
    ├── fastq2fasta.slurm
    ├── join_sagc_lanes.slurm
    ├── kraken2otu.py
    ├── megahit.slurm
    ├── megahit_submit.slurm
    ├── mmseqs_add_subsystems.slurm
    ├── mmseqs_easy_taxonomy.slurm
    ├── mmseqs_easy_taxonomy_submit.sh
    ├── mmseqs_easy_taxonomy_submit.slurm
    ├── sharks.slurm
    ├── vamb.slurm
    ├── vamb_concatenate.py
    ├── vamb_create_fasta.py
    └── vamb_minimap.slurm
├── process_JCJ_metagenomes
    ├── README.md
    ├── bin
    │   └── sankey_matic.pl
    ├── fastp.slurm
    ├── fastq2fasta.slurm
    ├── humans.slurm
    ├── megahit.slurm
    ├── megahit_submit.sh
    ├── mmseqs_easy_taxonomy.slurm
    ├── mmseqs_easy_taxonomy_submit.slurm
    ├── vamb.slurm
    ├── vamb_concatenate.py
    ├── vamb_create_fasta.py
    └── vamb_minimap.slurm
├── process_metagenomes
    ├── README.md
    ├── fastp.slurm
    ├── fastq2fasta.slurm
    ├── host_removal.slurm
    ├── megahit.slurm
    ├── mmseqs_add_subsystems.slurm
    ├── mmseqs_easy_taxonomy.slurm
    ├── mmseqs_easy_taxonomy_submit.slurm
    ├── mmseqs_taxonomy.slurm
    ├── vamb.slurm
    ├── vamb_concat.slurm
    ├── vamb_create_fasta_clusters.py
    └── vamb_minimap.slurm
├── prophages
    ├── download_phage_slices.py
    ├── phage_finder_tests.snakefile
    ├── phageboost_genbank.py
    ├── phageboost_tests.snakefile
    ├── phigaro_tests.snakefile
    ├── phispy_phage_genes.snakefile
    ├── phispy_training_vs_test.snakefile
    ├── phispy_with_training.snakefile
    ├── prophage_proteins.snakefile
    ├── run_phispy_snakemakes.sh
    ├── run_virsorter.snakefile
    └── virsorter_tests.snakefile
├── proteins
    ├── md5_to_ncbi_taxonomy.py
    ├── protein_md5.py
    ├── protein_md5_fast.py
    └── unique_protein_ids.py
├── proxymeta
    ├── README.md
    └── find_mates.py
├── pymol
    └── draw_images.py
├── rc2
    └── compare_fastq_files.py
├── refs_and_citations
    ├── altmetric_one.py
    ├── altmetrics.py
    ├── compare_titles.py
    ├── gs_download_cites.py
    ├── orcid_vs_google.py
    ├── refs2csv.py
    ├── refs2csv_tk.py
    └── summarise_pubs_counts.py
├── requirements.txt
├── requirements_mini.txt
├── rob_tests
    ├── hashes.c
    ├── hashing.c
    ├── sequences.py
    └── test_stream_pair.py
├── roblib
    ├── __init__.py
    ├── alignments.py
    ├── bcolors.py
    ├── blast.py
    ├── colours.py
    ├── date_parsing.py
    ├── dna.py
    ├── dnadist.py
    ├── files.py
    ├── functions.py
    ├── genbank.py
    ├── geography.py
    ├── newick.py
    ├── rob_error.py
    ├── seqio_filter.py
    ├── sequences.py
    ├── stats.py
    ├── strings.py
    └── translate.py
├── roblib_tk
    ├── __init__.py
    └── file_chooser.py
├── sankey
    └── sankey_plot.ipynb
├── sdsu
    ├── average_grade.pl
    ├── parse_pos.py
    └── thesis_parse.py
├── searchSRA
    ├── envs
    │   └── samtools.yaml
    ├── filter_reads.sh
    ├── merge_counts_abstracts.py
    ├── process.smk
    ├── process_expand.smk
    └── searchSRA_abstracts.tsv.gz
├── seed
    └── pegs_in_order.py
├── seed_servers
    ├── RAST-alljobs.pl
    ├── RAST-jobs.pl
    ├── RAST-retrieve-jobs.pl
    ├── RAST-status.pl
    ├── RAST-submit-jobs.pl
    └── test_occ_roles.py
├── silva
    └── parse_silva_act.py
├── snakemake
    ├── EnvBiotec_process_metagenomes.snakefile
    ├── SAGC_process_metagenomes.snakefile
    ├── abricate.snakefile
    ├── annotate_phages.snakefile
    ├── cluster.snakefile
    ├── cluster_phages.snakefile
    ├── deconseq.snakefile
    ├── envs
    │   ├── bowtie.yaml
    │   ├── canu.yaml
    │   ├── filtlong.yaml
    │   ├── flye.yaml
    │   ├── focus.yaml
    │   ├── kraken.yaml
    │   ├── megahit.yaml
    │   ├── miniasmminipolish.yaml
    │   ├── minimap.yaml
    │   ├── prinseq.yaml
    │   ├── raven.yaml
    │   ├── seqtk.yaml
    │   ├── superfocus.yaml
    │   └── trycycler.yaml
    ├── kraken.snakefile
    ├── patric_complete_genomes_proteins.snakefile
    ├── phispy.snakefile
    ├── phispy.yaml
    ├── process_metagenomes.json
    └── process_metagenomes.snakefile
├── snakemake_tests
    └── test.snakefile
├── sra
    ├── README.md
    ├── SRA.partie.tsv.gz
    ├── bigquery_json.py
    ├── bigquery_json2csv.py
    ├── filter.py
    ├── phage_bacteria.ipynb
    ├── plot_3d.ipynb
    ├── plot_partie_3d.py
    ├── plot_partie_boxes.py
    ├── run_accession-experiment_lib.tsv.gz
    ├── runs_to_abstracts.pl
    ├── sra_by_date.py
    ├── sra_file_sizes.py
    ├── sra_status.py
    ├── sra_xml.py
    ├── sra_xml_dir.py
    ├── sra_xml_print_all_attributes.py
    └── study_types.py
├── superfocus_all
    ├── join_output.py
    ├── summarize_hits.py
    └── superfocus_to_taxonomy.py
├── taxon
    ├── Error.py
    ├── README.md
    ├── __init__.py
    ├── config.py
    ├── load_from_database.py
    ├── mmseqs_report_to_table.py
    ├── read_accession_files.py
    ├── sqlite_taxon.py
    ├── taxon.py
    └── taxonomy
    │   ├── __init__.py
    │   └── taxonomy.py
├── testrepeatfinder
    ├── README.md
    ├── ROBTEST.repeatfinder
    ├── compare.pl
    ├── errors
    ├── files.txt
    ├── fna_repeats_to_seq.pl
    ├── fna_to_repeats.py
    ├── pp1.test
    ├── pp2.test
    ├── repeatFinder.cpp
    ├── repeatFinder.h
    ├── setup.py
    ├── tempRepeatDNA.99620.pp.1.fasta
    ├── tempRepeatDNA.99620.pp.1.fasta.repeatfinder
    ├── tempRepeatDNA.99620.pp.2.fasta
    ├── tempRepeatDNA.99620.pp.2.fasta.repeatfinder
    ├── test.fasta
    ├── test.fasta.repeatfinder
    ├── test
    │   ├── pp1.fasta
    │   ├── pp3.fasta
    │   ├── pp4.fasta
    │   ├── pp5.fasta
    │   ├── pp6.fasta
    │   └── pp7.fasta
    └── test_repeatfinder.py
├── text_matching
    ├── vfdb.txt.gz
    └── virulence_matching.py
├── thea
    ├── add_source_to_rapsearch.py
    ├── count_hits.cpp
    ├── count_lastal_hits.cpp
    ├── lastal_abund_ubiq.cpp
    ├── load_rapsearch_sqlite.py
    ├── locate_orfs.py
    ├── normalize_hits.cpp
    ├── orf_evidence.py
    ├── overlapping_orfs.py
    └── rapsearch_check_translation.py
└── trees
    ├── dist_matrix.py
    ├── negative_branch_lengths.py
    ├── rename_trees.ori.py
    ├── rename_trees.py
    ├── rename_trees_crassphage.py
    ├── tree_to_cophenetic_matrix.py
    ├── tree_to_pairwisedistance.py
    └── trim_alignment.py


/.gitattributes:
--------------------------------------------------------------------------------
1 | *.gz filter=lfs diff=lfs merge=lfs -text
2 | *.sqlite filter=lfs diff=lfs merge=lfs -text
3 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "ProphageGenomics"]
2 | 	path = ProphageGenomics
3 | 	url = git@github.com:hkang408/ProphageGenomics.git
4 | 


--------------------------------------------------------------------------------
/ANI/C/Version1.1/Libraries/FNACharactersLib.h:
--------------------------------------------------------------------------------
1 | #ifndef _FNACHARACTERSLIB_H_
2 | #define _FNACHARACTERSLIB_H_
3 | 
4 | extern char** fnaCharactersOf(const char* fnaFileNameAndLocation, const int LINESIZE, const int numberOfLines);
5 | 
6 | #endif


--------------------------------------------------------------------------------
/ANI/C/Version1.1/Libraries/levenshteinDistanceLib.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <string.h>
 3 | #include "levenshteinDistanceLib.h"
 4 | //Levenshtein edit distance
 5 | int editDistance(const char* S, const char* T, int sLength, int tLength) {
 6 | 	int minimum, first, second, third, conditional;
 7 | 	if(sLength == 0) {
 8 | 		return tLength;
 9 | 	}
10 | 	if(tLength == 0) {
11 | 		return sLength;
12 | 	}
13 | 	if(S[sLength-1] != T[tLength - 1]) {
14 | 		conditional = 1;
15 | 	} else {
16 | 		conditional = 0;
17 | 	}
18 | 	first = editDistance(S, T, sLength - 1, tLength) + 1;
19 | 	second = editDistance(S, T, sLength, tLength - 1) + 1;
20 | 	third = editDistance(S, T, sLength - 1, tLength - 1) + conditional;
21 | 	minimum = first;
22 | 	if(first > second) {
23 | 		minimum = second;
24 | 	} 
25 | 	if (second > third) {
26 | 		minimum = third;
27 | 	}
28 | 	return minimum;
29 | }


--------------------------------------------------------------------------------
/ANI/C/Version1.1/Libraries/levenshteinDistanceLib.h:
--------------------------------------------------------------------------------
1 | #ifndef _LEVENSHTEINDISTANCELIB_H_
2 | #define _LEVENSHTEINDISTANCELIB_H_
3 | 
4 | extern int editDistance(const char* S, const char* T, int sLength, int tLength);
5 | 
6 | #endif


--------------------------------------------------------------------------------
/ANI/C/Version1.1/Libraries/numberOfLinesLib.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <string.h>
 3 | #include <stdlib.h>
 4 | #include "numberOfLinesLib.h"
 5 | 
 6 | int numberOfLines(const int LINESIZE, const char *FNANAME) {
 7 |     FILE *fnaPointer;
 8 |     char singleCharLine[100];
 9 |     fnaPointer = fopen(FNANAME, "r");
10 |     if(fnaPointer == NULL) {
11 |         printf("%s", "Error: fnaPointer is null\n");
12 |         exit(EXIT_FAILURE);
13 |     }
14 |     int numberOfLines = 0;
15 |     while (fgets(singleCharLine, 100, fnaPointer) != NULL) {
16 |         numberOfLines++;
17 |     }
18 |     if(numberOfLines == 0) {
19 |         printf("%s", "Error: No Lines in .fna file\n");
20 |         exit(EXIT_FAILURE);
21 |     }
22 |     fclose(fnaPointer);
23 |     return numberOfLines;
24 | }


--------------------------------------------------------------------------------
/ANI/C/Version1.1/Libraries/numberOfLinesLib.h:
--------------------------------------------------------------------------------
1 | #ifndef _NUMBEROFLINESLIB_H_
2 | #define _NUMBEROFLINESLIB_H_
3 | 
4 | extern int numberOfLines(const int LINESIZE, const char *FNANAME);
5 | 
6 | #endif


--------------------------------------------------------------------------------
/ANI/C/Version1.1/Tests/fnaFiles/5mers.fna:
--------------------------------------------------------------------------------
1 | Expected ANI: 21
2 | CTGATCGATC


--------------------------------------------------------------------------------
/ANI/C/Version1.1/Tests/fnaFiles/5mers2.fna:
--------------------------------------------------------------------------------
1 | Expected ANI: 21
2 | TCAGCTGCTA


--------------------------------------------------------------------------------
/ANI/C/Version1.2/Libraries/FNACharactersLib.h:
--------------------------------------------------------------------------------
1 | #ifndef _FNACHARACTERSLIB_H_
2 | #define _FNACHARACTERSLIB_H_
3 | 
4 | extern char** fnaCharactersOf(const char* fnaFileNameAndLocation, const int LINESIZE, const int numberOfLines);
5 | 
6 | #endif


--------------------------------------------------------------------------------
/ANI/C/Version1.2/Libraries/levenshteinDistanceLib.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | int editDistance(const char* S, const char* T, int kmerLength) {
 3 |     int matrix[kmerLength + 1][kmerLength + 1];
 4 |     for (int i = 0; i <= kmerLength; i++) {
 5 |         matrix[i][0] = i;
 6 |         matrix[0][i] = i;
 7 |     }
 8 |     for (int j = 1; j <= kmerLength; j++) {
 9 |         for (int k = 1; k <= kmerLength; k++) {
10 |             if (S[j-1] == T[k-1]) 
11 |                 matrix[j][k] = matrix[j-1][k-1];
12 |             else 
13 |                 matrix[j][k] = (matrix[j][k-1] + 1) > (matrix[j-1][k] + 1) ? ((matrix[j-1][k-1] + 1) > (matrix[j-1][k] + 1) ? (matrix[j-1][k] + 1) : (matrix[j-1][k-1] + 1)) : ((matrix[j-1][k-1] + 1) > (matrix[j][k-1] + 1) ? (matrix[j][k-1] + 1) : (matrix[j-1][k-1] + 1));
14 |         }
15 |     }
16 |     return matrix[kmerLength][kmerLength];
17 | }


--------------------------------------------------------------------------------
/ANI/C/Version1.2/Libraries/levenshteinDistanceLib.h:
--------------------------------------------------------------------------------
1 | #ifndef _LEVENSHTEINDISTANCELIB_H_
2 | #define _LEVENSHTEINDISTANCELIB_H_
3 | 
4 | extern int editDistance(const char* S, const char* T, int sLength, int tLength);
5 | 
6 | #endif


--------------------------------------------------------------------------------
/ANI/C/Version1.2/Libraries/numberOfLinesLib.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <string.h>
 3 | #include <stdlib.h>
 4 | #include "numberOfLinesLib.h"
 5 | 
 6 | int numberOfLines(const int LINESIZE, const char *FNANAME) {
 7 |     FILE *fnaPointer;
 8 |     char singleCharLine[256];
 9 |     fnaPointer = fopen(FNANAME, "r");
10 |     if(fnaPointer == NULL) {
11 |         printf("%s", "Error: fnaPointer is null\n");
12 |         exit(EXIT_FAILURE);
13 |     }
14 |     int numberOfLines = 0;
15 |     while (fgets(singleCharLine, 256, fnaPointer) != NULL) {
16 |         numberOfLines++;
17 |     }
18 |     if(numberOfLines == 0) {
19 |         printf("%s", "Error: No Lines in .fna file\n");
20 |         exit(EXIT_FAILURE);
21 |     }
22 |     fclose(fnaPointer);
23 |     return numberOfLines;
24 | }


--------------------------------------------------------------------------------
/ANI/C/Version1.2/Libraries/numberOfLinesLib.h:
--------------------------------------------------------------------------------
1 | #ifndef _NUMBEROFLINESLIB_H_
2 | #define _NUMBEROFLINESLIB_H_
3 | 
4 | extern int numberOfLines(const int LINESIZE, const char *FNANAME);
5 | 
6 | #endif


--------------------------------------------------------------------------------
/ANI/C/Version1/Libraries/FNACharactersLib/FNACharactersLib.h:
--------------------------------------------------------------------------------
1 | #ifndef _FNACHARACTERSLIB_H_
2 | #define _FNACHARACTERSLIB_H_
3 | 
4 | extern char** fnaCharactersOf(const char* fnaFileNameAndLocation, const int LINESIZE, const int numberOfLines);
5 | 
6 | #endif


--------------------------------------------------------------------------------
/ANI/C/Version1/Libraries/levenshteinDistanceLib/levenshteinDistanceLib.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <string.h>
 3 | #include "levenshteinDistanceLib.h"
 4 | //Levenshtein edit distance
 5 | int editDistance(const char* S, const char* T, int sLength, int tLength) {
 6 | 	int minimum, first, second, third, conditional;
 7 | 	if(sLength == 0) {
 8 | 		return tLength;
 9 | 	}
10 | 	if(tLength == 0) {
11 | 		return sLength;
12 | 	}
13 | 	if(S[sLength-1] != T[tLength - 1]) {
14 | 		conditional = 1;
15 | 	} else {
16 | 		conditional = 0;
17 | 	}
18 | 	first = editDistance(S, T, sLength - 1, tLength) + 1;
19 | 	second = editDistance(S, T, sLength, tLength - 1) + 1;
20 | 	third = editDistance(S, T, sLength - 1, tLength - 1) + conditional;
21 | 	minimum = first;
22 | 	if(first > second) {
23 | 		minimum = second;
24 | 	} 
25 | 	if (second > third) {
26 | 		minimum = third;
27 | 	}
28 | 	return minimum;
29 | }


--------------------------------------------------------------------------------
/ANI/C/Version1/Libraries/levenshteinDistanceLib/levenshteinDistanceLib.h:
--------------------------------------------------------------------------------
1 | #ifndef _LEVENSHTEINDISTANCELIB_H_
2 | #define _LEVENSHTEINDISTANCELIB_H_
3 | 
4 | extern int editDistance(const char* S, const char* T, int sLength, int tLength);
5 | 
6 | #endif


--------------------------------------------------------------------------------
/ANI/C/Version1/Libraries/numberOfLinesLib/numberOfLinesLib.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <string.h>
 3 | #include <stdlib.h>
 4 | #include "numberOfLinesLib.h"
 5 | 
 6 | int numberOfLines(const int LINESIZE, const char *FNANAME) {
 7 |     FILE *fnaPointer;
 8 |     char singleCharLine[100];
 9 |     fnaPointer = fopen(FNANAME, "r");
10 |     if(fnaPointer == NULL) {
11 |         printf("%s", "Error: fnaPointer is null\n");
12 |         exit(EXIT_FAILURE);
13 |     }
14 |     int numberOfLines = 0;
15 |     while (fgets(singleCharLine, 100, fnaPointer) != NULL) {
16 |         numberOfLines++;
17 |     }
18 |     if(numberOfLines == 0) {
19 |         printf("%s", "Error: No Lines in .fna file\n");
20 |         exit(EXIT_FAILURE);
21 |     }
22 |     fclose(fnaPointer);
23 |     return numberOfLines;
24 | }


--------------------------------------------------------------------------------
/ANI/C/Version1/Libraries/numberOfLinesLib/numberOfLinesLib.h:
--------------------------------------------------------------------------------
1 | #ifndef _NUMBEROFLINESLIB_H_
2 | #define _NUMBEROFLINESLIB_H_
3 | 
4 | extern int numberOfLines(const int LINESIZE, const char *FNANAME);
5 | 
6 | #endif


--------------------------------------------------------------------------------
/ANI/C/Version1/Libraries/queryKmersLib/queryKmersLib.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <string.h>
 3 | #include <stdlib.h>
 4 | #include "queryKmersLib.h"
 5 | // 2 <= kmerSize <= 66
 6 | //67+ will return kmers of 1000+ characters for some reason(lineSize = 70?)
 7 | char** queryKmersIntoArray(const char* dnaChars, const int KMERSIZE, const int numberOfLines, const int LINESIZE) {
 8 | 	const int numberOfKmers = numberOfLines*LINESIZE;
 9 | 	char** kmersArrayMalloc = (char**) malloc(numberOfKmers*sizeof(char*));
10 | 	if(kmersArrayMalloc == NULL) { 
11 |         printf("%s", "Error: kmersArrayMalloc\n");
12 |         exit(EXIT_FAILURE);
13 | 	}
14 | 	for(int i = 0; i < numberOfKmers; i++) {
15 |       kmersArrayMalloc[i] = (char *) malloc(sizeof(char)*KMERSIZE*10);
16 |       	if(kmersArrayMalloc[i] == NULL) { 
17 |         printf("%s", "Error: kmersArrayMalloc\n");
18 |         exit(EXIT_FAILURE);
19 | 	}
20 |    }
21 | 	for(int i = 0; i < strlen(dnaChars) - KMERSIZE + 1; i++) {
22 | 		memcpy(kmersArrayMalloc[i], &dnaChars[i], KMERSIZE);
23 | 	}
24 | 	return (char**) kmersArrayMalloc;
25 | 	
26 | }


--------------------------------------------------------------------------------
/ANI/C/Version1/Libraries/queryKmersLib/queryKmersLib.h:
--------------------------------------------------------------------------------
1 | #ifndef _QUERYKMERSLIB_H_
2 | #define _QUERYKMERSLIB_H_
3 | 
4 | extern char **queryKmersIntoArray(const char* dnaChars, int kmerSize, const int numberOfLines, const int LINESIZE);
5 | 
6 | #endif


--------------------------------------------------------------------------------
/ANI/C/Version1/Tests/ANI/5mers.fna:
--------------------------------------------------------------------------------
1 | Expected ANI: 21
2 | CTGATCGATC


--------------------------------------------------------------------------------
/ANI/C/Version1/Tests/ANI/5mers2.fna:
--------------------------------------------------------------------------------
1 | Expected ANI: 21
2 | TCAGCTGCTA


--------------------------------------------------------------------------------
/ANI/C/Version1/Tests/kmers/5mers/5mers.fna:
--------------------------------------------------------------------------------
1 | Expected kmer array output: CTGAT TGATC GATCG ATCGA TCGAT CGATC 
2 | CTGATCGATC


--------------------------------------------------------------------------------
/AlphaFold/best_scores.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Read all the ranking_debug.json files and report the best model and its pLDDT score
 3 | """
 4 | 
 5 | import os
 6 | import sys
 7 | import argparse
 8 | import json
 9 | 
10 | __author__ = 'Rob Edwards'
11 | 
if __name__ == "__main__":
    # Report the top-ranked model of a single directory as one
    # tab-separated line: directory, model name, pLDDT score.
    parser = argparse.ArgumentParser(description=' ')
    parser.add_argument('-d', help='directory with ranking_debug.json', required=True)
    parser.add_argument('-v', help='verbose output', action='store_true')
    args = parser.parse_args()

    rdj = os.path.join(args.d, "ranking_debug.json")
    if not os.path.exists(rdj):
        sys.stderr.write(f"Error: {rdj} not found\n")
        sys.exit(1)

    # close the file as soon as it has been parsed
    with open(rdj, 'r') as f:
        d = json.load(f)

    # 'order' lists the model names best-first; 'plddts' maps name -> score
    bm = d['order'][0]
    pl = d['plddts'][bm]
    print("\t".join(map(str, [args.d, bm, pl])))


--------------------------------------------------------------------------------
/AlphaFold/ranking_debug.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "plddts": {
 3 |         "model_1": 87.45812934795958,
 4 |         "model_2": 84.52258185860828,
 5 |         "model_3": 87.78593692844834,
 6 |         "model_4": 87.99449366375426,
 7 |         "model_5": 85.32976078502992
 8 |     },
 9 |     "order": [
10 |         "model_4",
11 |         "model_3",
12 |         "model_1",
13 |         "model_5",
14 |         "model_2"
15 |     ]
16 | }


--------------------------------------------------------------------------------
/Bangers/JQ995537.faa.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:f1e079a13647da7641cb9fb2417984df8a7b70655dfc1d166a2a3baa84cf2aeb
3 | size 19527
4 | 


--------------------------------------------------------------------------------
/Bangers/README.md:
--------------------------------------------------------------------------------
 1 | # Bangers
 2 | 
 3 | `B`eyond `A` `N`ucleotide `G`enerated `E`valuation of `R`elationships between `S`equences.
 4 | 
 5 | This is a project of Mike and Rob, and you should probably ignore it.
 6 | 
 7 | If you want to play, compile like so:
 8 | 
 9 | ```
10 | gcc -Wall -o bangers bangers.c -lz
11 | ```
12 | 
13 | and run like so:
14 | 
15 | ```
16 | ./bangers JQ995537.faa.gz
17 | ```
18 | 
19 | It will convert amino acid strings into something else
20 | 


--------------------------------------------------------------------------------
/Flinders/__init_.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | 
 4 | __author__ = 'Rob Edwards'
 5 | 
 6 | from .substitution_rules import score
 7 | 
 8 | __all__ = [
 9 |         'scores'
10 | ]
11 | 


--------------------------------------------------------------------------------
/LizMetagenomes/envs/focus.yaml:
--------------------------------------------------------------------------------
name: focus
channels:
    - conda-forge
    - bioconda
    # conda's built-in channel set is called "defaults" (plural)
    - defaults
dependencies:
    - focus
8 | 


--------------------------------------------------------------------------------
/LizMetagenomes/envs/prinseq.yaml:
--------------------------------------------------------------------------------
name: prinseq-plus-plus
channels:
    - conda-forge
    - bioconda
    # conda's built-in channel set is called "defaults" (plural)
    - defaults
dependencies:
    - prinseq-plus-plus
8 | 


--------------------------------------------------------------------------------
/LizMetagenomes/envs/superfocus.yaml:
--------------------------------------------------------------------------------
name: superfocus
channels:
    - conda-forge
    - bioconda
    # conda's built-in channel set is called "defaults" (plural)
    - defaults
dependencies:
    - super-focus
8 | 


--------------------------------------------------------------------------------
/ModelSEED/json_keys.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Read some json files and print all keys
 3 | """
 4 | 
 5 | import os
 6 | import sys
 7 | import argparse
 8 | import json
 9 | 
10 | 
11 | __author__ = 'Rob Edwards'
12 | 
if __name__ == "__main__":
    # Print the top-level keys of a JSON object, one per line.
    parser = argparse.ArgumentParser(description=' ')
    parser.add_argument('-f', help='json file', required=True)
    parser.add_argument('-v', help='verbose output', action='store_true')
    args = parser.parse_args()

    # close the file as soon as it has been parsed
    with open(args.f, 'r') as fin:
        t = json.load(fin)

    print("{}".format("\n".join(t.keys())))
23 | 


--------------------------------------------------------------------------------
/ModelSEED/json_keys_keys.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Read some json files and print all keys
 3 | """
 4 | 
 5 | import os
 6 | import sys
 7 | import argparse
 8 | 
 9 | from roblib import bcolors
10 | import json
11 | 
12 | 
13 | __author__ = 'Rob Edwards'
14 | 
if __name__ == "__main__":
    # Print the union of the keys found in every value of a top-level JSON
    # object, one key per line.
    parser = argparse.ArgumentParser(description=' ')
    parser.add_argument('-f', help='json file', required=True)
    parser.add_argument('-v', help='verbose output', action='store_true')
    args = parser.parse_args()

    akeys = set()
    # close the file as soon as it has been parsed
    with open(args.f, 'r') as fin:
        t = json.load(fin)
    for k in t:
        akeys.update(t[k].keys())

    # sorted so that repeated runs print the keys in the same order
    print("{}".format("\n".join(sorted(akeys))))
27 | 


--------------------------------------------------------------------------------
/ModelSEED/json_list_keys.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Read some json files and print all keys
 3 | """
 4 | 
 5 | import os
 6 | import sys
 7 | import argparse
 8 | 
 9 | from roblib import bcolors
10 | import json
11 | 
12 | 
13 | __author__ = 'Rob Edwards'
14 | 
if __name__ == "__main__":
    # Print the union of the keys found in every element of a top-level JSON
    # list of objects, one key per line.
    parser = argparse.ArgumentParser(description=' ')
    parser.add_argument('-f', help='json file', required=True)
    parser.add_argument('-v', help='verbose output', action='store_true')
    args = parser.parse_args()

    akeys = set()
    # close the file as soon as it has been parsed
    with open(args.f, 'r') as fin:
        t = json.load(fin)
    for k in t:
        akeys.update(k.keys())

    # sorted so that repeated runs print the keys in the same order
    print("{}".format("\n".join(sorted(akeys))))
27 | 


--------------------------------------------------------------------------------
/PythonClass/parse_genbank.py:
--------------------------------------------------------------------------------
 1 | 
 2 | from Bio import SeqIO
 3 | 
 4 | seq = SeqIO.read('sequence.gb', 'genbank')
 5 | print(seq.id)
 6 | 
 7 | 
 8 | with open('features.tsv', 'w') as out:
 9 |     for feature in seq.features:
10 |         if 'locus_tag' in feature.qualifiers:
11 |             lt = feature.qualifiers['locus_tag'][0]
12 |         if 'product' in feature.qualifiers:
13 |             out.write(lt + "\t" + feature.qualifiers['product'][0] + "\n")


--------------------------------------------------------------------------------
/PythonClass/random_sequence.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Generate a random sequence
 3 | """
 4 | 
 5 | import os
 6 | import sys
 7 | import argparse
 8 | from random import randint
 9 | __author__ = 'Rob Edwards'
10 | 
11 | 
def random_sequence(maxlen):
    """
    Print a random DNA sequence of exactly maxlen bases on one line.

    Each base is drawn independently from A, G, C and T. Nothing is
    returned; the sequence is written to standard output followed by a
    newline.

    :param maxlen: number of bases to print
    :type maxlen: int
    """

    alphabet = ("A", "G", "C", "T")
    # one randint call per base, assembled first and printed in one go
    print("".join(alphabet[randint(0, 3)] for _ in range(maxlen)))
21 | 
22 | 
23 | 
24 | 
25 | 
if __name__ == "__main__":
    # Command line entry point: -n sets how many bases are printed.
    cli = argparse.ArgumentParser(description=' ')
    cli.add_argument('-n', default=1000, type=int,
                     help='maximum sequence length (default=1000)')
    cli.add_argument('-v', action='store_true', help='verbose output')
    options = cli.parse_args()

    random_sequence(options.n)
34 | 
35 | 
36 | 
37 | 
38 | 
39 | 
40 | 


--------------------------------------------------------------------------------
/RAST/RAST-alljobs.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl -w
 2 | #
 3 | 
 4 | use strict;
 5 | use Data::Dumper;
 6 | $ENV{SAS_SERVER}="PUBSEED";
 7 | print STDERR "SAS is $ENV{SAS_SERVER}\n";
 8 | use Term::ReadKey;
 9 | use RASTserver;
10 | 
11 | ## Use RAST test, not regular RAST
12 | # Now using regular RAST
13 | 
14 | 
15 | print "Please enter your RAST username:  ";
16 | my $user = ReadLine(0);
17 | chomp $user;
18 | 
19 | print "Please enter your RAST password:  ";
20 | ReadMode 2;
21 | my $password = ReadLine(0);
22 | chomp $password;
23 | ReadMode 1;
24 | print "\n";
25 | 
26 | 
27 | my $rast=new RASTserver($user, $password);
28 | unless (defined $rast) {die "Can't connect ot the rast server"}
29 | 
30 | print Dumper($rast->jobs());
31 | 


--------------------------------------------------------------------------------
/RAST/RAST-jobs.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl -w
 2 | #
 3 | #
 4 | 
 5 | use strict;
 6 | use RASTserver;
 7 | use Term::ReadKey;
 8 | use Data::Dumper;
 9 | $ENV{SAS_SERVER}="PUBSEED";
10 | 
11 | print "Please enter your RAST username:  ";
12 | my $user = ReadLine(0);
13 | chomp $user;
14 | 
15 | print "Please enter your RAST password:  ";
16 | ReadMode 2;
17 | my $password = ReadLine(0);
18 | chomp $password;
19 | ReadMode 1;
20 | print "\n";
21 | 
22 | my $rast=new RASTserver($user, $password);
23 | unless (defined $rast) {die "Can't connect ot the rast server"}
24 | 
# Dump every job and report on STDERR how many seconds have passed since
# the job list was requested.
my $time = time; my $job = 0;
my @jobs = $rast->jobs();

foreach my $j (@jobs) {
	print Dumper($j);
	# elapsed = now - start; the reverse order can never be positive
	print STDERR $job++, " : ", (time - $time), " seconds\n";
}
32 | 


--------------------------------------------------------------------------------
/RAST/RAST-status.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | 
 3 | use strict;
 4 | use RASTserver;
 5 | $ENV{SAS_SERVER}="SEED";
 6 | use Term::ReadKey;
 7 | 
 8 | die "$0 [-u username] [-p password] <list of jobs>" unless (defined $ARGV[0]);
 9 | 
10 | my @ids;
11 | my $user; my $password;
12 | while (@ARGV) {
13 | 	my $t=shift @ARGV;
14 | 	if ($t eq "-u") {$user = shift @ARGV}
15 | 	elsif ($t eq "-p") {$password = shift @ARGV}
16 | 	else {push @ids, $t}
17 | }
18 | 
19 | if (!$user) {
20 | 	print "Please enter your RAST username:  ";
21 | 	$user = ReadLine(0);
22 | 	chomp $user;
23 | }
24 | 
25 | if (!$password) {
26 | 	print "Please enter your RAST password:  ";
27 | 	ReadMode 2;
28 | 	$password = ReadLine(0);
29 | 	chomp $password;
30 | 	ReadMode 1;
31 | 	print "\n";
32 | }
33 | 
34 | 
35 | my $rast=new RASTserver($user, $password);
36 | unless (defined $rast) {die "Can't connect ot the rast server"}
37 | 
38 | my $stat = $rast->status_of_RAST_job({-job => \@ids});
39 | 
40 | foreach my $job (sort {$a <=> $b} keys %$stat) {
41 | 	print join("\t", $job, $stat->{$job}->{'status'}), "\n";
42 | }
43 | 


--------------------------------------------------------------------------------
/RAST/make_assigned_functions.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl 
 2 | #
 3 | 
 4 | use strict;
 5 | 
 6 | my $dir = shift || die "seed directory";
 7 | my $odir = shift;
 8 | unless ($odir) {$odir = $dir}
 9 | 
10 | if (-e "$dir/assigned_functions") {
11 | 	print STDERR "Backing up assigned_functions\n";
12 | 	`cp -f $dir/assigned_functions $dir/assigned_functions.bak`;
13 | }
14 | 
15 | my %fn;
16 | open(IN, "$dir/proposed_non_ff_functions") ||die "$! proposed_non_ff_functions";
17 | while(<IN>) {
18 | 	chomp;
19 | 	my @a=split /\t/;
20 | 	$fn{$a[0]}=$a[1];
21 | }
22 | close IN;
23 | 
24 | open(IN, "$dir/proposed_functions") ||die "$! proposed_functions";
25 | while(<IN>) {
26 | 	chomp;
27 | 	my @a=split /\t/;
28 | 	$fn{$a[0]}=$a[1];
29 | }
30 | close IN;
31 | 
32 | 
33 | 
34 | open(OUT, ">$odir/assigned_functions") || die "$! assigned_functions";
35 | map {print OUT "$_\t$fn{$_}\n"} sort {$a cmp $b} keys %fn;
36 | close OUT;
37 | 
38 | 
39 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | [![Edwards Lab](https://img.shields.io/badge/Bioinformatics-EdwardsLab-03A9F4)](https://edwards.sdsu.edu/research)
 2 | 
 3 | EdwardsLab
 4 | ==========
 5 | 
 6 | Code from the Edwards lab, including bioinformatics, image analysis and more. All this code is created and maintained by folks at Rob Edwards' bioinformatics lab at SDSU.
 7 | 
 8 | For more information about the Edwards lab see http://edwards.sdsu.edu/research.
 9 | 
10 | We use this repository to share our code and data with each other, and with the world.
11 | 
12 | We make all this code available to everyone to use. If you find errors or bugs please let Rob Edwards know ... see the above URL for contact information.
13 | 
14 | The [bin](bin) directory contains general scripts that we use on a day to day basis
15 | 


--------------------------------------------------------------------------------
/ViralBioinformaticsTools/README.md:
--------------------------------------------------------------------------------
 1 | # Viral Bioinformatics Tools
 2 | 
 3 | See [the Google Form](https://forms.gle/BaWcsAf6iqB7gkNGA) that you can fill in or the [submitted tools](https://docs.google.com/spreadsheets/d/1ClNgip08olKK-oBMMlPHBwIcilqSxsan8MEaYphUei4/edit?usp=sharing)
 4 | 
 5 | This data was generated from those forms.
 6 | 
 7 | See also the [Google Colab](https://colab.research.google.com/drive/1nsyMjnbjm_8AMR1FCubuTvNwU4KeqVQg?usp=sharing) notebook that uses this data.
 8 | 
 9 | 
10 | 


--------------------------------------------------------------------------------
/ViralBioinformaticsTools/git_hub_dates.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:f405738a0a23e7b44db74810ffcc20005b999fe20b83c34d85acfd63f685328a
3 | size 4725
4 | 


--------------------------------------------------------------------------------
/ViralBioinformaticsTools/proj_start_stop.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:85aaa5789b4c2e2e13f07661bcf7b0f29b15ac897dfb49fdc646ccf81f3216c2
3 | size 2522
4 | 


--------------------------------------------------------------------------------
/ViralBioinformaticsTools/viral_bioinformatics_tools.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:b8658d0fa7f99b9f3f205bd9aa2d9fd2f452f037c77edbae994b6eff242fee4f
3 | size 13631
4 | 


--------------------------------------------------------------------------------
/VirusDiscoveryProject/DataSelection/README.md:
--------------------------------------------------------------------------------
 1 | # Data Selection
 2 | 
 3 | Note: this is a temporary location; these files should be in the VirusDiscoveryProject repo.
 4 | 
 5 | Take a look at the [datasets](datasets.ipynb) jupyter notebook that summarizes everything.
 6 | 
 7 | The [WGS datasets](wgs_datasets.tsv.gz) file (*Note*: This is gzip compressed) has all the raw data. This is a `.tsv` file and so you can load it into a spreadsheet program.
 8 | 
 9 | The [random selection](random_selection.txt) file has a set of 1,000 metagenomes chosen at random.
10 | 
11 | The [size selection](size_selection.txt) file has 999 metagenome IDs chosen as small, medium, or large data sets.
12 | 
13 | The [phage selection](phage_size_selection.txt) file has 999 metagenome IDs chosen as small, medium, or large data sets that also have the largest number of phage fragments.
14 | 
15 | 
16 | We should use the data sets listed in the [phage selection](phage_size_selection.txt) file.
17 | 


--------------------------------------------------------------------------------
/VirusDiscoveryProject/DataSelection/wgs_datasets.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:7187ab967e1b6b4f141a3028b839d5850f9163b8eadd7225b92aa96c03aaab8c
3 | size 5003956
4 | 


--------------------------------------------------------------------------------
/assembly/mummerplot.yaml:
--------------------------------------------------------------------------------
 1 | # YAML config file for the mummerplot snakefile
 2 | 
 3 | #directory where the fasta files reside
 4 | fasta: fasta
 5 | 
 6 | # the directory for reverse complemented fasta files as required
 7 | fasta_rc: fasta_rc
 8 | 
 9 | # where to write the mummer output (*.mums)
10 | mummer_output: mummer
11 | 
12 | # where to write the png files
13 | mummer_png: mummer_png
14 | 
15 | # final montage output plot
16 | montage: "mummer_montage.png"
17 | 
18 | 
19 | 
20 | 


--------------------------------------------------------------------------------
/bam/bam2fasta.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | 
 3 | import pysam
 4 | 
 5 | __author__ = 'Rob Edwards'
 6 | 
 7 | 
 8 | def qual2fastq(quals):
 9 |     """
10 |     Convert a list of quality scores to a single fastq line
11 | 
12 |     :param quals: A list of quality scores
13 |     :type quals: list
14 |     :return: A fastq quality string
15 |     :rtype: str
16 |     """
17 |     quality = [chr(q) for q in quals]
18 |     return "".join(quality)
19 | 
if __name__ == '__main__':
    # Command line entry point: print every read in the bam file to STDOUT
    # as a fasta record (>name, then the sequence).
    parser = argparse.ArgumentParser(description='Convert bam to fasta')
    parser.add_argument('-b', help='bam file', required=True)
    # -v is a switch: without action='store_true' argparse insists on a value
    parser.add_argument('-v', help='verbose output', action='store_true')
    args = parser.parse_args()

    # the context manager closes the bam file when we are done
    with pysam.AlignmentFile(args.b, "rb") as bamfile:
        for read in bamfile.fetch(until_eof=True):
            print(">{}\n{}".format(read.query_name, read.query_sequence))
29 | 


--------------------------------------------------------------------------------
/bam/bam2fastq.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | 
 3 | import pysam
 4 | 
 5 | __author__ = 'Rob Edwards'
 6 | 
 7 | 
 8 | def qual2fastq(quals):
 9 |     """
10 |     Convert a list of quality scores to a single fastq line
11 | 
12 |     :param quals: A list of quality scores
13 |     :type quals: list
14 |     :return: A fastq quality string
15 |     :rtype: str
16 |     """
17 |     quality = [chr(q + 33) for q in quals]
18 |     return "".join(quality)
19 | 
if __name__ == '__main__':
    # Command line entry point: print every read in the bam file to STDOUT
    # as a four line fastq record.
    parser = argparse.ArgumentParser(description='Convert bam to fastq')
    parser.add_argument('-b', help='bam file', required=True)
    # -v is a switch: without action='store_true' argparse insists on a value
    parser.add_argument('-v', help='verbose output', action='store_true')
    args = parser.parse_args()

    # the context manager closes the bam file when we are done
    with pysam.AlignmentFile(args.b, "rb") as bamfile:
        for read in bamfile.fetch(until_eof=True):
            if read.query_qualities:
                print("@{}\n{}\n+\n{}".format(read.query_name, read.query_sequence, qual2fastq(read.query_qualities)))
            else:
                # no qualities stored for this read: emit an empty quality line
                print("@{}\n{}\n+\n".format(read.query_name, read.query_sequence))
32 | 


--------------------------------------------------------------------------------
/bam/list_reads.py:
--------------------------------------------------------------------------------
 1 | """
 2 | List all the reads that map to a bam file
 3 | """
 4 | 
 5 | import os
 6 | import sys
 7 | import argparse
 8 | import pysam
 9 | 
10 | parser = argparse.ArgumentParser(description="List all the reads in a bam file")
11 | parser.add_argument('-b', help='bam file', required=True)
12 | parser.add_argument('-v', help='verbose output', action="store_true")
13 | args = parser.parse_args()
14 | 
15 | bamfile = pysam.AlignmentFile(args.b, "rb")
16 | for read in bamfile.fetch(until_eof=True):
17 |     print(read.query_name)
18 | 


--------------------------------------------------------------------------------
/bin/Makefile:
--------------------------------------------------------------------------------
 1 | # A simple Makefile to compile the C code here
 2 | #
 3 | 
 4 | FLAGS := $(FLAGS) -Wall -O3
 5 | override CFLAGS += $(shell pkg-config --cflags --libs python3-embed)
 6 | 
 7 | all:
 8 | 	gcc $(FLAGS) -o fastq_avqual fastq_avqual.c -lz
 9 | 	gcc $(FLAGS) -o count_fasta count_fasta.c -lz
10 | 	# c++ $(FLAGS) -o fq2fa fastq2fasta.cpp 
11 | 	gcc $(FLAGS) -o fastq2fasta fastq2fasta.c -lz
12 | 	gcc $(FLAGS) -o count_fastq count_fastq.c -lz
13 | 	gcc $(FLAGS) -o fastg2gfa fastg2gfa.c -lz
14 | 	gcc $(FLAGS) -o fasta_split fasta_split.c -lz
15 | 
16 | clean:
17 | 	rm -f fastq_avqual count_fasta fastq2fasta count_fastq fastg2gfa fasta_split
18 | 
19 | 


--------------------------------------------------------------------------------
/bin/all_4mers.py:
--------------------------------------------------------------------------------
 1 | """
 2 | print all 4 mers
 3 | """
 4 | 
 5 | import os
 6 | import sys
 7 | import argparse
 8 | 
 9 | from roblib import bcolors
10 | import itertools
11 | 
alphabet = ['A', 'C', "G", 'T']
# print every 4-mer, sixteen to a row, each followed by a space
for position, kmer in enumerate(itertools.product(alphabet, repeat=4)):
    if position and position % 16 == 0:
        # start a new row after every sixteenth 4-mer
        print()
    print("".join(kmer), end=' ')
print()


--------------------------------------------------------------------------------
/bin/average_quality_scores.pl:
--------------------------------------------------------------------------------
 1 | #__perl__
 2 | #
 3 | # Calculate the average quality score of one or more files
 4 | #
 5 | # Usage: average_quality_scores.pl [-l] <list of qual files>
 6 | #
 7 | # -l initiates printing all scores for each file otherwise
 8 | # a summary is produced
 9 | 
use strict;
use Rob;
my $rob = new Rob;
# running totals over every file: the sum of all scores and the number of scores
my ($total, $n)=(0,0);
# smallest and largest per-sequence averages seen so far
my ($min, $max)=(10000, 0);

my $printall = 0;

foreach my $f (@ARGV) {
	# -l only affects the files listed after it
	if ($f eq "-l") {$printall = 1; next}
	my $qu = $rob->read_fasta($f, 1);
	foreach my $id (keys %$qu) {
		my @qual = split /\s+/, $qu->{$id};
		# a record with no scores would cause a division by zero below
		next unless (@qual);
		my $t=0;
		$t += $_ foreach (@qual);
		my $av = $t/scalar(@qual);
		$printall && print "$id\t$av\n";
		$total+=$t;
		$n+=scalar(@qual);
		$max = $av if ($av > $max);
		$min = $av if ($av < $min);
	}
}

# nothing to average: say so rather than dying with "Illegal division by zero"
die "No quality scores found\n" unless ($n);
print "TOTAL: $total NBASES: $n MINIMUM: $min MAXIMUM: $max AVERAGE: ", $total/$n, "\n";
34 | 
35 | 


--------------------------------------------------------------------------------
/bin/blast2seq.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | 
4 | from roblib import read_fasta
5 | import argparse
6 | __author__ = 'Rob Edwards'
7 | 
8 | 
9 | 


--------------------------------------------------------------------------------
/bin/checkR1R2.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | 
 4 | if [[ $# != 1 ]]; then echo "$0  <directory of fastq files>" >&2; exit; fi
 5 | 
 6 | # Check that we have an R2 for every R1 and vice-versa
 7 | 
 8 | for R1 in $(find $1 -name \*R1\*); do
 9 | 	R2=${R1/R1/R2};
10 | 	if [[ ! -e $R2 ]]; then echo "Not found: $R2 for associated R1: $R1" >&2; fi
11 | done
12 | 
13 | 
14 | for R2 in $(find $1 -name \*R2\*); do
15 | 	R1=${R2/R2/R1};
16 | 	if [[ ! -e $R1 ]]; then echo "Not found: $R1 for associated R2: $R2" >&2; fi
17 | done
18 | 
19 | 


--------------------------------------------------------------------------------
/bin/cpgs.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Read a fastq file and count CpGs
 3 | """
 4 | 
 5 | import os
 6 | import sys
 7 | import argparse
 8 | 
 9 | from roblib import bcolors
10 | from roblib import stream_fastq
11 | 
def countcpgs(fqfile):
    """
    Tally how many sequences have each number of CG dinucleotides
    :param fqfile: the fastq file
    :return: a dict whose keys are the number of CGs in a sequence and whose
             values are the number of sequences with that many CGs
    """

    tally = {}
    for _seqid, _header, seq, _qual in stream_fastq(fqfile):
        n_cg = seq.count('CG')
        if n_cg in tally:
            tally[n_cg] += 1
        else:
            tally[n_cg] = 1
    return tally
24 | 
25 | 
26 | if __name__ == '__main__':
27 |     parser = argparse.ArgumentParser(description='Count CGs in a fastq file')
28 |     parser.add_argument('-f', help='fastq file', required=True)
29 |     parser.add_argument('-v', help='verbose output', action='store_true')
30 |     args = parser.parse_args()
31 | 
32 |     count = countcpgs(args.f)
33 |     for c in sorted(list(count.keys())):
34 |         print(f"{c}\t{count[c]}")
35 | 


--------------------------------------------------------------------------------
/bin/crc64.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Calculate the crc64 checksum of a fasta sequence.
 3 | """
 4 | 
 5 | import os
 6 | import sys
 7 | import argparse
 8 | from crc64iso.crc64iso import crc64
 9 | from roblib import stream_fasta
10 | 
11 | __author__ = 'Rob Edwards'
12 | 
if __name__ == "__main__":
    # Command line entry point: print the id and crc64 checksum of every
    # sequence in the fasta file, tab separated.
    parser = argparse.ArgumentParser(description=' ')
    parser.add_argument('-f', help='fasta file', required=True)
    parser.add_argument('-o', help='output file (default -)', default=sys.stdout)
    parser.add_argument('-v', help='verbose output', action='store_true')
    args = parser.parse_args()

    # args.o is either the sys.stdout default or a string from the command
    # line; "-" is treated as STDOUT, as the help text promises
    if isinstance(args.o, str) and args.o != '-':
        out = open(args.o, 'w')
    else:
        out = sys.stdout

    try:
        for seqid, seq in stream_fasta(args.f):
            print(f"{seqid}\t{crc64(seq)}", file=out)
    finally:
        # only close what we opened ourselves
        if out is not sys.stdout:
            out.close()


--------------------------------------------------------------------------------
/bin/dump_all_tables.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import sqlite3
 3 | import pandas as pd
 4 | 
 5 | 
 6 | def to_csv(filename):
 7 |     db = sqlite3.connect(filename)
 8 |     cursor = db.cursor()
 9 |     cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
10 |     tables = cursor.fetchall()
11 |     for table_name in tables:
12 |         table_name = table_name[0]
13 |         table = pd.read_sql_query("SELECT * from %s" % table_name, db)
14 |         table.to_csv(table_name + '.csv', index_label='index', encoding='utf-8')
15 | 
16 | 
17 | if __name__ == '__main__':
18 |     parser = argparse.ArgumentParser(description='Dump the contents of an SQL file to CSV. This was taken from http://stackoverflow.com/questions/305378/get-list-of-tables-db-schema-dump-etc-in-sqlite-databases')
19 |     parser.add_argument('-d', help='SQLlite database file', required=True)
20 |     args = parser.parse_args()
21 |     to_csv(args.d)
22 | 


--------------------------------------------------------------------------------
/bin/extract.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import subprocess
 3 | import sys
 4 | 
 5 | __author__ = 'Rob Edwards'
 6 | 
 7 | print('CWD'  + os.getcwd())
 8 | 
 9 | fname =  os.path.join(os.environ['HOME'], 'Dropbox/Metagenomics/51.hits_small.fa.lrz')
10 | 
11 | f = subprocess.Popen(['/usr/bin/lrunzip', '-q', '-d', '-f', '-o-', fname], stdout=subprocess.PIPE).stdout
12 | 
13 | for l in f:
14 |     print("READ: {}".format(l))


--------------------------------------------------------------------------------
/bin/extract_fasta_sequence.pl:
--------------------------------------------------------------------------------
 1 | =pod
 2 | 
 3 | Extract one or more sequences from a fasta file, but not using much memory
 4 | 
 5 | =cut
 6 | 
 7 | use strict;
 8 | 
 9 | unless ($#ARGV >= 1) {
10 | 	die "$0 <fasta fiile> [list of sequences to extract]\n";
11 | }
12 | 
13 | my $faf = shift;
14 | my %want;
15 | map {$want{$_}=1} @ARGV;
16 | 
17 | if ($faf =~ /.gz$/) {
18 | 	open(IN, "gunzip -c $faf|") || die "Can't open a pipe to $faf";
19 | } else {
20 | 	open(IN, $faf) || die "$! $faf";
21 | }
22 | my $p=0;
23 | while (<IN>) {
24 | 	if (index($_, ">") == 0) {
25 | 		$p = 0;
26 | 		if (/^>(\S+)/ && $want{$1}) {$p=1}
27 | 	}
28 | 	print if ($p);
29 | }
30 | close IN;
31 | 
32 | 
33 | 
34 | 


--------------------------------------------------------------------------------
/bin/factorial.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Calculate some factorials
 3 | """
 4 | 
 5 | import os
 6 | import sys
 7 | 
 8 | 
 9 | def factorial(n):
10 |     if n == 2:
11 |         return 2
12 |     return n * factorial(n-1)
13 | 
14 | 
15 | for n in range(2,100):
16 |     if factorial(n) > 100000 ** n:
17 |         print("{}\tFACTORIAL".format(n))
18 |     else:
19 |         print("{}\texponent".format(n))
20 | #    print("{}\t{}\t{}".format(n, factorial(n), 100000 ** n))


--------------------------------------------------------------------------------
/bin/fasta2sequence.pl:
--------------------------------------------------------------------------------
 1 | =pod
 2 | 
 3 | Extract one or more sequences from a fasta file
 4 | 
 5 | =cut
 6 | 
 7 | use strict;
 8 | use Data::Dumper;
 9 | use Rob;
10 | my $rob = new Rob;
11 | 
12 | unless ($#ARGV >= 1) {
13 | 	die "$0 <fasta fiile> [list of sequences to extract]\n";
14 | }
15 | 
16 | my $faf = shift;
17 | my $fa = $rob->read_fasta($faf);
18 | foreach my $seq (@ARGV) {
19 | 	if (!defined $fa->{$seq}) {
20 | 		print STDERR "ERROR: $seq not found in $faf\n";
21 | 		next;
22 | 	}
23 | 	print ">$seq\n", $fa->{$seq}, "\n";
24 | }
25 | 
26 | 
27 | 
28 | 
29 | 


--------------------------------------------------------------------------------
/bin/fastapercent.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl -w
 2 | 
 3 | # look through a fasta file and figure out what percent is done
 4 | 
 5 | use strict;
 6 | 
 7 | my ($file, $tag)=@ARGV;
 8 | unless ($file && $tag) {die "$0 <fasta file> <fasta tag (without '>')>"}
 9 | my ($c, $s)=(0,0);
10 | open(IN, $file) || die "can't open $file";
11 | while (<IN>) {
12 |  next unless (/^>/);
13 |  (/^>$tag\s+/) ? eval {$s=$c} : 1;
14 |  $c++;
15 | }
16 | 
17 | print "$tag is at $s, and the file is $c. We have done ", int(($s/$c)*100000)/1000, " percent\n";
18 | 


--------------------------------------------------------------------------------
/bin/fastq2fasta.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Convert a fastq file to a fasta file. Note in this case I just ignore the quality scores!
 3 | """
 4 | 
 5 | import os
 6 | import sys
 7 | import argparse
 8 | from roblib import stream_fastq
 9 | 
10 | __author__ = 'Rob Edwards'
11 | 
12 | if __name__ == "__main__":
13 |     parser = argparse.ArgumentParser(description=' ')
14 |     parser.add_argument('-f', help='input fastq file', required=True)
15 |     parser.add_argument('-o', help='output fasta file', required=True)
16 |     args = parser.parse_args()
17 | 
18 |     with open(args.o, 'w') as out:
19 |         for (sid, label, seq, qual) in stream_fastq(args.f):
20 |             out.write(">{}\n{}\n".format(sid, seq))
21 | 


--------------------------------------------------------------------------------
/bin/fastq_avqual.c:
--------------------------------------------------------------------------------
 1 | /* Average the quality scores in a fastq file 
 2 |  *
 3 |  * Rob Edwards, 10/12/21
 4 |  *
 5 |  */
 6 | 
 7 | 
 8 | #include <zlib.h>
 9 | #include <stdio.h>
10 | #include <stdlib.h>
11 | #include "kseq.h"
12 | 
13 | KSEQ_INIT(gzFile, gzread)
14 | 
15 | #define table_size 10000
16 | 
17 | int main(int argc, char *argv[]) {
18 | 
19 | 	if ( argc < 2) {
20 | 		printf("Usage: %s <fastq file (use - to read from STDIN)>\n", argv[0]);
21 | 		return 1;
22 | 	}
23 | 	gzFile fp;
24 | 	kseq_t *seq;
25 | 	int c=0;
26 | 	long total=0;
27 | 	long n=0;
28 | 
29 | 	fp = gzopen(argv[1], "r");
30 | 	seq = kseq_init(fp);
31 | 	int l;
32 | 	while ((l = kseq_read(seq)) >= 0) {
33 | 		c++;
34 | 		for (int i = 0; i < strlen(seq->qual.s); i++) {
35 | 			total+= (int) seq->qual.s[i];
36 | 			n++;
37 | 		}
38 | 	}
39 | 	kseq_destroy(seq);
40 | 	gzclose(fp);
41 | 	printf("File\tNumber of sequences\tTotal bp\tTotal quality\tAverage quality\n");
42 | 	printf("%s\t%d\t%ld\t%ld\t%ld\n", argv[1], c, n, total, total/n);
43 | 
44 | }
45 | 
46 | 


--------------------------------------------------------------------------------
/bin/filter_fasta_length.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Stream a fasta file and print it out
 3 | """
 4 | 
 5 | import os
 6 | import sys
 7 | import argparse
 8 | from roblib import sequences
 9 | if __name__ == '__main__':
10 |     parser = argparse.ArgumentParser(description="stream the contents of a fasta file")
11 |     parser.add_argument('-f', help='file to stream', required=True)
12 |     parser.add_argument('-m', help='minimum sequence length', type=int, default=1000)
13 |     args = parser.parse_args()
14 | 
15 |     for (seqid, seq) in sequences.stream_fasta(args.f):
16 |         if len(seq) > args.m:
17 |             print(">{}\n{}".format(seqid, seq))
18 | 


--------------------------------------------------------------------------------
/bin/filter_seq_by_length.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Filter a fasta file for sequences longer than a specified length
 3 | """
 4 | 
 5 | import os
 6 | import sys
 7 | import argparse
 8 | from roblib import sequences
 9 | 
10 | __author__ = 'Rob Edwards'
11 | 
if __name__ == "__main__":
    # Command line entry point: print (as fasta) every sequence in the file
    # that is at least as long as the length given with -l.
    parser = argparse.ArgumentParser(description='Filter a fasta file for sequences longer than a specified length')
    parser.add_argument('-f', help='fasta file', required=True)
    # required: without a value args.l is None and the comparison below raises a TypeError
    parser.add_argument('-l', help='Minimum length to filter on (seq >= this number)', type=int, required=True)
    args = parser.parse_args()

    for seqid, seq in sequences.stream_fasta(args.f):
        if len(seq) >= args.l:
            print(">{}\n{}".format(seqid, seq))


--------------------------------------------------------------------------------
/bin/genbank2fna.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl -w
 2 | 
 3 | # get DNA out of a genbank file
 4 | 
 5 | use Bio::SeqIO;
 6 | use strict;
 7 | 
 8 | my $usage=<<EOF;
 9 | $0 <list of genbankfiles>
10 | 
11 | EOF
12 | 
13 | die $usage unless ($ARGV[0]);
14 | 
15 | foreach my $file (@ARGV)
16 | {
17 | 	my $of = $file;
18 | 	$of =~ s/\.gbk/.fasta/;
19 | 	$of =~ s/\.genbank/.fasta/;
20 | 	if ($of eq $file) {$of .= ".fasta"}
21 |         my $sin=Bio::SeqIO->new(-file=>$file, -format=>'genbank');
22 | 	my $sout = Bio::SeqIO->new(-file=>">$of", -format=>'fasta');
23 | 	while (my $seq = $sin->next_seq()) {
24 | 		$sout->write_seq($seq);
25 | 	}
26 | }
27 | 
28 | 


--------------------------------------------------------------------------------
/bin/genbank_count_motifs.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Count the occurrence of motifs in a genbank file
 3 | """
 4 | 
 5 | import os
 6 | import sys
 7 | import argparse
 8 | from roblib import genbank_seqio, rc
 9 | 
10 | 
11 | __author__ = 'Rob Edwards'
12 | 
if __name__ == "__main__":
    # Command line entry point: for every sequence in the genbank file print
    # how often the motif, and its reverse complement, occur.
    parser = argparse.ArgumentParser(description='Count the motif and its occurrence in a sequence')
    parser.add_argument('-f', help='input genbank file', required=True)
    # required: without a motif args.m is None and .upper() below raises an AttributeError
    parser.add_argument('-m', help='motif to look for', type=str, required=True)
    parser.add_argument('-v', help='verbose output', action='store_true')
    args = parser.parse_args()

    # compare in upper case so that the search is case insensitive
    motif = args.m.upper()

    print("File\tContig\tLength\tNumber matches")
    for seq in genbank_seqio(args.f, args.v):
        dna = seq.seq.upper()
        # matches to the motif plus matches to its reverse complement
        count = dna.count(motif)
        count += dna.count(rc(motif))
        print(f"{args.f}\t{seq.id}\t{len(dna)}\t{count}")
28 | 


--------------------------------------------------------------------------------
/bin/genbanktable2fasta.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl -w
 2 | #
 3 | 
 4 | use strict;
 5 | 
 6 | # convert a genbanktable to fasta using either contig/start/end/strand or gene id
 7 | 
 8 | use Getopt::Std;
 9 | my %opts;
10 | getopts('lg:i', \%opts);
11 | unless (($opts{i} || $opts{l}) && $opts{g}) {
12 | 	die <<EOF;
13 | $0
14 | -g genbank table file (required)
15 | -l use location (conting, start, stop) as ID
16 | -i use gene id as ID
17 | 
18 | either i or l must be specified (but not both)
19 | EOF
20 | }
21 | 
22 | if ($opts{l} && $opts{i}) {die "Not both -i and -l"}
23 | 
24 | open(IN, $opts{g}) || die "$! $opts{g}";
25 | while (<IN>) {
26 | 	chomp;
27 | 	my @a=split /\t/;
28 | 	my $header;
29 | 	if ($opts{i}) {
30 | 		$header = ">$a[6]\n";
31 | 	} else {
32 | 		my ($b, $e, $strand) =  ($a[7], $a[8], $a[9]);
33 | 		if ($strand < 0) {($b, $e) = ($e, $b)}
34 | 		$header = ">$a[0]_${b}_${e}\n";
35 | 	}
36 | 	unless ($header) {print STDERR "Can't construct a header in $_\n"}
37 | 	print $header, $a[10], "\n";
38 | }
39 | 


--------------------------------------------------------------------------------
/bin/get_genbank.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | use strict;
 3 | use LWP::Simple;
 4 | use Data::Dumper;
 5 | 
 6 | my $id = shift || die "$0 <genbank id>";
 7 | 
 8 | 
 9 | open(OUT, ">${id}_sequences.gbk") || die "Can't write to ${id}_sequences.gbk";
10 | my $url='http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&retmode=text&rettype=gb&id=' . $id;
11 | 
12 | print OUT get($url);
13 | print "$url\n";
14 | exit 0;
15 | 
16 | 


--------------------------------------------------------------------------------
/bin/get_genbank_batch_proteins.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | use strict;
 3 | use LWP::Simple;
 4 | use Data::Dumper;
 5 | 
 6 | unless ($ARGV[0]) {die "$0 <file of ids> <number to get (default = 100)" }
 7 | open(IN, $ARGV[0]) || die "Can't open $ARGV[0]";
 8 | my @ids;
 9 | while (<IN>) {
10 | 	chomp;
11 | 	push @ids, $_;
12 | }
13 | close IN;
14 | my $n=100;
15 | if ($ARGV[1]) {$n=$ARGV[1]}
16 | 
17 | my $time=time-10;
18 | my $url='http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&retmode=text&rettype=gb&tool=robsgetter&email=rob@salmonella.org&id=';
19 | open(OUT, ">proteins.gbk") || die "Can't write to proteins.gbk";
20 | my $total=0;
21 | while (@ids) {
22 | 	my @pieces = splice(@ids, 0, $n);
23 | 	$total+=scalar(@pieces);
24 | 	my $urlid=$url . join(",", @pieces);
25 | 	while (time-$time < 1.5) {sleep 1}
26 | 	print STDERR "Getting upto $total\n";
27 | 	$time=time;
28 | 	print OUT get($urlid);
29 | }
30 | close OUT;
31 | 
32 | exit 0;
33 | 
34 | 


--------------------------------------------------------------------------------
/bin/get_lastlogs.sh:
--------------------------------------------------------------------------------
 1 | DATE=`date +'%Y%m%d'`
 2 | for i in anthill.sdsu.edu edwards-data.sdsu.edu rambox phantome.org edwards-dna; do
 3 | 	echo $i;
 4 | 	ssh $i 'lastlog' > $i.$DATE.lastlog
 5 | done
 6 | 
 7 | 
 8 | python2.7 ~/EdwardsLab/bin/merge_last_logs.py -l anthill.sdsu.edu.$DATE.lastlog -l edwards-data.sdsu.edu.$DATE.lastlog -l rambox.$DATE.lastlog -l phantome.org.$DATE.lastlog -l edwards-dna.$DATE.lastlog > lastlog.$DATE
 9 | 
10 | 


--------------------------------------------------------------------------------
/bin/getopt.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <fstream>
 3 | #include <string>
 4 | #include <cstring>
 5 | #include <getopt.h>
 6 | using namespace std;
 7 | 
 8 | int main (int argc, char* argv[]) {
 9 | 	if ( argc < 3 ) {
10 | 		cerr << "Please add some options\n";
11 | 		return 1;
12 | 	}
13 | 
14 | 	int replace = 0;
15 | 	for (;;) {
16 | 		switch(getopt(argc, argv, "r")) {
17 | 			default:
18 | 				cerr << " fount " << optarg << "\n";
19 | 			case -1:
20 | 				cerr << "Found -1\n";
21 | 				break;
22 | 			case 'r':
23 | 				cout << "Flag r set";
24 | 				replace = 1;
25 | 				continue;
26 | 		}
27 | 		break;
28 | 	}
29 | 	cerr << "argc: " << argc << "\n";
30 | 	cerr << "Optind: " << optind << "\n";
31 | 	if (optind +2 != argc) {
32 | 		cerr << " <fastq file> <fasta file>";
33 | 		return 1;
34 | 	}
35 | 	char* fqf = argv[optind];
36 | 	char* faf = argv[optind+1]; 
37 | 	cout << "Fastq: " << fqf << " Fasta: " << faf << "\n";
38 | }
39 | 
40 | 
41 | 
42 | 
43 | 


--------------------------------------------------------------------------------
/bin/gfa2fasta.sh:
--------------------------------------------------------------------------------
# Print the lines of a gfa file that start with S as fasta records named <gfa file>_<column 2>

# exit status for a usage error ($E_BADARGS was used without ever being set)
E_BADARGS=65

if [ -z "$1" ]; then 
	# usage goes to STDERR because STDOUT is meant to be redirected to the fasta file
	echo "$0 <gfa file>  >  <fasta file>" >&2
	exit $E_BADARGS
fi

# quote the file name so that paths with spaces work
awk -v id="$1" '/^S/{print ">"id"_"$2"\n"$3}' "$1"
7 | 


--------------------------------------------------------------------------------
/bin/json_validator.py:
--------------------------------------------------------------------------------
 1 | """
 2 | A very simple JSON validator. We read the JSON file in and print it out using pprint.
 3 | """
 4 | 
 5 | import sys
 6 | import json 
 7 | import pprint
 8 | __author__ = 'Rob Edwards'
 9 | 
10 | 
11 | h = "A very simple JSON validator. We read the JSON file in and print it out using pprint.\n"
12 | h += f"\nUsage: {sys.argv[0]} <json file>\n"
13 | 
14 | 
15 | if len(sys.argv) < 2:
16 |     sys.exit(h)
17 | 
18 | 
19 | pp = pprint.PrettyPrinter(indent=4)
20 | 
21 | with open(sys.argv[1], 'r') as f:
22 |     j = json.load(f)
23 | 
24 | pp.pprint(j)
25 | 
26 | 


--------------------------------------------------------------------------------
/bin/merge_pdf.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Merge multiple pdf files into a single pdf file (AllDocs.pdf)
 3 | """
 4 | 
 5 | import os
 6 | import sys
 7 | import argparse
 8 | import PyPDF2 as PDF
 9 | 
# Merge every existing .pdf file named on the command line, in the order
# given, and write the result to AllDocs.pdf in the current directory.
merger = PDF.PdfFileMerger(strict=False)
# sys.argv[0] is this script, so only look at the real arguments
for f in sys.argv[1:]:
    if f.endswith('.pdf') and os.path.exists(f):
        sys.stderr.write("Adding {}\n".format(f))
        # the second positional parameter of PdfFileReader is `strict`, not
        # a file mode, so pass it by name and match the merger's setting
        merger.append(PDF.PdfFileReader(f, strict=False))
    else:
        sys.stderr.write("Skipped {}\n".format(f))

merger.write("AllDocs.pdf")
merger.close()
19 | 


--------------------------------------------------------------------------------
/bin/parsebz2xml.py:
--------------------------------------------------------------------------------
 1 | """
 2 | 
 3 | """
 4 | 
 5 | import os
 6 | import sys
 7 | import argparse
 8 | import bz2
 9 | from bs4 import BeautifulSoup
10 | 
11 | 
12 | 
13 | # with bz2.BZ2File('biosample_set.xml.bz2', 'r') as input:
14 | #     for i in range(10):
15 | #         l = input.readline()
16 | #         print("{}\n".format(l))
17 | 
18 | 
19 | # <Id db="BioSample" is_primary="1">SAMN00000002</Id>\n'
20 | 
def primaryId(tag):
    """
    Filter function for BeautifulSoup searches that accepts primary BioSample ids

    .get() is used rather than [] so that a tag without a db or is_primary
    attribute is simply rejected instead of raising a KeyError.

    :param tag: the tag to test
    :return: a true value if db is 'BioSample' and is_primary is set, otherwise a false value
    """
    # NOTE(review): any non-empty is_primary value is accepted, including "0" - confirm
    return tag.get('db') == 'BioSample' and tag.get('is_primary')
23 | 
24 | 
25 | with bz2.BZ2File('biosample_set.xml.bz2', 'r') as input:
26 |     soup = BeautifulSoup(input, 'xml')
27 |     pi = soup.find_next(primaryId)
28 |     print("{}".format(pi))
29 | 


--------------------------------------------------------------------------------
/bin/pdb2fa.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Convert a PDB file to a fasta file
 3 | 
 4 | Taken from https://www.biostars.org/p/435629/
 5 | """
 6 | 
 7 | import os
 8 | import sys
 9 | from Bio import SeqIO
10 | import argparse
11 | 
12 | if __name__ == "__main__":
13 |     parser = argparse.ArgumentParser(description='Convert a PDB file to a fasta file')
14 |     parser.add_argument('-p', help='input PDB file', required=True)
15 |     parser.add_argument('-f', help='output fasta file', required=True)
16 |     parser.add_argument('-v', help='verbose output', action='store_true')
17 |     args = parser.parse_args()
18 | 
19 |     with open(args.p, 'r') as pdb_file, open(args.f, 'w') as fasta_file:
20 |         for record in SeqIO.parse(pdb_file, 'pdb-atom'):
21 |             if record.id.startswith("???"):
22 |                 print(f">{args.p}\n{record.seq}", file=fasta_file)
23 |             else:
24 |                 print(f">{record.id}\n{record.seq}", file=fasta_file)
25 | 
26 | 
27 | 


--------------------------------------------------------------------------------
/bin/rc.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl -w
 2 | #
 3 | 
 4 | use strict;
 5 | use Rob;
 6 | my $r = new Rob;
 7 | 
 8 | my $f = shift || die "Fasta file to reverse complement?";
 9 | my $fa = $r->read_fasta($f);
10 | foreach my $id (keys %$fa) {
11 | 	print ">$id\n", $r->rc($fa->{$id}), "\n";
12 | }
13 | 


--------------------------------------------------------------------------------
/bin/rename_fasta.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Rename the sequences in a fasta file
 3 | 
 4 | If you give an optional -r the sequences will be renamed with that, otherwise with the file name
 5 | """
 6 | 
 7 | import os
 8 | import sys
 9 | import argparse
10 | from roblib import stream_fasta
11 | 
12 | __author__ = 'Rob Edwards'
13 | 
14 | if __name__ == "__main__":
15 |     parser = argparse.ArgumentParser(description='Rename the sequences in a fasta file')
16 |     parser.add_argument('-f', help='input fasta file', required=True)
17 |     parser.add_argument('-r', help='string to rename the seqids to. Default=use the file name')
18 |     parser.add_argument('-v', help='verbose output', action='store_true')
19 |     args = parser.parse_args()
20 | 
21 |     if args.r:
22 |         ren = args.r
23 |     else:
24 |         ren = args.f
25 | 
26 |     counter = 0
27 |     for seqid, seq in stream_fasta(args.f):
28 |         counter += 1
29 |         print(f">{ren}_{counter}\n{seq}")
30 | 


--------------------------------------------------------------------------------
/bin/renumber_fasta.pl:
--------------------------------------------------------------------------------
 1 | use strict;
 2 | use Getopt::Std;
 3 | use Data::Dumper;
 4 | use Rob;
 5 | my $rob = new Rob;
 6 | 
 7 | my $counter = 0;
 8 | 
 9 | my $usage = <<EOF;
10 | $0 <id.map file> <output file> <list of fasta files>
11 | id.map and output file must not exist (will not be automatically overwritten)
12 | list of fasta files can be long
13 | EOF
14 | 
15 | my $idf = shift || die $usage;
16 | my $out = shift || die $usage;
17 | 
18 | if (-e $idf) {die "$idf exists.\n$usage"}
19 | if (-e $out) {die "$out exists.\n$usage"}
20 | 
21 | open(IDF, ">$idf") || die "Can't write $idf";
22 | open(OUT, ">$out") || die "Can't write $out";
23 | 
24 | foreach my $f (@ARGV) {
25 | 	my $fa = $rob->read_fasta($f);
26 | 	foreach my $seqid (keys %$fa) {
27 | 		$counter++;
28 | 		print OUT ">$counter\n", $fa->{$seqid}, "\n";
29 | 		print IDF "$f\t$seqid\t$counter\n";
30 | 	}
31 | }
32 | 
33 | close(IDF);
34 | close(OUT);
35 | 


--------------------------------------------------------------------------------
/bin/riddler.py:
--------------------------------------------------------------------------------
 1 | """
 2 | An answer to http://fivethirtyeight.com/features/how-long-will-your-smartphone-distract-you-from-family-dinner/
 3 | """
 4 | 
 5 | from random import choice
 6 | 
 7 | if __name__ == '__main__':
 8 |     tasks = [1,2,3,4,5]
 9 |     total = []
10 |     for i in range(1000):
11 |         mine = choice(tasks)
12 |         sisters = choice(tasks)
13 |         while mine != sisters:
14 |             mine += choice(tasks)
15 |             sisters += choice(tasks)
16 |         total.append(mine)
17 |     print(1.0 * sum(total)/len(total))
18 | 
19 | 


--------------------------------------------------------------------------------
/bin/separatemultifasta.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl -w
 2 | 
 3 | # rewritten with BioPerl
 4 | 
 5 | use strict;
 6 | use Bio::SeqIO;
 7 | 
 8 | my $file=shift || die "$0 <fasta file>";
 9 | my $dir=$file.".files";
10 | if (-e $dir) {die "$dir already exists. Not overwriting"}
11 | # stop now if the directory can not be made, rather than failing on every sequence
12 | mkdir($dir, 0755) || die "Can't create $dir: $!";
13 | 
14 | # Convert a sequence ID into the base name of its output file: pipes and
15 | # slashes become underscores and one trailing underscore is removed.
16 | sub id2filename {
17 |  my $fn = shift;
18 |  $fn =~ s/\|/_/g;
19 |  $fn =~ s/\_$//;
20 |  $fn =~ s/\//_/g;
21 |  return $fn;
22 | }
23 | 
24 | # File names already written. This is keyed on the file name and not on the
25 | # ID, because two different IDs (e.g. "a|b" and "a_b") can give the same file
26 | # name and the second sequence would silently overwrite the first.
27 | my %seen;
28 | my $sio=Bio::SeqIO->new(-file=>$file, -format=>"fasta");
29 | while (my $seq=$sio->next_seq)
30 | {
31 |  my $id=$seq->id;
32 |  $id =~ s/\s+/_/g;
33 |  my $outfn = id2filename($id);
34 |  # keep appending "1" to the ID until the resulting file name is unused
35 |  while ($seen{$outfn}) {
36 |   print "$id already written, ";
37 |   $id.="1";
38 |   print " now trying $id\n";
39 |   $outfn = id2filename($id);
40 |  }
41 |  $seen{$outfn}=1;
42 |  my $fout=Bio::SeqIO->new(-file=>">$dir/$outfn.fasta", -format=>"fasta");
43 |  $fout->write_seq($seq);
44 | }


--------------------------------------------------------------------------------
/bin/sge_summary.pl:
--------------------------------------------------------------------------------
 1 | use strict;
 2 | 
 3 | # Tally the lines printed by qstat by two of their whitespace-separated
 4 | # fields and print the tallies as a tab-separated table.
 5 | # NOTE(review): which qstat columns fields 2 and 4 are depends on whether a
 6 | # line starts with whitespace -- confirm against real qstat output.
 7 | open(IN, "qstat |") || die "$! pipe to qstat";
 8 | my $job;     # $job->{row}->{column} = number of lines seen
 9 | my %type;    # every column value seen
10 | while (my $line = <IN>) {
11 | 	chomp($line);
12 | 	# skip the header line and the dashed separator line
13 | 	next if ($line =~ /^job-ID/ || $line =~ /^\-/);
14 | 	my @fields = split /\s+/, $line;
15 | 	my ($row, $col) = @fields[2, 4];
16 | 	$job->{$row}->{$col}++;
17 | 	$type{$col}=1;
18 | }
19 | 
20 | # the columns, sorted without regard to case
21 | my @t = sort {lc($a) cmp lc($b)} keys %type;
22 | print join("\t", "Job       ", @t), "\n";
23 | 
24 | # one output line per row; combinations that were never seen are shown as 0
25 | for my $row (sort {lc($a) cmp lc($b)} keys %{$job}) {
26 | 	my @counts = map { $job->{$row}->{$_} ? $job->{$row}->{$_} : 0 } @t;
27 | 	print join("\t", $row, @counts), "\n";
28 | }


--------------------------------------------------------------------------------
/bin/sort_fasta_by_len.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | 
 3 | =pod
 4 | 
 5 | Just sort a fasta file by the length of the sequences.
 6 | 
 7 | Default is longest -> shortest.
 8 | 
 9 | =cut
10 | 
11 | use strict;
12 | use Getopt::Std;
13 | use Data::Dumper;
14 | use Rob;
15 | my $rob = new Rob;
16 | 
17 | my %opts;
18 | getopts('f:vr', \%opts);
19 | if (!$opts{f}) {
20 | 	die <<EOF;
21 | 	$0
22 | 	-f fasta file to parse (required)
23 | 	-r sort shortest to longest (default is longest first)
24 | 	-v verbose output
25 | EOF
26 | }
27 | 
28 | # read_fasta gives back a hash reference that is indexed by sequence ID below
29 | my $fa = $rob->read_fasta($opts{f});
30 | 
31 | # pick the comparison once: ascending length with -r, descending without it
32 | my $by_length = $opts{r}
33 | 	? sub {length($fa->{$a}) <=> length($fa->{$b})}
34 | 	: sub {length($fa->{$b}) <=> length($fa->{$a})};
35 | 
36 | foreach my $id (sort $by_length keys %$fa) {
37 | 	print ">$id\n", $fa->{$id}, "\n";
38 | }


--------------------------------------------------------------------------------
/bin/sort_fasta_by_len_lengths_only.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | 
 3 | =pod
 4 | 
 5 | Just sort a fasta file by the length of the sequences and print out the sequence lengths
 6 | 
 7 | Default is longest -> shortest.
 8 | 
 9 | =cut
10 | 
11 | use strict;
12 | use Getopt::Std;
13 | use Data::Dumper;
14 | use Rob;
15 | my $rob = new Rob;
16 | 
17 | my %opts;
18 | getopts('f:vr', \%opts);
19 | if (!$opts{f}) {
20 | 	die <<EOF;
21 | 	$0
22 | 	-f fasta file to parse (required)
23 | 	-r sort shortest to longest (default is longest first)
24 | 	-v verbose output
25 | EOF
26 | }
27 | 
28 | # read_fasta gives back a hash reference that is indexed by sequence ID below
29 | my $fa = $rob->read_fasta($opts{f});
30 | 
31 | # pick the comparison once: ascending length with -r, descending without it
32 | my $by_length = $opts{r}
33 | 	? sub {length($fa->{$a}) <=> length($fa->{$b})}
34 | 	: sub {length($fa->{$b}) <=> length($fa->{$a})};
35 | 
36 | # one line per sequence: its length, a tab, then its ID
37 | foreach my $id (sort $by_length keys %$fa) {
38 | 	print length($fa->{$id}), "\t$id\n";
39 | }


--------------------------------------------------------------------------------
/bin/stream_fasta.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Stream a fasta file and print it out
 3 | """
 4 | 
 5 | import os
 6 | import sys
 7 | import argparse
 8 | from roblib import sequences
 9 | if __name__ == '__main__':
10 |     parser = argparse.ArgumentParser(description="stream the contents of a fasta file")
11 |     parser.add_argument('-f', help='file to stream', required=True)
12 |     args = parser.parse_args()
13 | 
14 |     for (seqid, seq) in sequences.stream_fasta(args.f):
15 |         print("{}\t{}".format(seqid, seq))


--------------------------------------------------------------------------------
/bin/test.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Test stuff!
 3 | """
 4 | 
 5 | import os
 6 | import sys
 7 | 
 8 | import argparse
 9 | import scipy
10 | import scipy.cluster.hierarchy as sch
11 | 
12 | import dateutil.parser
13 | 
14 | d = 'Thu Feb 11 16:39:56 -0800 2016'
15 | z = dateutil.parser.parse(d)
16 | print(z)
17 | 
18 | 
19 | 
20 | 
21 | sys.exit(0)
22 | 
23 | 
24 | X = scipy.randn(100, 2)  # 100 2-dimensional observations
25 | print(X)
26 | 
27 | d = sch.distance.pdist(X)
28 | print(len(d))
29 | 


--------------------------------------------------------------------------------
/bin/transpose.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | """
 4 | Read a tsv file and transpose it. 
 5 | """
 6 | 
 7 | import os
 8 | import sys
 9 | import argparse
10 | import pandas as pd
11 | from roblib import bcolors
12 | __author__ = 'Rob Edwards'
13 | 
14 | 
15 | 
16 | 
17 | 
18 | if __name__ == "__main__":
19 |     parser = argparse.ArgumentParser(description=' ')
20 |     parser.add_argument('-i', '--input', help='input file', required=True)
21 |     parser.add_argument('-o', '--output', help='output file', required=True)
22 |     parser.add_argument('-s', '--sep', help='separator (default=tab)', default="\t")
23 |     parser.add_argument('-v', help='verbose output', action='store_true')
24 |     args = parser.parse_args()
25 | 
26 |     if args.v:
27 |         print(f"{bcolors.GREEN}Reading {args.input}{bcolors.ENDC}")
28 |     df = pd.read_csv(args.input, sep=args.sep)
29 |     dft = df.T
30 |     if args.v:
31 |         print(f"{bcolors.GREEN}Writing {args.output}{bcolors.ENDC}")
32 |     dft.to_csv(args.output, sep=args.sep)
33 |     
34 | 
35 | 
36 | 
37 | 


--------------------------------------------------------------------------------
/bin/update_blastdb.sh:
--------------------------------------------------------------------------------
 1 | DATE=$(date +%Y%m%d)
 2 | cd /home2/db/blast
 3 | mkdir nr_$DATE
 4 | cd nr_$DATE
 5 | ncftpget ftp://ftp.ncbi.nlm.nih.gov/blast/db/nr*
 6 | cat *md5 > all.md5
 7 | md5sum -c all.md5
 8 | for t in *.tar.gz; do echo $t; tar xf $t; done
 9 | cd /home2/db/blast
10 | rm -f nr
11 | ln -s nr_$DATE nr
12 | 
13 | mkdir nt_$DATE
14 | cd nt_$DATE
15 | ncftpget ftp://ftp.ncbi.nlm.nih.gov/blast/db/nt*
16 | cat *md5 > all.md5
17 | md5sum -c all.md5
18 | for t in *.tar.gz; do echo $t; tar xf $t; done
19 | cd /home2/db/blast
20 | rm -f nt
21 | ln -s nt_$DATE nt
22 | 
23 | 
24 | 


--------------------------------------------------------------------------------
/cartopy/example.py:
--------------------------------------------------------------------------------
 1 | import matplotlib.pyplot as plt
 2 | 
 3 | import cartopy.crs as ccrs
 4 | 
 5 | """
 6 | This example is from the cartopy website, and is mostly to make sure things are installed and working.
 7 | """
 8 | 
 9 | 
10 | def main():
11 |     ax = plt.axes(projection=ccrs.Robinson())
12 | 
13 |     # make the map global rather than have it zoom in to
14 |     # the extents of any plotted data
15 |     ax.set_global()
16 | 
17 |     ax.stock_img()
18 |     ax.coastlines()
19 | 
20 |     # san diego
21 |     sdlat, sdlon = 32.7157, -117.1611
22 |     # brisbane
23 |     brislat, brislon = -27.4698, 153.0251
24 | 
25 | 
26 |     # NOTE: longitude before latitude!!
27 |     plt.plot([sdlon, brislon], [sdlat, brislat], color='blue', linewidth=2,  transform=ccrs.Geodetic())
28 | 
29 | 
30 | 
31 |     plt.show()
32 | 
33 | 
34 | if __name__ == '__main__':
35 |     main()
36 | 


--------------------------------------------------------------------------------
/cluster/submit2cluster_edwards:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl -w
 2 | 
 3 | # submit a job using my new method
 4 | 
 5 | use strict;
 6 | use lib '/home/redwards/perl/lib/perl5/site_perl/5.8.8/';
 7 | use Schedule::SGE;
 8 | 
 9 | my $usage=<<EOF;
10 | 
11 | $0 <options> <command>
12 | 
13 | OPTIONS: 
14 |  -N name    (default=first word of command)
15 |  -P project (default=redwards)
16 | EOF
17 | 
18 | # Defaults. $project can be set with -P but is not read anywhere below.
19 | # NOTE(review): presumably it was meant to be passed on to Schedule::SGE -- confirm.
20 | my ($name, $project, $command)=('', 'redwards', '');
21 | while (@ARGV) {
22 |  my $test=shift @ARGV;
23 |  if ($test eq "-N") {$name=shift @ARGV}
24 |  elsif ($test eq "-P") {$project=shift @ARGV}
25 |  else {$command .= " ". $test}
26 | }
27 | 
28 | die $usage unless $command;
29 | 
30 | # Every word is appended with a space in front of it, so $command always
31 | # begins with a space. Skip that space when taking the first word as the
32 | # default job name, otherwise the pattern can not match and the name is undefined.
33 | unless ($name) {
34 |  if ($command =~ m/^\s*(\S+)/) {$name=$1}
35 | }
36 | 
37 | 
38 | my $sge=Schedule::SGE->new(
39 |  -executable 	=> {qsub=>'/usr/local/bin/qsub', qstat=>'/usr/local/bin/qstat'},
40 |  -name		=> $name,
41 |  -verbose   	=> 0,
42 |  -notify 	=> 1,
43 |  -mailto	=> 'rob@salmonella.org',
44 | );
45 | 
46 | $sge->command($command);
47 | 
48 | my $pid=$sge->execute();
49 | exit(0);


--------------------------------------------------------------------------------
/cpp/fastq/include/stream_fastq.h:
--------------------------------------------------------------------------------
 1 | #ifndef STREAM_FASTQ_H
 2 | #define STREAM_FASTQ_H
 3 | 
 4 | 
 5 | // For now this class declares only a constructor and a virtual destructor;
 6 | // the protected and private sections are empty.
 7 | class stream_fastq
 8 | {
 9 | public:
10 |     stream_fastq();
11 |     virtual ~stream_fastq();
12 | 
13 | protected:
14 | 
15 | private:
16 | };
17 | 
18 | #endif // STREAM_FASTQ_H


--------------------------------------------------------------------------------
/cpp/fastq/main.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | 
 3 | using namespace std;
 4 | 
 5 | // Print a greeting on standard output and exit with status 0.
 6 | int main()
 7 | {
 8 |     std::cout << "Hello world!" << std::endl;
 9 |     return 0;
10 | }


--------------------------------------------------------------------------------
/cpp/fastq/src/stream_fastq.cpp:
--------------------------------------------------------------------------------
 1 | #include "stream_fastq.h"
 2 | 
 3 | // Constructor: there is nothing to set up yet.
 4 | stream_fastq::stream_fastq() {}
 5 | 
 6 | // Destructor: there is nothing to release yet.
 7 | stream_fastq::~stream_fastq() {}


--------------------------------------------------------------------------------
/crAssphage/collapse_bam_variants.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Collapse variants from a bamfile and try and make as few variants as possible
 3 | """
 4 | 
 5 | import os
 6 | import sys
 7 | import argparse
 8 | from roblib import read_fasta
 9 | 
10 | if __name__ == '__main__':
11 |     parser = argparse.ArgumentParser(description="Collapse all variants from a bam file and make as few options as possible")
12 |     parser.add_argument('-b', help='bam file', required=True)
13 |     parser.add_argument('-r', help='reference fasta sequence', required=True)
14 |     parser.add_argument('-s', help='start of the region to look at (default = whole sequence)', default=0, type=int)
15 |     parser.add_argument('-e', help='end of sequence to look at (default = whole sequence)', type=int)
16 |     parser.add_argument('-v', help='verbose output', action="store_true")
17 |     args = parser.parse_args()
18 | 
19 |     fa = read_fasta(args.r)
20 | 
21 |     if not args.e:
22 |         args.e = max([len(fa[f]) for f in fa])
23 | 
24 | 
25 | 
26 | 


--------------------------------------------------------------------------------
/crAssphage/phylip2clustal.py:
--------------------------------------------------------------------------------
 1 | """
 2 | A simple BioPython converter to move from phylip to clustal formats for the alignments
 3 | """
 4 | 
 5 | import os
 6 | import sys
 7 | 
 8 | import argparse
 9 | from Bio import SeqIO
10 | 
11 | if __name__ == '__main__':
12 |     parser = argparse.ArgumentParser(description="Convert an alignment file from phylip format to clustal format")
13 |     parser.add_argument('-i', help='Alignment input file', required=True)
14 |     parser.add_argument('-o', help='Output file name (optional: default = input file with clustal appended)')
15 |     args = parser.parse_args()
16 | 
17 |     outfile = args.i + ".clustal"
18 |     if args.o:
19 |         outfile = args.o
20 | 
21 |     records=SeqIO.parse(args.i, 'phylip')
22 | 
23 |     with open(outfile, 'w') as out:
24 |         SeqIO.write(records, out, 'clustal')


--------------------------------------------------------------------------------
/crAssphage/transpose_and_join.pl.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Transpose and join a whole lot of files created by coverage_depth.py
 3 | """
 4 | 
 5 | import os, sys
 6 | import argparse
 7 | 
 8 | if __name__ == '__main__':
 9 |     parser = argparse.ArgumentParser(description="Read a directory of files and join them as a single file")
10 |     parser.add_argument('-d', help='directory of output files from coverage_depth.py', required=True)
11 |     args = parser.parse_args()
12 | 
13 |     for f in os.listdir(args.d):
14 |         data = []
15 |         with open(os.path.join(args.d, f), 'r') as fin:
16 |             for l in fin:
17 |                 p=l.strip().split("\t")
18 |                 data.append(p[1])
19 |             sumd = sum(map(int, data[1:]))
20 |             data[1:0] = [sumd]
21 |         print("\t".join(map(str, data)))
22 | 
23 | 
24 | 
25 | 


--------------------------------------------------------------------------------
/deconvolute_minion_reads/README.md:
--------------------------------------------------------------------------------
 1 | # Deconvolute Minion Reads
 2 | 
 3 | Separate minion reads based on a timestamp or some other characteristic of the file.
 4 | 
 5 | 
 6 | # Why we do this!
 7 | 
 8 | When we do MinION sequencing of some (but not all) samples, we often cheat and load multiple samples on the same chip. Usually, we start the run with the first sample, let it run for a couple of hours, and then add the next sample. We can then use the timestamps in the fastq file to separate the reads that belong to each sample.
 9 | 
10 | 
11 | 
12 | 
13 | 


--------------------------------------------------------------------------------
/deconvolute_minion_reads/fastq/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | __author__ = 'Rob Edwards'
3 | 
4 | from .sequences import stream_fastq
5 | 
6 | __all__ = ['stream_fastq']
7 | 


--------------------------------------------------------------------------------
/fasta/extract_sequence.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Extract a sequence from a fasta file
 3 | """
 4 | 
 5 | import os
 6 | import sys
 7 | import argparse
 8 | from roblib import stream_fasta
 9 | 
10 | 
11 | 
12 | if __name__ == '__main__':
13 |     parser = argparse.ArgumentParser(description=" ")
14 |     parser.add_argument('-f', help='fasta file', required=True)
15 |     parser.add_argument('-i', help='sequence id, (multiple allowed)', nargs='+')
16 |     parser.add_argument('-v', help='verbose output', action='store_true')
17 |     args = parser.parse_args()
18 | 
19 |     for seqid, seq in stream_fasta(args.f):
20 |         if seqid in args.i:
21 |             print(f">{seqid}\n{seq}\n")
22 | 


--------------------------------------------------------------------------------
/fasta/length_filter.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Filter a fasta file on length
 3 | """
 4 | 
 5 | import os
 6 | import sys
 7 | import argparse
 8 | from roblib import stream_fasta
 9 | 
10 | def length_filter(f, l, verbose=False):
11 |     """
12 |     Filter a fasta file based on the minimum length, l
13 |     :param f: fasta file
14 |     :param l: minimum sequene length
15 |     :param verbose: more output
16 |     :return:
17 |     """
18 | 
19 |     for seqid, seq in stream_fasta(f, True):
20 |         if len(seq) < l:
21 |             continue
22 |         print(">{}\n{}".format(seqid, seq))
23 | 
24 | 
25 | if __name__ == '__main__':
26 |     parser = argparse.ArgumentParser(description="Filter a file based on length")
27 |     parser.add_argument('-f', help='the fasta file to filter', required=True)
28 |     parser.add_argument('-l', help='minimum length (default=1000)', default=1000, type=int)
29 |     parser.add_argument('-v', help='verbose output', action="store_true")
30 |     args = parser.parse_args()
31 | 
32 |     length_filter(args.f, args.l, args.v)


--------------------------------------------------------------------------------
/fasta/lengths.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Print the IDs and lengths of sequences in a fasta file
 3 | """
 4 | 
 5 | import os
 6 | import sys
 7 | import argparse
 8 | from roblib import stream_fasta
 9 | 
10 | parser = argparse.ArgumentParser(description="Print the lengths of sequences in a fasta file")
11 | parser.add_argument('-f', help='fasta file', required=True)
12 | parser.add_argument('-w', help='whole sequence ID. Default is to use ID upto whitespace', action="store_true", default=False)
13 | parser.add_argument('-v', help='verbose output', action="store_true")
14 | args = parser.parse_args()
15 | 
16 | for seqid, seq in stream_fasta(args.f, args.w):
17 |     print("{}\t{}".format(seqid, len(seq)))
18 | 


--------------------------------------------------------------------------------
/fasta/test.fasta:
--------------------------------------------------------------------------------
 1 | >seq1
 2 | AAAAAA
 3 | >seq2
 4 | TTTTTT
 5 | >seq3
 6 | CCCCCC
 7 | >seq4
 8 | GGGGGG
 9 | >seq5
10 | AAATTT
11 | >seq6
12 | TTTCCC
13 | >seq7
14 | CCCGGG
15 | >seq8
16 | GGGAAA
17 | 


--------------------------------------------------------------------------------
/fastq/README.md:
--------------------------------------------------------------------------------
1 | # Scripts for manipulating fastq files
2 | 
3 | 


--------------------------------------------------------------------------------
/fastq/average_quality.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Print the average quality score of a set of sequences
 3 | """
 4 | 
 5 | import os
 6 | import sys
 7 | import argparse
 8 | 
 9 | from roblib import stream_fastq, qual_to_numbers
10 | 
11 | __author__ = 'Rob Edwards'
12 | __copyright__ = 'Copyright 2020, Rob Edwards'
13 | __credits__ = ['Rob Edwards']
14 | __license__ = 'MIT'
15 | __maintainer__ = 'Rob Edwards'
16 | __email__ = 'raedwards@gmail.com'
17 | 
18 | if __name__ == '__main__':
19 |     parser = argparse.ArgumentParser(description=" ")
20 |     parser.add_argument('-f', help='file', required=True)
21 |     parser.add_argument('-o', help='output file')
22 |     parser.add_argument('-v', help='verbose output', action='store_true')
23 |     args = parser.parse_args()
24 | 
25 |     print("SeqID\tLength\tAverage Qual")
26 |     for sid, seqid, seq, qual in stream_fastq(args.f):
27 |         q2n =  list(qual_to_numbers(qual))
28 |         av = sum(q2n)/len(q2n)
29 |         print(f"{sid}\t{len(seq)}\t{av}")


--------------------------------------------------------------------------------
/fastq/filter_fastq_length.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <zlib.h>
 4 | 
 5 | #include "kseq.h"
 6 | 
 7 | KSEQ_INIT(gzFile, gzread);
 8 | 
 9 | // compile with:
10 | // gcc -I../include -o filter_fastq_length ./filter_fastq_length.c -lz
11 | 
12 | 
13 | /*
14 |  * Print the reads of a fastq file that are at least <minimum length> long
15 |  * to standard output, and report on standard error how many reads were
16 |  * kept and how many were dropped.
17 |  *
18 |  * argv[1]: fastq file, opened with gzopen
19 |  * argv[2]: minimum length; reads of exactly this length are kept
20 |  *
21 |  * Returns 0 on success and exits with 1 if an argument is missing or the
22 |  * file can not be opened.
23 |  */
24 | int main(int argc, char* argv[]) {
25 | 	/* both arguments are needed: argv[2] is read below */
26 | 	if (argc < 3) {
27 | 		fprintf(stderr,  "%s <fastq file> <minimum length>\n", argv[0]);
28 | 		exit(1);
29 | 	}
30 | 
31 | 	/* a negative minimum keeps everything, just like 0 */
32 | 	int minlen = atoi(argv[2]);
33 | 	if (minlen < 0)
34 | 		minlen = 0;
35 | 
36 | 	gzFile fp = gzopen(argv[1], "r");
37 | 	if (fp == NULL) {
38 | 		fprintf(stderr, "Can't open %s\n", argv[1]);
39 | 		exit(1);
40 | 	}
41 | 
42 | 	kseq_t *seq = kseq_init(fp);
43 | 	int kept = 0;
44 | 	int dropped = 0;
45 | 	// fprintf(stderr, "Filtering %s. Reads shorter than %d will be ignored\n", argv[1], minlen);
46 | 	while (kseq_read(seq) >= 0) {
47 | 		if (seq->seq.l >= (size_t) minlen) {
48 | 			printf("@%s", seq->name.s);
49 | 			/*
50 | 			 * Only print a comment that belongs to this read. When the
51 | 			 * length is 0 the buffer is either unset or still holds the
52 | 			 * text of an earlier read.
53 | 			 */
54 | 			if (seq->comment.l > 0)
55 | 				printf(" %s", seq->comment.s);
56 | 			/* same precaution for a record without quality values */
57 | 			printf("\n%s\n+\n%s\n", seq->seq.s, seq->qual.l > 0 ? seq->qual.s : "");
58 | 			kept++;
59 | 		} else {
60 | 			dropped++;
61 | 		}
62 | 	}
63 | 	fprintf(stderr, "Kept: %d  Dropped: %d\n", kept, dropped);
64 | 
65 | 	kseq_destroy(seq);
66 | 	gzclose(fp);
67 | 	return 0;
68 | }


--------------------------------------------------------------------------------
/fastq/test.fastq.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:b526c90ab96ca8ec6bacfbfec56bc760991c7e0c38ed055f8776cccebd647df4
3 | size 78966
4 | 


--------------------------------------------------------------------------------
/fastq/trim_primers.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Please note: This is a practice for code that will end up in primer-trimming github (that we should rename).
 3 | 
 4 | You should use that version
 5 | """
 6 | 
 7 | import os
 8 | import sys
 9 | import argparse
10 | import PyPrinseq
11 | 
12 | __author__ = 'Rob Edwards'
13 | __copyright__ = 'Copyright 2020, Rob Edwards'
14 | __credits__ = ['Rob Edwards']
15 | __license__ = 'MIT'
16 | __maintainer__ = 'Rob Edwards'
17 | __email__ = 'raedwards@gmail.com'
18 | 
19 | if __name__ == '__main__':
20 |     parser = argparse.ArgumentParser(description=" ")
21 |     parser.add_argument('-f', help='fastq file', required=True)
22 |     parser.add_argument('-l', help='left primers file', required=True)
23 |     args = parser.parse_args()
24 | 
25 |     PyPrinseq.primertrimming(args.f, args.l, None)
26 | 


--------------------------------------------------------------------------------
/h5py/test_data.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Create an hdf5 test data set for turbocor
 3 | 
 4 | 
 5 | """
 6 | 
 7 | import os
 8 | import sys
 9 | import argparse
10 | import h5py
11 | import matplotlib.pyplot as plt
12 | import numpy as np
13 | import seaborn as sns
14 | 
15 | # this is taken from SO: https://stackoverflow.com/questions/18683821/generating-random-correlated-x-and-y-points-using-numpy
16 | 
17 | 
18 | 
19 | xx = np.array([-0.51, 51.2])
20 | yy = np.array([0.33, 51.6])
21 | means = [xx.mean(), yy.mean()]
22 | stds = [xx.std() / 3, yy.std() / 3]
23 | corr = 0.9         # correlation
24 | covs = [[stds[0]**2          , stds[0]*stds[1]*corr],
25 |         [stds[0]*stds[1]*corr,           stds[1]**2]]
26 | 
27 | m = np.random.multivariate_normal(means, covs, 1000).T
28 | with h5py.File('correlated.h5', 'a') as f:
29 |     if 'data' in f:
30 |         d = f['data']
31 |         d.resize(d.shape[0]+2, axis=0)
32 |         d[-2:] = m
33 |     else:
34 |         f.create_dataset("data", data=m, maxshape=(None,1000), chunks=True)
35 | 
36 | # sns.scatterplot(m[0], m[1])
37 | # plt.show()
38 | 
39 | 


--------------------------------------------------------------------------------
/h5py/tmp.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/h5py/tmp.h5


--------------------------------------------------------------------------------
/isolation_sources/README.md:
--------------------------------------------------------------------------------
1 | # For data in ~/Dropbox/Genotype-phenotype/Isolation Sources
2 | 
3 | 


--------------------------------------------------------------------------------
/isolation_sources/genera-environment.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | import argparse
 4 | 
 5 | 
 6 | def genera(envF):
 7 |     """
 8 |     Read genera per environment
 9 |     :param envF:
10 |     :return:
11 |     """
12 | 
13 |     with open(envF, 'r') as f:
14 | 
15 | 
16 | 
17 | 
18 | 
19 | 
20 | 
21 | if __name__ == '__main__':
22 |     parser = argparse.ArgumentParser(description="Calculate shared genera per env")
23 |     parser.add_argument('-f', help='file of environments and genera')
24 |     args = parser.parse_args()


--------------------------------------------------------------------------------
/jupyter/Emma_subsystems/EagleRay_level1.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:2ce48ce7c25fceb521de89fa24fe95088c1a6caa8faa4c9c001b35a22d9e66e1
3 | size 10782
4 | 


--------------------------------------------------------------------------------
/jupyter/Emma_subsystems/EagleRay_level1_norm_all.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:9502beb78f6f347bb55375bd179cd4b35d3068752e4f434826d2c25761166ed7
3 | size 10791
4 | 


--------------------------------------------------------------------------------
/jupyter/Emma_subsystems/EagleRay_level1_norm_ss.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:bc056f5111cf33f4aeb49f9a17e289bbf0d39fd378ef2356c43bac975710808d
3 | size 10814
4 | 


--------------------------------------------------------------------------------
/jupyter/Emma_subsystems/NorfolkWater_level1.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:b3e768aa89233898b08b131c9eda6ce955cc403130f0fdb91b3f7772832b5b1a
3 | size 20425
4 | 


--------------------------------------------------------------------------------
/jupyter/Emma_subsystems/NorfolkWater_level1_norm_all.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:e8b3d3133227c0714e97fb1f147d714ea97dc660563b583ad349069ef4e44146
3 | size 20434
4 | 


--------------------------------------------------------------------------------
/jupyter/Emma_subsystems/NorfolkWater_level1_norm_ss.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:932f8f94d2401c32380cc722c5141f6b647c5843ed95d57cdad87ef808810247
3 | size 20586
4 | 


--------------------------------------------------------------------------------
/jupyter/Emma_subsystems/PortJackson_level1.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:92b8639efee2e5f7c8584b5d8420dfcac6708bf429607bb68acf123f4a4eaf83
3 | size 11498
4 | 


--------------------------------------------------------------------------------
/jupyter/Emma_subsystems/PortJackson_level1_norm_all.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:1ebd7e6f6b34282c612807173699148329ca5749a2c637e1bb0cdefe9a3a0b26
3 | size 11507
4 | 


--------------------------------------------------------------------------------
/jupyter/Emma_subsystems/PortJackson_level1_norm_ss.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:cde6a00307bcf3e0c9d7734131fa430d93baf416a5c55206d982f20e25a60dd8
3 | size 11512
4 | 


--------------------------------------------------------------------------------
/jupyter/Emma_subsystems/TigerSharks_level1.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:3d9815154bab584826ca4a99e6a60fc251995228f5effc1cbaf511d542831758
3 | size 20971
4 | 


--------------------------------------------------------------------------------
/jupyter/Emma_subsystems/TigerSharks_level1_norm_all.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:5edb1d45808bca5f177e2fd2f3bb600a6085af670e3de5ce883df784bdd621b4
3 | size 20980
4 | 


--------------------------------------------------------------------------------
/jupyter/Emma_subsystems/TigerSharks_level1_norm_ss.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:957c447aa3cecf547cc6010f477c01a9eb5ef6b5168f97bb8402e88512ab5633
3 | size 21068
4 | 


--------------------------------------------------------------------------------
/jupyter/Emma_subsystems/eagle_ray_types/level1_norm_all.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:15cf6cf7c75c7eafdebedcced60e162c7ccbd4d39f9a107aee7fb27f3b840e31
3 | size 10724
4 | 


--------------------------------------------------------------------------------
/jupyter/Emma_subsystems/eagle_ray_types/level1_norm_ss.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:cce4747eb366ab9b852d231b1f696a8e2a0b8c3545a5929f2e515fd6a6b8dd4b
3 | size 10770
4 | 


--------------------------------------------------------------------------------
/jupyter/Emma_subsystems/eagle_ray_types/level1_raw.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:0fa890683819d6b9e91f72e2ba1e05cf4a9bb7b1a0f5011bec5aafb453251600
3 | size 5413
4 | 


--------------------------------------------------------------------------------
/jupyter/Emma_subsystems/eagle_ray_types/level2_norm_all.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:b4a3f5d5fa237a3fec88e848110316ca78d81985d26377da71922a208127e8fa
3 | size 43265
4 | 


--------------------------------------------------------------------------------
/jupyter/Emma_subsystems/eagle_ray_types/level2_norm_ss.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:045384a1fd95a31501ee32e651c2f46431b57cd390e2f718f8d1e98140a96c6a
3 | size 43436
4 | 


--------------------------------------------------------------------------------
/jupyter/Emma_subsystems/eagle_ray_types/level2_raw.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:ab8ffe777679dbee1512b3c5e3275d89d05cae5feb9136bf7ffd384d75e4e11f
3 | size 16711
4 | 


--------------------------------------------------------------------------------
/jupyter/Emma_subsystems/eagle_ray_types/ss_typed_norm_all.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:02273ecfd057264f4f37d527bc640cb0034f75793bc344fce197c2ef99e41a15
3 | size 174172
4 | 


--------------------------------------------------------------------------------
/jupyter/Emma_subsystems/eagle_ray_types/ss_typed_norm_ss.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:b09ccb52f25f7ed0e42b029e67b1d7f547c7fc369c22856b016d8f67cd86ced9
3 | size 175390
4 | 


--------------------------------------------------------------------------------
/jupyter/Emma_subsystems/eagle_ray_types/ss_typed_raw.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:3562afeeab0169cbca687ffcdd7c8dc5a35df75fae3dc7dc9b08adf461f96a7f
3 | size 60431
4 | 


--------------------------------------------------------------------------------
/jupyter/bacteroides_prophages.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/bacteroides_prophages.png


--------------------------------------------------------------------------------
/jupyter/data/get_headers.pl:
--------------------------------------------------------------------------------
 1 | use strict;
 2 | use Getopt::Std;
 3 | use Data::Dumper;
 4 | use Rob;
 5 | 
 6 | # Read the header (first) line of a tab-separated file and print each column
 7 | # name followed by a 1-based counter, one "name<TAB>counter" pair per line.
 8 | # Names matching /chip/ are numbered first, then all remaining non-empty
 9 | # names; each group is sorted by string comparison. The first column is
10 | # removed from the list and never printed.
11 | 
12 | my $file = shift || die "file to process";
13 | # Three-argument open with a lexical handle, so a name that starts with '>'
14 | # or ends with '|' is opened as a plain file, never as a mode or a command.
15 | open(my $in, '<', $file) || die "$! : $file";
16 | my $header = <$in>;
17 | close($in);
18 | $header = '' unless defined $header;    # empty file: no columns, no output
19 | $header =~ s/\r?\n\z//;                 # accept both LF and CRLF line endings
20 | 
21 | my @cols = split /\t/, $header;
22 | my $idx = shift @cols;                  # first column; removed and not printed
23 | 
24 | my @first;
25 | my @second;
26 | foreach my $name (@cols) {
27 | 	if ($name =~ m/chip/) {
28 | 		push @first, $name;
29 | 	}
30 | 	elsif (length $name) {              # skip empty names, but keep one called "0"
31 | 		push @second, $name;
32 | 	}
33 | }
34 | 
35 | # The counter runs continuously across both groups.
36 | my $c = 0;
37 | foreach my $l ((sort {$a cmp $b} @first), (sort {$a cmp $b} @second)) {
38 | 	$c++;
39 | 	print "$l\t$c\n";
40 | }
41 | 
42 | 
43 | 


--------------------------------------------------------------------------------
/jupyter/lactobacillus_prophages.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/lactobacillus_prophages.png


--------------------------------------------------------------------------------
/jupyter/sarah_data/mmseqs_taxonomy/all_levels.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:58856f191c98072e280bee0acbfe5fef927d33c301bb7b9b32368156ff2a6e7b
3 | size 1788929
4 | 


--------------------------------------------------------------------------------
/jupyter/sarah_data/mmseqs_taxonomy/all_levels_renamed.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:268178f7756f2135ed31f6a51f5a188d65cb47c1e449167afc531fd63976f58a
3 | size 1788864
4 | 


--------------------------------------------------------------------------------
/jupyter/sarah_data/mmseqs_taxonomy/animation/img_10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/mmseqs_taxonomy/animation/img_10.png


--------------------------------------------------------------------------------
/jupyter/sarah_data/mmseqs_taxonomy/animation/img_11.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/mmseqs_taxonomy/animation/img_11.png


--------------------------------------------------------------------------------
/jupyter/sarah_data/mmseqs_taxonomy/animation/img_12.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/mmseqs_taxonomy/animation/img_12.png


--------------------------------------------------------------------------------
/jupyter/sarah_data/mmseqs_taxonomy/animation/img_13.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/mmseqs_taxonomy/animation/img_13.png


--------------------------------------------------------------------------------
/jupyter/sarah_data/mmseqs_taxonomy/animation/img_14.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/mmseqs_taxonomy/animation/img_14.png


--------------------------------------------------------------------------------
/jupyter/sarah_data/mmseqs_taxonomy/animation/img_15.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/mmseqs_taxonomy/animation/img_15.png


--------------------------------------------------------------------------------
/jupyter/sarah_data/mmseqs_taxonomy/animation/img_16.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/mmseqs_taxonomy/animation/img_16.png


--------------------------------------------------------------------------------
/jupyter/sarah_data/mmseqs_taxonomy/animation/img_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/mmseqs_taxonomy/animation/img_2.png


--------------------------------------------------------------------------------
/jupyter/sarah_data/mmseqs_taxonomy/animation/img_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/mmseqs_taxonomy/animation/img_3.png


--------------------------------------------------------------------------------
/jupyter/sarah_data/mmseqs_taxonomy/animation/img_4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/mmseqs_taxonomy/animation/img_4.png


--------------------------------------------------------------------------------
/jupyter/sarah_data/mmseqs_taxonomy/animation/img_5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/mmseqs_taxonomy/animation/img_5.png


--------------------------------------------------------------------------------
/jupyter/sarah_data/mmseqs_taxonomy/animation/img_6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/mmseqs_taxonomy/animation/img_6.png


--------------------------------------------------------------------------------
/jupyter/sarah_data/mmseqs_taxonomy/animation/img_7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/mmseqs_taxonomy/animation/img_7.png


--------------------------------------------------------------------------------
/jupyter/sarah_data/mmseqs_taxonomy/animation/img_8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/mmseqs_taxonomy/animation/img_8.png


--------------------------------------------------------------------------------
/jupyter/sarah_data/mmseqs_taxonomy/animation/img_9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/mmseqs_taxonomy/animation/img_9.png


--------------------------------------------------------------------------------
/jupyter/sarah_data/mmseqs_taxonomy/animation/taxonomy.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/mmseqs_taxonomy/animation/taxonomy.gif


--------------------------------------------------------------------------------
/jupyter/sarah_data/mmseqs_taxonomy/class.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:5aef8a6a4af7ff80dcc84915274bcb7a6f71e9005c659d79966f278127479774
3 | size 15934
4 | 


--------------------------------------------------------------------------------
/jupyter/sarah_data/mmseqs_taxonomy/class_renamed.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:8058cabad4699fadd0747fd66fd0eb77087c4afc030a7613e20c8efc45e6b5de
3 | size 15902
4 | 


--------------------------------------------------------------------------------
/jupyter/sarah_data/mmseqs_taxonomy/family.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:b50225e7f78de7055fa908373a631b75e9f5ff465ca93f47ce7e60489718e92e
3 | size 62056
4 | 


--------------------------------------------------------------------------------
/jupyter/sarah_data/mmseqs_taxonomy/family_renamed.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:fabe353af584179e6a7a2973f796991257a75287940da39c62b5957e2af4d368
3 | size 62017
4 | 


--------------------------------------------------------------------------------
/jupyter/sarah_data/mmseqs_taxonomy/genus.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:e57dcaf0a670ccbeba5ac4c8fcf0971ea4e897c34f5369f5b21ed2615543e7db
3 | size 171315
4 | 


--------------------------------------------------------------------------------
/jupyter/sarah_data/mmseqs_taxonomy/genus_renamed.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:825923ecbd31fef4514e0b030e1515b757b0db975d2f62ff53cb6fa568a6ca1c
3 | size 171283
4 | 


--------------------------------------------------------------------------------
/jupyter/sarah_data/mmseqs_taxonomy/order.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:2a91d16a4a12a245d005221c8004da7f2f09cf0f6b37d9396d5a7b72db6c864f
3 | size 32319
4 | 


--------------------------------------------------------------------------------
/jupyter/sarah_data/mmseqs_taxonomy/order_renamed.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:4a290a9420f5f9085790cdf23f78847ebd2a7182bdb9d176386670f1d7262500
3 | size 32283
4 | 


--------------------------------------------------------------------------------
/jupyter/sarah_data/mmseqs_taxonomy/pca_by_approach.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/mmseqs_taxonomy/pca_by_approach.png


--------------------------------------------------------------------------------
/jupyter/sarah_data/mmseqs_taxonomy/pca_by_filter.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/mmseqs_taxonomy/pca_by_filter.png


--------------------------------------------------------------------------------
/jupyter/sarah_data/mmseqs_taxonomy/pca_by_method.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/mmseqs_taxonomy/pca_by_method.png


--------------------------------------------------------------------------------
/jupyter/sarah_data/mmseqs_taxonomy/pca_by_replicate.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/mmseqs_taxonomy/pca_by_replicate.png


--------------------------------------------------------------------------------
/jupyter/sarah_data/mmseqs_taxonomy/pca_by_sample.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/mmseqs_taxonomy/pca_by_sample.png


--------------------------------------------------------------------------------
/jupyter/sarah_data/mmseqs_taxonomy/pca_combined_replicates.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/mmseqs_taxonomy/pca_combined_replicates.png


--------------------------------------------------------------------------------
/jupyter/sarah_data/mmseqs_taxonomy/phylum.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:0096fb49275e35c2ca5fb12de5ec5411980d16f2eb7c9aa01f768f510c34d3e6
3 | size 13142
4 | 


--------------------------------------------------------------------------------
/jupyter/sarah_data/mmseqs_taxonomy/phylum_renamed.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:29aaaf1f49cebb44a6540c46cd23f8a3c1574a3f46ad7514c1ee8c3cfd6d5ef0
3 | size 13118
4 | 


--------------------------------------------------------------------------------
/jupyter/sarah_data/mmseqs_taxonomy/species.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:ec0a283c3e57c47b6c3de2635f87b52ab99604bfeee0ac5ea60c34b05a46e43b
3 | size 920431
4 | 


--------------------------------------------------------------------------------
/jupyter/sarah_data/mmseqs_taxonomy/species_renamed.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:8004e4bbd9f8af70ba304b330156f60a2f9ab122e7aad405d4c91087954db821
3 | size 920367
4 | 


--------------------------------------------------------------------------------
/jupyter/sarah_data/mmseqs_taxonomy/superkingdom.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:07d848069a728613a45d9d3be4de3460b7c2f31a079606776421d179cbdd3e47
3 | size 857
4 | 


--------------------------------------------------------------------------------
/jupyter/sarah_data/mmseqs_taxonomy/superkingdom_renamed.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:eef8e2ca8e33e73ffec79d9a83468dda871f9bc77fba00d584cd5c8e7df6d629
3 | size 838
4 | 


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/README.md:
--------------------------------------------------------------------------------
 1 | # NORMALIZATIONS
 2 | 
 3 | Currently we perform three normalizations:
 4 | 
 5 | 1. \*\_raw.tsv
 6 | 
 7 | This is the non-normalised data, so just the raw counts. For each sequence, if it appears in one subsystem we increment that subsystem's count by 1, but if it occurs in more than one subsystem, we increment each of those counts by 1/n (1/2 for 2 subsystems, 1/3 for 3 subsystems, etc).
 8 | 
 9 | 2. \*\_norm_all.tsv
10 | 
11 | This data is normalised to _all_ reads, regardless of whether they are in a subsystem or not. This gives smaller numbers than normalising only to the reads that match subsystems.
12 | 
13 | 3. \*\_norm_ss.tsv
14 | 
15 | This data is normalised only to the number of reads that match to subsystems, so if there is a lot of other stuff we ignore it.
16 | 
17 | 
18 | 


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/all_norm_all.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:df1101333f95f325366b8807d1caaae0a0cc4a7cd71812b7296d10f8dc44cf7e
3 | size 230533
4 | 


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/all_norm_all_renamed.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:0f9d76e22baba6dfbc17b46a0c5bc9017d385f8bfb3ef3bd0f542e7f703f1e30
3 | size 230577
4 | 


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/all_norm_ss.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:d4e1595695e2a67ab48d69e4c69aa3ccf6b27a07a6b2c0374677bb9830c90fa4
3 | size 231419
4 | 


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/all_norm_ss_renamed.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:27df5af57de582b8acd880d39ae3363b5394dcded57d7271724cf8bf35ca2f00
3 | size 231463
4 | 


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/all_raw.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:e7d1ebf57b9e220840468ded0b1fbf85614371022269a0cfdbe7c7053a50389c
3 | size 83793
4 | 


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/all_raw_renamed.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:8c33f5f97ca06e83a40b7f21bdd915dcad602066168ff6c76c4b50c9b7ba8576
3 | size 83610
4 | 


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/animation/img_10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/subsystems/animation/img_10.png


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/animation/img_11.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/subsystems/animation/img_11.png


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/animation/img_12.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/subsystems/animation/img_12.png


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/animation/img_13.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/subsystems/animation/img_13.png


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/animation/img_14.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/subsystems/animation/img_14.png


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/animation/img_15.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/subsystems/animation/img_15.png


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/animation/img_16.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/subsystems/animation/img_16.png


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/animation/img_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/subsystems/animation/img_2.png


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/animation/img_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/subsystems/animation/img_3.png


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/animation/img_4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/subsystems/animation/img_4.png


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/animation/img_5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/subsystems/animation/img_5.png


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/animation/img_6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/subsystems/animation/img_6.png


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/animation/img_7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/subsystems/animation/img_7.png


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/animation/img_8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/subsystems/animation/img_8.png


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/animation/img_9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/subsystems/animation/img_9.png


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/animation/subsystems.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/subsystems/animation/subsystems.gif


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/class_norm_all.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:fb6e04772566d1276a3680e25f00ef74ecef9b11b4be9823b55ef3db175f77c7
3 | size 4546
4 | 


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/class_norm_all_renamed.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:ea4e2ac3cc4ba80f4ab2f961c9412249af8e913a3c5dc32446ec161966f0d0e3
3 | size 4578
4 | 


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/class_norm_ss.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:8ae2abf953ef5d2044cdabc4674c1f8f688baec58038a091a490e23bf605003d
3 | size 4548
4 | 


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/class_norm_ss_renamed.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:eb63cc354a1a1c3c3961971d4d75a1cbaa126a18f5fc80cb064cb2885a625d58
3 | size 4578
4 | 


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/class_raw.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:3f4125bc09e5b15cdcb1b3f6cf9471f485a583b906b7393e66d68f881895f25a
3 | size 3153
4 | 


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/class_raw_renamed.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:3bf2f6cb66c3ef3689b83955077fb4967f8eee2e0a931fb67cb0e9d5cc1c5acf
3 | size 3171
4 | 


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/level1_norm_all.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:498aab50552e37c89cd29109718c4b6d9905c23e8bcc285fee4e2f2122bc1e05
3 | size 11863
4 | 


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/level1_norm_all_renamed.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:3a1f53cba5dd1aee5cec3fcc2a87b414a8fc6dee2dc959e2f2c027294cddb61d
3 | size 11897
4 | 


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/level1_norm_ss.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:26d50896610e8ade7ad7168e55e5685716a5b8d341952cb86ed64f4320b73dc6
3 | size 11844
4 | 


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/level1_norm_ss_renamed.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:09e3d6f9086406c040574a01176caa1af841f6dbe2421142453048ce8ed98cb8
3 | size 11878
4 | 


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/level1_raw.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:abe36aba85f88a9587ec3dc1ae9e1cce323c23fded8d04f70b8f5f973617164b
3 | size 6837
4 | 


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/level1_raw_renamed.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:18f7651b1b8dd08a956262605d85cfaaa029593ca1b6d86a5a91b27ab5772e98
3 | size 6856
4 | 


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/level2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/subsystems/level2.png


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/level2_norm_all.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:ee134eecf83b7eec83ca1c5e87cbe7ebee1ad853716b63e9724613331f80249a
3 | size 49983
4 | 


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/level2_norm_all_renamed.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:c50aff3a6902a98a1b9bc5e66e28e57a591ad2d14c1c19f4ba5b39a874c04f96
3 | size 50030
4 | 


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/level2_norm_ss.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:2601d0e377ebf2119a0a43e21147b58f75d86e3dd83b34da55ce5f32022304e0
3 | size 50114
4 | 


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/level2_norm_ss_renamed.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:d2e6ab18718b9dd843f26420187ca772c505ded6636a6c2173555e16bc5166da
3 | size 50153
4 | 


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/level2_raw.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:91df8242cb4bd16ad574b4f20b4cd3e00a0c7b675d3745370e64d287f76c0451
3 | size 21481
4 | 


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/level2_raw_renamed.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:39632352c97256c26af5de894a68d7f4d6bebabd66c075479c6613e9dd49759f
3 | size 21500
4 | 


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/normalised_subsystem_level2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/subsystems/normalised_subsystem_level2.png


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/pca_by_approach.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/subsystems/pca_by_approach.png


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/pca_by_filter.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/subsystems/pca_by_filter.png


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/pca_by_method.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/subsystems/pca_by_method.png


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/pca_by_replicate.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/subsystems/pca_by_replicate.png


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/pca_by_sample.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/subsystems/pca_by_sample.png


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/pca_combined_replicates.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/subsystems/pca_combined_replicates.png


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/subsystems_norm_all.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:5157a6de41c6c9ad430076ab1772079bb943a17c579ffa9940231c69afefc31c
3 | size 225791
4 | 


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/subsystems_norm_all_renamed.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:88facbf9b39609a93c7f83e448b57e2c07560f8486269fa55bfbee0b4f448d89
3 | size 225818
4 | 


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/subsystems_norm_ss.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:b76508435fa599cd36b9c7f86c7b2b89af92160084d9e649bb349b81bb8a478a
3 | size 226313
4 | 


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/subsystems_norm_ss_renamed.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:b90fdf723679ebd12ba035d121a6906dbec200b81d3bd03c8bac8cbe11b987f4
3 | size 226359
4 | 


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/subsystems_raw.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:5794567819778d270611d5b2b2a683dd457c77041296044ff1631c468b470624
3 | size 79893
4 | 


--------------------------------------------------------------------------------
/jupyter/sarah_data/subsystems/subsystems_raw_renamed.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:0210849eaf89ba559b6f10738be5787c96bfcf6dc9b02cffc2d71d79ed79481b
3 | size 79688
4 | 


--------------------------------------------------------------------------------
/jupyter/subsystems_data.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:db34618a583198795f2624f590d004843af584530c9c22d82e977f0615e50fae
3 | size 656607
4 | 


--------------------------------------------------------------------------------
/jupyter/subsystems_data_all.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:0ead7593972d777acce4b91d5cfad642f6edeedfdd6f25d7d941b9194e14ef34
3 | size 656559
4 | 


--------------------------------------------------------------------------------
/jupyter/taxonomy_data.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:8df2e6cf456fdda362f72c5518c6e2a657fc59a03e7a12937507a0832c7bd3ee
3 | size 3293678
4 | 


--------------------------------------------------------------------------------
/jupyter/taxonomy_data_all.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:af7b5210010a96b006021d5222ba380ba38e18f0a73f7619a8c560b210fd2a24
3 | size 3292856
4 | 


--------------------------------------------------------------------------------
/jupyter/test2.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": 2,
 6 |    "metadata": {
 7 |     "collapsed": true
 8 |    },
 9 |    "outputs": [],
10 |    "source": [
11 |     "import os"
12 |    ]
13 |   }
14 |  ],
15 |  "metadata": {
16 |   "kernelspec": {
17 |    "display_name": "Python 3",
18 |    "language": "python",
19 |    "name": "python3"
20 |   },
21 |   "language_info": {
22 |    "codemirror_mode": {
23 |     "name": "ipython",
24 |     "version": 2
25 |    },
26 |    "file_extension": ".py",
27 |    "mimetype": "text/x-python",
28 |    "name": "python",
29 |    "nbconvert_exporter": "python",
30 |    "pygments_lexer": "ipython2",
31 |    "version": "2.7.6"
32 |   }
33 |  },
34 |  "nbformat": 4,
35 |  "nbformat_minor": 0
36 | }


--------------------------------------------------------------------------------
/kbase/parse_json.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Parse the JSON file and print some stuff out
 3 | """
 4 | 
 5 | import os
 6 | import sys
 7 | 
 8 | import argparse
 9 | import json
10 | 
11 | import re
12 | 
if __name__ == '__main__':
    # Parse a KBase JSON export and report its top-level keys and contig count.
    parser = argparse.ArgumentParser(description="Parse the JSON file downloaded from KBase")
    parser.add_argument('-f', help='JSON file', required=True)
    args = parser.parse_args()

    # Use a context manager so the file handle is closed as soon as parsing is done.
    with open(args.f) as json_in:
        data = json.load(json_in)

    # print all the keys
    print("\n".join(data.keys()))

    # print keys associated with features
    feats = data['features']

    print("There are " + str(len(data['contig_lengths'])) + " contigs")

    # The script stops here; everything below is kept for reference but never runs.
    sys.exit(0)

    for f in feats:
        #print("\t".join([f['id'], f['function']]))
        for p in f['function'].split(' ; '):
            # Raw string: \s, \d, \- and \. are regex escapes, not Python string escapes.
            m = re.match(r'\s*[\d\-\.]+$', p)
            if m and m.end() == len(p):
                print("\t".join([f['id'], 'EC ' + p]))
           # else:
           #     print("\t".join([f['id'], p]))
38 | 
39 | 
40 | 


--------------------------------------------------------------------------------
/kmers/count_kmers.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Count the 11mers in a sequence
 3 | """
 4 | 
 5 | import os
 6 | import sys
 7 | import argparse
 8 | from itertools import product
 9 | from roblib import read_fasta
10 | from roblib import median
11 | 
parser = argparse.ArgumentParser(description='Count the kmers in a fasta file')
parser.add_argument('-f', help='fasta file', required=True)
parser.add_argument('-s', help='K-mer size, (default=11)', type=int, default=11)
args = parser.parse_args()

fa = read_fasta(args.f)

# Build the list of all possible k-mers once; it is the same for every sequence.
kmers = ["".join(k) for k in product("ATGC", repeat=args.s)]

for seqid in fa:
    # Upper-case the sequence once per record instead of once per k-mer.
    seq = fa[seqid].upper()
    # str.count() counts non-overlapping occurrences of each k-mer.
    count = [seq.count(sk) for sk in kmers]

    print("id: {} len(seq): {} sum: {}  n: {} average: {} median: {} max: {}".format(
        seqid, len(fa[seqid]), sum(count), len(count), (1.0 * sum(count) / len(count)), median(count), max(count)
    ))
28 | 
29 | 
30 | 


--------------------------------------------------------------------------------
/kmers/count_kmers_genbank.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Count the kmers in a genbank file
 3 | """
 4 | 
 5 | 
 6 | import os
 7 | import sys
 8 | import argparse
 9 | from itertools import product
10 | from roblib import median
11 | from Bio import SeqIO
12 | 
parser = argparse.ArgumentParser(description='Count the kmers in a genbank file')
parser.add_argument('-g', help='genbank file', required=True)
parser.add_argument('-s', help='K-mer size, (default=11)', type=int, default=11)
args = parser.parse_args()

# Build the list of all possible k-mers once; it is the same for every record.
kmers = ["".join(k) for k in product("ATGC", repeat=args.s)]

gbk = SeqIO.parse(args.g, 'genbank')
for record in gbk:
    # Upper-case the sequence once per record instead of once per k-mer.
    seq = record.seq.upper()
    count = [seq.count(sk) for sk in kmers]

    # Report the record's own identifier (previously the builtin function `id` was printed).
    print("id: {} len(seq): {} sum: {}  n: {} average: {} median: {} max: {}".format(
        record.id, len(record.seq), sum(count), len(count), (1.0 * sum(count) / len(count)), median(count), max(count)
    ))
28 | 
29 | 
30 | 


--------------------------------------------------------------------------------
/kmers/find_kmers.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Model the frequency that we find different kmers in DNA sequences
 3 | """
 4 | 
 5 | import os
 6 | import sys
 7 | import argparse
 8 | from random import randint
 9 | from itertools import product
10 | from statistics import median
11 | 
12 | __author__ = 'Rob Edwards'
13 | 
def generate_random_seq(length):
    """
    Build a random DNA string, drawing one base per position
    :param length: how many bases to draw
    :return: str
    """

    alphabet = {1: "A", 2: "G", 3: "C", 4: "T"}
    # one randint() call per base, in order
    return "".join(alphabet[randint(1, 4)] for _ in range(length))
26 | 
27 | 
28 | 
#seq = generate_random_seq(1000000)
seq = generate_random_seq(4194304)

# occurrences of every possible 11-mer in the random sequence
count = [seq.count("".join(k)) for k in product("ATGC", repeat=11)]

total = sum(count)
n_kmers = len(count)

print("sum: {}  n: {} average: {} median: {}".format(
    total, n_kmers, (1.0*total/n_kmers), median(count)
))
41 | 
42 | 


--------------------------------------------------------------------------------
/kmers/hashcode.py:
--------------------------------------------------------------------------------
 1 | """
 2 | 
 3 | """
 4 | 
 5 | import os
 6 | import sys
 7 | import argparse
 8 | 
 9 | from roblib import bcolors
10 | 
if __name__ == '__main__':
    # Command-line setup only: the arguments are parsed but nothing is done with them yet.
    # NOTE(review): the description says "Plot a heatmap" although this file is hashcode.py - confirm the intended text.
    parser = argparse.ArgumentParser(description="Plot a heatmap")
    parser.add_argument('-f', help='file', required=True)
    parser.add_argument('-o', help='output file')
    parser.add_argument('-v', help='verbose output', action='store_true')
    args = parser.parse_args()
17 | 


--------------------------------------------------------------------------------
/manipulate_genomes/README.md:
--------------------------------------------------------------------------------
1 | # Filter some reads based on a blast file


--------------------------------------------------------------------------------
/matplotlib graphs/show_fig.py:
--------------------------------------------------------------------------------
import pickle

# Load a previously pickled figure object and display it.
# NOTE(review): pickle.load can execute arbitrary code; only open pickle files you created yourself.
with open('3dfig.pickle', 'rb') as fig_file:
    figx = pickle.load(fig_file)
figx.show()


--------------------------------------------------------------------------------
/mmseqs/dummy_database.sqlite:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:c0238af1292f9b9971da51b632ec5d7f2a75ff20fe039552385e76396010c96a
3 | size 24576
4 | 


--------------------------------------------------------------------------------
/mongodb/find_biomasses.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Find biomasses greater than a value
 3 | """
 4 | 
 5 | import os
 6 | import sys
 7 | 
 8 | from pymongo import MongoClient
 9 | 
databasename = 'fba_models'
collectionname = 'citrobacter'

# Threshold for the biomass compound coefficient. It is now actually used by the
# query below; it is set to 35, the value the query was already using, so the
# results do not change.
value = 35


client = MongoClient()

coll = client[databasename][collectionname]

# Print the file name of every document with a coefficient greater than `value`.
for cursor in coll.find({"content.biomasses.biomasscompounds.coefficient": {'$gt': value}}):
    print(cursor['file_name'])


--------------------------------------------------------------------------------
/mongodb/print_keys.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Print the keys from a file
 3 | """
 4 | 
 5 | import os
 6 | import sys
 7 | 
 8 | import argparse
 9 | import json
10 | 
if __name__ == '__main__':
    # Print every top-level key of a JSON file, one per line.
    parser = argparse.ArgumentParser(description="Print all the keys in a file")
    parser.add_argument('-f', help='Files to print keys from', required=True)
    args = parser.parse_args()

    # Context manager so the file handle is closed once the JSON has been parsed.
    with open(args.f, 'r') as json_in:
        data = json.load(json_in)
    for k in data:
        print(k)


--------------------------------------------------------------------------------
/mongodb/search_mongo.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Search a mongo database loaded with load_models.py
 3 | 'file_name' : '/data/Genotype-Phenotype-Modeling/models/Citrobacter/Citrobacter/models/C.sedlakii_gf_draft_ArgonneLB.json'
 4 | 
 5 | """
 6 | 
 7 | 
 8 | import argparse
 9 | 
10 | from pymongo import MongoClient
11 | 
if __name__ == '__main__':
    # Look up documents by a single key/value pair and list the files they came from.
    parser = argparse.ArgumentParser(description="Search a mongo database")
    for flag, text in (('-n', 'Database name'),
                       ('-c', 'Collection name'),
                       ('-k', 'Key to find'),
                       ('-v', 'Value for key')):
        parser.add_argument(flag, help=text, required=True)
    args = parser.parse_args()

    client = MongoClient()

    db = client[args.n]
    coll = db[args.c]

    query = {args.k: args.v}
    # each item yielded by find() is one matching document
    for cursor in coll.find(query):
        print(cursor['file_name'])
28 | 


--------------------------------------------------------------------------------
/mongodb/simple_find.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Search for something in the database
 3 | 
 4 | """
 5 | 
 6 | import sys
 7 | import os
 8 | 
 9 | import json
10 | from pymongo import MongoClient
11 | 
client = MongoClient()

db = client['fba_models']
coll = db['citrobacter']

# Other example queries from the models data:
#
#     "content.id" :  "contig00112.fbamdl3.fba.90"
#
#     "content.objectiveValue"  : { '$gt' : 10 }
#     "media_ref":"5067/41/1"
#
query = {"media_ref": "5067/41/1"}
for cursor in coll.find(query):
    # each item is one matching document; report the file it was loaded from
    print(cursor['file_name'])
28 | 
29 | 
30 | 


--------------------------------------------------------------------------------
/mongodb/simple_load_models.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Create a mongo database if it doesn't exist and load a bunch of data into it.
 3 | 
 4 | We need a directory with one or more JSON files in it. We look for JSON on the end of the filename.
 5 | 
 6 | e.g. python load_models.py -d /data/Genotype-Phenotype-Modeling/models/Citrobacter/Citrobacter/models/ -n fba_models -c citrobacter
 7 | """
 8 | 
 9 | 
10 | import sys
11 | import os
12 | 
13 | import json
14 | from pymongo import MongoClient
15 | 
client = MongoClient()

db = client['fba_models']
coll = db['citrobacter']

# Directory holding the JSON model files. The original loop referred to an
# undefined `args.d` for this path, which raised NameError on the first file.
model_dir = '/data/Genotype-Phenotype-Modeling/models/Citrobacter/Citrobacter/models/'

for f in os.listdir(model_dir):
    if f.lower().endswith('.json'):
        sys.stderr.write("Loading file " + f + "\n")
        path = os.path.join(model_dir, f)
        # Context manager so each file handle is closed after it has been parsed.
        with open(path) as json_in:
            text = json.load(json_in)
        obj = {'file_name': path, 'content': text}
        # insert_one() replaces Collection.insert(), which was removed in PyMongo 4.
        coll.insert_one(obj)
28 | 
29 | 


--------------------------------------------------------------------------------
/mummer/six_mers.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Create a list of all six mers and filter for those that are also  in the list of reverse complement 6 mers
 3 | """
 4 | 
 5 | import os
 6 | import sys
 7 | import argparse
 8 | 
 9 | from roblib import bcolors
10 | from roblib import rc
11 | 
12 | from itertools import product
13 | 
def countit():
    """
    Enumerate every 6-mer, print those that are missing from the set of
    reverse-complemented 6-mers, and report how many were found in it.
    """
    kmer = 6
    bases = {'A', 'C', 'T', 'G'}

    # all forward 6-mers, and the reverse complement of each of them
    fwd = {"".join(s) for s in product(bases, repeat=kmer)}
    rev = {rc(seq) for seq in fwd}

    count = 0
    for f in fwd:
        if f in rev:
            count += 1
        else:
            print(f)

    print(f"Checked {count}")
33 | 
34 | if __name__ == '__main__':
35 |     countit()


--------------------------------------------------------------------------------
/ncbi/genbank_phages_via_ftp.py:
--------------------------------------------------------------------------------
 1 | import StringIO
 2 | from ftplib import FTP
 3 | import gzip
 4 | from Bio import SeqIO
 5 | 
 6 | r = StringIO.StringIO()
 7 | 
 8 | def read_data(data):
 9 |     r.write(data)
10 | 
11 | ftp = FTP('ftp.ncbi.nlm.nih.gov')
12 | ftp.login()
13 | ftp.cwd('genbank/')
14 | ftp.retrbinary('RETR gbphg3.seq.gz', r.write)
15 | 
16 | r.seek(0)
17 | 
18 | for seq in SeqIO.parse(gzip.GzipFile(fileobj=r), 'genbank'):
19 |     print(seq.id + "\t" + seq.)


--------------------------------------------------------------------------------
/ncbi/get_protein_sequence.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Get a set of protein sequences from NCBI
 3 | """
 4 | 
 5 | import os
 6 | import sys
 7 | 
 8 | import argparse
 9 | 
10 | from Bio import Entrez, SeqIO
11 | from time import sleep
12 | from random import randint
13 | 
if __name__ == '__main__':
    # Fetch one GenBank protein record per ID in the input file and append
    # the raw text of each record to the output file.
    parser = argparse.ArgumentParser(description="Get a set of protein sequences from NCBI")
    parser.add_argument('-f', help='File of IDs to get', required=True)
    parser.add_argument('-o', help='Output file', required=True)
    args = parser.parse_args()

    # retrieve a GI number from GenBank
    Entrez.email = 'raedwards@gmail.com'  # set this so NCBI knows who to complain to

    # Both files are managed by the with-statement, so the output is closed
    # (and flushed) even if a fetch raises part-way through.
    with open(args.o, 'w') as out, open(args.f, 'r') as f:
        for l in f:
            # the ID is the first tab-separated column
            p = l.strip().split("\t")
            if not p[0]:
                # skip blank lines rather than querying NCBI with an empty id
                continue
            handle = Entrez.efetch(db="protein", id=p[0], rettype="gbwithparts", retmode="text")
            try:
                out.write(handle.read())
            finally:
                handle.close()
            # random pause between requests
            sleep(randint(0, 5))
31 | 


--------------------------------------------------------------------------------
/ncbi/tax2spreadsheetdb.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Print the NCBI taxonomy as a spreadsheet
 3 | """
 4 | 
 5 | from taxon import get_taxonomy_db, get_taxonomy, all_species_ids
 6 | 
 7 | want = ['superkingdom', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
 8 | 
 9 | def printtaxa(i, c):
10 |     """
11 |     Print out the taxonomy
12 |     :param i: identifier
13 |     :param c: database connection
14 |     :return:
15 |     """
16 | 
17 |     names = {w: "" for w in want}
18 |     t, n = get_taxonomy(i, c)
19 |     if t.rank in want:
20 |         names[t.rank] = n.get_name()
21 |     while t.parent != 1 and t.taxid != 1:
22 |         t, n = get_taxonomy(t.parent, c)
23 |         if t.rank in want:
24 |             names[t.rank] = n.get_name()
25 |     print("\t".join([str(i)] + [names[w] for w in want]))
26 | 
27 | 
28 | 
29 | if __name__ == '__main__':
30 |     c = get_taxonomy_db()
31 |     for i in all_species_ids(c):
32 |         printtaxa(i[0], c)
33 | 


--------------------------------------------------------------------------------
/ncbi/taxonomy_phylum_kingdom.py:
--------------------------------------------------------------------------------
 1 | from taxon import get_taxonomy_db, get_taxonomy, all_ids
 2 | 
 3 | want = ['species', 'genus', 'family', 'order', 'class', 'phylum', 'kingdom']
 4 | 
def printtaxa(i):
    """
    Print one tab-separated line for taxon `i`: the id, a name, and then one
    column per rank in `want` ('-' where no name was recorded for that rank).

    NOTE(review): `names`, `blastname` and `taxa` are not defined or imported
    anywhere in the visible part of this file, and the script below does not
    call this function - confirm whether it is still needed.

    :param i: taxon identifier used as a key into names / blastname / taxa
    """
    # default to the entry in names; prefer the blastname entry when there is one
    bn=names[i].name
    if i in blastname:
        bn=blastname[i].name
    
    # rank -> name, filled in while walking up through the parents
    level={}
    
    node = i
    # walk towards the root, stopping at node '1' or at a node whose parent is '1'
    while taxa[node].parent != '1' and node != '1':
        if taxa[node].rank in want:
            level[taxa[node].rank]=names[node].name
        node=taxa[node].parent

    print("{}\t{}".format(i, bn), end="")
    for l in want:
        if l in level:
            print("\t{}".format(level[l]), end="")
        else:
            print("\t-", end="")
    # terminate the line
    print("")
25 | 
26 | 
c = get_taxonomy_db()
for i in all_ids(c):
    print (f"{i}")
    t, n = get_taxonomy(i, c)
    # only phyla are traced upwards
    if t.rank != "phylum":
        continue
    # climb through the parents until taxid 1 is reached
    while not (t.parent == 1 or t.taxid == 1):
        t, n = get_taxonomy(t.parent, c)
        print(f"rank: {t.rank} :: name: {t.common_name}")
35 | 
36 | 
37 | 


--------------------------------------------------------------------------------
/percent_pairwise_identity/RecA_uniprot.faa.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:62f8a5cde6d4176269b2a74de0098bca73643e9eff9906f0e24950a68daf7993
3 | size 381127
4 | 


--------------------------------------------------------------------------------
/percent_pairwise_identity/identical_percent_ids.json.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:6df18ec4c0e3f1bd1f615518d2b90ee5b12063c4f5e4d54323499ea36ef4e7f3
3 | size 7501033
4 | 


--------------------------------------------------------------------------------
/percent_pairwise_identity/min_percent_counts.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | 
 3 | sl = [] # sorted list
 4 | with open("RecA_uniprot_pairwise_ids.tsv", 'r') as f:
 5 | #with open("temp", 'r') as f:
 6 |     for l in f:
 7 |         p=l.strip().split("\t")
 8 |         val = float(p[2])
 9 |         added = False
10 |         for i in range(len(sl)):
11 |             if sl[i] > val:
12 |                 sl[i:i]=[val]
13 |                 added=True
14 |                 break
15 |         if not added:
16 |             sl.append(val)
17 | 
18 | indices = [-1 for i in range(100)]
19 | for i in range(len(sl)):
20 |     ni = int(sl[i])+1
21 |     sys.stderr.write("Adding {} for {}\n".format(ni, sl[i]))
22 |     indices[ni]=i
23 | 
24 | print(sl)
25 | 
26 | lastindex=-1
27 | for i in range(50, 70):
28 |     if indices[i] == -1:
29 |         indices[i] = lastindex
30 |     else:
31 |         lastindex = indices[i]
32 |     print("{}\t{}".format(i, indices[i]+1))
33 | 
34 | 


--------------------------------------------------------------------------------
/percent_pairwise_identity/needleman_wunsch-0.3.5/README.md:
--------------------------------------------------------------------------------
 1 | # Needleman-Wunsch global alignment
 2 | 
 3 | This is a fork of software originally written by Isaac Turner and available from [sourceforge](http://sourceforge.net/projects/needlemanwunsch). 
 4 | 
 5 | We have made some adjustments to the code, and changed the input/output formats. This code is copyright Isaac Turner and Katelyn McNair.
 6 | 
 7 | We will be releasing a full version of this software soon!
 8 | 
 9 | ## License
10 | 
11 | This software was originally written under the GPL, and thus we use that license:
12 | 
13 | GNU General Public License (v3 or later).  See LICENSE file.
14 | 


--------------------------------------------------------------------------------
/percent_pairwise_identity/needleman_wunsch-0.3.5/seq1.fna:
--------------------------------------------------------------------------------
1 | >one
2 | actg
3 | 


--------------------------------------------------------------------------------
/percent_pairwise_identity/needleman_wunsch-0.3.5/seq2.fna:
--------------------------------------------------------------------------------
1 | >two
2 | act
3 | 


--------------------------------------------------------------------------------
/percent_pairwise_identity/strain_taxonomy.txt.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:f646c9ee6d0ecd307c6c23a3130ce156f2d220c983a8a81733d797d7479e8bd1
3 | size 131090
4 | 


--------------------------------------------------------------------------------
/perl/SGE-0.02/Changes:
--------------------------------------------------------------------------------
 1 | v 0.01
 2 | 
 3 | Initial release into the wild
 4 | 
 5 | 
 6 | v 0.02 
 7 | 
 8 | October 2005. Received these excellent comments from Hinri Kerstens:
 9 | 
10 | 	Thank you for writing the ScheduleSGE perl module. When using this
11 | 	module I found two 'mismatches' between your system and mine.
12 | 	- My SGE version (5.3) returns "your job" instead of "Your job" after
13 | 	submission of a job. Maybe you can make line 233 of Run.pm tolerant for
14 | 	that.
15 | 	- The CPAN documentation claims that a jobID can be grabbed by  "my
16 | 	$pid=$sge->job_id;", but job_id doesn't exist in the modules. It should
17 | 	be "my $pid=$sge->execute;" isn't it?
18 | 	
19 | 	After these modifications the module runs happily, so keep on the good
20 | 	work.
21 | 	
22 | 	regards
23 | 	
24 | 	Hinri
25 | 
26 | These two bugs have been fixed, and the method job_id added
27 | 
28 | 
29 | 
30 | 


--------------------------------------------------------------------------------
/perl/SGE-0.02/Control/CVS/Entries:
--------------------------------------------------------------------------------
1 | /Control.pm/1.1.1.1/Fri Sep 14 22:49:31 2007//
2 | /MANIFEST/1.1.1.1/Fri Sep 14 22:49:31 2007//
3 | /Makefile.PL/1.1.1.1/Fri Sep 14 22:49:31 2007//
4 | D
5 | 


--------------------------------------------------------------------------------
/perl/SGE-0.02/Control/CVS/Repository:
--------------------------------------------------------------------------------
1 | bioinformatics/Modules/SGE-0.02/Control
2 | 


--------------------------------------------------------------------------------
/perl/SGE-0.02/Control/CVS/Root:
--------------------------------------------------------------------------------
1 | :ext:linsalrob@edwards-sdsu.cvs.sourceforge.net:/cvsroot/edwards-sdsu
2 | 


--------------------------------------------------------------------------------
/perl/SGE-0.02/Control/Control.pm:
--------------------------------------------------------------------------------
 1 | # Schedule::SGE::Control
 2 | 
 3 | # POD docs
 4 | 
 5 | =head1 Schedule::SGE::Control
 6 | 
 7 |  Control jobs on the SGE queues. You should not use this method directly, rather you should use the SGE method that inherits from this, then all the methods herein are available to you.
 8 | 
 9 | =head1 AUTHOR
10 | 
11 |  Rob Edwards (rob@salmonella.org)
12 |  3/24/05
13 | 
14 | =cut
15 | 
16 | package SGE::Control;
17 | use strict;
18 | use Exporter;
19 | 
20 | use vars qw(@ISA @EXPORT_OK);
21 | @ISA = qw(Schedule::SGE Exporter);
22 | @EXPORT_OK = qw(qdel);
23 | our $VERSION = '0.01';
24 | 
25 | =head2 qdel()
26 | 
27 | Delete all jobs belonging to a user from the queue. The user defaults to whoever is running the script (this must be run as the user who owns the jobs)
28 | 
29 | =cut
30 | 
# Run "qdel -u <user>" and print whatever it reports.
# $user defaults to the output of whoami when it is not supplied.
sub qdel {
 my ($self, $user)=@_;
 unless ($user) {$user =`whoami`; chomp($user)}
 # List-form pipe open: $user is handed to qdel as a single argument and is
 # never interpreted by a shell, so metacharacters in it cannot run other commands.
 my $qdel;
 unless (open($qdel, '-|', 'qdel', '-u', $user)) {
  warn "Could not run qdel: $!\n";
  return;
 }
 my @output = <$qdel>;
 close($qdel);
 print @output;
}
36 | 
37 | 
38 | 1;
39 | 


--------------------------------------------------------------------------------
/perl/SGE-0.02/Control/MANIFEST:
--------------------------------------------------------------------------------
1 | Control.pm
2 | Makefile.PL
3 | MANIFEST
4 | 


--------------------------------------------------------------------------------
/perl/SGE-0.02/Control/Makefile.PL:
--------------------------------------------------------------------------------
 1 | use 5.008001;
 2 | use ExtUtils::MakeMaker;
 3 | # See lib/ExtUtils/MakeMaker.pm for details of how to influence
 4 | # the contents of the Makefile that is written.
 5 | WriteMakefile(
 6 |     NAME              => 'Schedule::SGE::Control',
 7 |     VERSION_FROM      => 'Control.pm', # finds $VERSION
 8 |     PREREQ_PM         => {}, # e.g., Module::Name => 1.1
 9 |     ($] >= 5.005 ?     ## Add these new keywords supported since 5.005
10 |       (AUTHOR         => 'rob <Rob@nonet>') : ()),
11 | );
12 | 


--------------------------------------------------------------------------------
/perl/SGE-0.02/Copying:
--------------------------------------------------------------------------------
1 | Copyright (c) 2005 Rob Edwards. All rights reserved. This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself.
2 | 


--------------------------------------------------------------------------------
/perl/SGE-0.02/MANIFEST:
--------------------------------------------------------------------------------
 1 | t/SGE.t
 2 | Control/Control.pm
 3 | Copying
 4 | Run/Run.pm
 5 | Status/Status.pm
 6 | Changes
 7 | README
 8 | examples/README
 9 | examples/submit2cluster.pl
10 | examples/test.pl
11 | 


--------------------------------------------------------------------------------
/perl/SGE-0.02/Makefile.PL:
--------------------------------------------------------------------------------
 1 | use 5.008001;
 2 | use ExtUtils::MakeMaker;
 3 | # See lib/ExtUtils/MakeMaker.pm for details of how to influence
 4 | # the contents of the Makefile that is written.
 5 | # ABSTRACT and AUTHOR need perl >= 5.005; the `use 5.008001` above already guarantees that.
 6 | WriteMakefile(
 7 |     NAME         => 'Schedule::SGE',
 8 |     VERSION_FROM => 'SGE.pm',    # file that $VERSION is read from
 9 |     PREREQ_PM    => {},          # no prerequisite modules declared
10 |     ABSTRACT     => 'Schedule::SGE is a suite of modules for interacting with the Sun Grid Engine. The base module Schedule::SGE handles locating the executables and making sure everything works fine. The three modules Schedule::SGE::Run, Schedule::SGE::Control, and Schedule::SGE::Status are for different interactions with the queues',
11 |     AUTHOR       => 'rob <Rob@nonet>',
12 | );
13 | 


--------------------------------------------------------------------------------
/perl/SGE-0.02/Run/CVS/Entries:
--------------------------------------------------------------------------------
1 | /MANIFEST/1.1.1.1/Fri Sep 14 22:49:31 2007//
2 | /Makefile.PL/1.1.1.1/Fri Sep 14 22:49:31 2007//
3 | /Run.pm/1.1.1.1/Fri Sep 14 22:49:31 2007//
4 | D
5 | 


--------------------------------------------------------------------------------
/perl/SGE-0.02/Run/CVS/Repository:
--------------------------------------------------------------------------------
1 | bioinformatics/Modules/SGE-0.02/Run
2 | 


--------------------------------------------------------------------------------
/perl/SGE-0.02/Run/CVS/Root:
--------------------------------------------------------------------------------
1 | :ext:linsalrob@edwards-sdsu.cvs.sourceforge.net:/cvsroot/edwards-sdsu
2 | 


--------------------------------------------------------------------------------
/perl/SGE-0.02/Run/MANIFEST:
--------------------------------------------------------------------------------
1 | Makefile.PL
2 | MANIFEST
3 | Run.pm
4 | 


--------------------------------------------------------------------------------
/perl/SGE-0.02/Run/Makefile.PL:
--------------------------------------------------------------------------------
 1 | use 5.008001;
 2 | use ExtUtils::MakeMaker;
 3 | # See lib/ExtUtils/MakeMaker.pm for details of how to influence
 4 | # the contents of the Makefile that is written.
 5 | # AUTHOR needs perl >= 5.005; the `use 5.008001` above already guarantees that.
 6 | WriteMakefile(
 7 |     NAME         => 'Schedule::SGE::Run',
 8 |     VERSION_FROM => 'Run.pm',    # file that $VERSION is read from
 9 |     PREREQ_PM    => {},          # no prerequisite modules declared
10 |     AUTHOR       => 'rob <Rob@nonet>',
11 | );
12 | 


--------------------------------------------------------------------------------
/perl/SGE-0.02/Status/CVS/Entries:
--------------------------------------------------------------------------------
1 | /MANIFEST/1.1.1.1/Fri Sep 14 22:49:31 2007//
2 | /Makefile.PL/1.1.1.1/Fri Sep 14 22:49:31 2007//
3 | /Status.pm/1.1.1.1/Fri Sep 14 22:49:31 2007//
4 | D
5 | 


--------------------------------------------------------------------------------
/perl/SGE-0.02/Status/CVS/Repository:
--------------------------------------------------------------------------------
1 | bioinformatics/Modules/SGE-0.02/Status
2 | 


--------------------------------------------------------------------------------
/perl/SGE-0.02/Status/CVS/Root:
--------------------------------------------------------------------------------
1 | :ext:linsalrob@edwards-sdsu.cvs.sourceforge.net:/cvsroot/edwards-sdsu
2 | 


--------------------------------------------------------------------------------
/perl/SGE-0.02/Status/MANIFEST:
--------------------------------------------------------------------------------
1 | Makefile.PL
2 | MANIFEST
3 | Status.pm
4 | 


--------------------------------------------------------------------------------
/perl/SGE-0.02/Status/Makefile.PL:
--------------------------------------------------------------------------------
 1 | use 5.008001;
 2 | use ExtUtils::MakeMaker;
 3 | # See lib/ExtUtils/MakeMaker.pm for details of how to influence
 4 | # the contents of the Makefile that is written.
 5 | # AUTHOR needs perl >= 5.005; the `use 5.008001` above already guarantees that.
 6 | WriteMakefile(
 7 |     NAME         => 'Schedule::SGE::Status',
 8 |     VERSION_FROM => 'Status.pm',    # file that $VERSION is read from
 9 |     PREREQ_PM    => {},             # no prerequisite modules declared
10 |     AUTHOR       => 'rob <Rob@nonet>',
11 | );
12 | 


--------------------------------------------------------------------------------
/perl/SGE-0.02/examples/CVS/Entries:
--------------------------------------------------------------------------------
1 | /README/1.1.1.1/Fri Sep 14 22:49:31 2007//
2 | /submit2cluster.pl/1.1.1.1/Fri Sep 14 22:49:31 2007//
3 | /test.pl/1.1.1.1/Fri Sep 14 22:49:31 2007//
4 | D
5 | 


--------------------------------------------------------------------------------
/perl/SGE-0.02/examples/CVS/Repository:
--------------------------------------------------------------------------------
1 | bioinformatics/Modules/SGE-0.02/examples
2 | 


--------------------------------------------------------------------------------
/perl/SGE-0.02/examples/CVS/Root:
--------------------------------------------------------------------------------
1 | :ext:linsalrob@edwards-sdsu.cvs.sourceforge.net:/cvsroot/edwards-sdsu
2 | 


--------------------------------------------------------------------------------
/perl/SGE-0.02/examples/README:
--------------------------------------------------------------------------------
1 | The script test.pl should max out one node by calculating factorials of large numbers and summing them.
2 | 
3 | You should be able to submit this to the cluster using the script submit2cluster.pl like this:
4 | 
5 | submit2cluster.pl test.pl
6 | 
7 | 
8 | 


--------------------------------------------------------------------------------
/perl/SGE-0.02/examples/test.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl -w
 2 | 
 3 | use strict;
 4 | my @results;
 5 | my @fact = (1);    # memo table used by factorial(); slot 0 holds 0! = 1
 6 | # Busy-work loop: ten passes, each storing 1 + (1! + 2! + ... + i!) for i = 1..6000.
 7 | foreach my $pass (1 .. 10) {
 8 |  foreach my $i (1 .. 6000) {
 9 |   my $sum = 1;
10 |   $sum += factorial($_) foreach (1 .. $i);
11 |   $results[$i] = $sum;
12 |  }
13 | }
14 | 
15 | 
16 | sub factorial {    # memoised n!; relies on the file-level @fact being seeded with 0! = 1
17 |     my ($n) = @_;
18 |     $fact[$n] = $n * factorial($n - 1) unless defined $fact[$n];
19 |     return $fact[$n];
20 | }
21 | 
22 | 
23 | 


--------------------------------------------------------------------------------
/perl/SGE-0.02/t/CVS/Entries:
--------------------------------------------------------------------------------
1 | /SGE.t/1.1.1.1/Fri Sep 14 22:49:31 2007//
2 | D
3 | 


--------------------------------------------------------------------------------
/perl/SGE-0.02/t/CVS/Repository:
--------------------------------------------------------------------------------
1 | bioinformatics/Modules/SGE-0.02/t
2 | 


--------------------------------------------------------------------------------
/perl/SGE-0.02/t/CVS/Root:
--------------------------------------------------------------------------------
1 | :ext:linsalrob@edwards-sdsu.cvs.sourceforge.net:/cvsroot/edwards-sdsu
2 | 


--------------------------------------------------------------------------------
/perl/Teragrid-0.02/Changes:
--------------------------------------------------------------------------------
1 | v 0.01
2 | 
3 | Initial release into the wild, using LWP for web services
4 | 
5 | v 0.02
6 | 
7 | Added better job control, and some examples.
8 | 
9 | 


--------------------------------------------------------------------------------
/perl/Teragrid-0.02/Control/CVS/Entries:
--------------------------------------------------------------------------------
1 | /Control.pm/1.1.1.1/Fri Sep 14 22:49:31 2007//
2 | /MANIFEST/1.1.1.1/Fri Sep 14 22:49:31 2007//
3 | /Makefile.PL/1.1.1.1/Fri Sep 14 22:49:31 2007//
4 | D
5 | 


--------------------------------------------------------------------------------
/perl/Teragrid-0.02/Control/CVS/Repository:
--------------------------------------------------------------------------------
1 | bioinformatics/Modules/Teragrid-0.02/Control
2 | 


--------------------------------------------------------------------------------
/perl/Teragrid-0.02/Control/CVS/Root:
--------------------------------------------------------------------------------
1 | :ext:linsalrob@edwards-sdsu.cvs.sourceforge.net:/cvsroot/edwards-sdsu
2 | 


--------------------------------------------------------------------------------
/perl/Teragrid-0.02/Control/MANIFEST:
--------------------------------------------------------------------------------
1 | Control.pm
2 | Makefile.PL
3 | MANIFEST
4 | 


--------------------------------------------------------------------------------
/perl/Teragrid-0.02/Control/Makefile.PL:
--------------------------------------------------------------------------------
 1 | use 5.008001;
 2 | use ExtUtils::MakeMaker;
 3 | # See lib/ExtUtils/MakeMaker.pm for details of how to influence
 4 | # the contents of the Makefile that is written.
 5 | # AUTHOR needs perl >= 5.005; the `use 5.008001` above already guarantees that.
 6 | WriteMakefile(
 7 |     NAME         => 'TeraGrid::LSGW::Control',
 8 |     VERSION_FROM => 'Control.pm',    # file that $VERSION is read from
 9 |     PREREQ_PM    => {},              # no prerequisite modules declared
10 |     AUTHOR       => 'Rob Edwards <RobE@theFIG.info>',
11 | );
12 | 


--------------------------------------------------------------------------------
/perl/Teragrid-0.02/Copying:
--------------------------------------------------------------------------------
1 | Copyright (c) 2006 Rob Edwards. All rights reserved. This program is free software; you can redistribute it and/or modify it under the same terms as the SEED Toolkit license.
2 | 


--------------------------------------------------------------------------------
/perl/Teragrid-0.02/Jobs/CVS/Entries:
--------------------------------------------------------------------------------
1 | /Jobs.pm/1.1.1.1/Fri Sep 14 22:49:31 2007//
2 | /MANIFEST/1.1.1.1/Fri Sep 14 22:49:31 2007//
3 | /Makefile.PL/1.1.1.1/Fri Sep 14 22:49:31 2007//
4 | D
5 | 


--------------------------------------------------------------------------------
/perl/Teragrid-0.02/Jobs/CVS/Repository:
--------------------------------------------------------------------------------
1 | bioinformatics/Modules/Teragrid-0.02/Jobs
2 | 


--------------------------------------------------------------------------------
/perl/Teragrid-0.02/Jobs/CVS/Root:
--------------------------------------------------------------------------------
1 | :ext:linsalrob@edwards-sdsu.cvs.sourceforge.net:/cvsroot/edwards-sdsu
2 | 


--------------------------------------------------------------------------------
/perl/Teragrid-0.02/Jobs/MANIFEST:
--------------------------------------------------------------------------------
1 | Jobs.pm
2 | Makefile.PL
3 | MANIFEST
4 | 


--------------------------------------------------------------------------------
/perl/Teragrid-0.02/Jobs/Makefile.PL:
--------------------------------------------------------------------------------
 1 | use 5.008001;
 2 | use ExtUtils::MakeMaker;
 3 | # See lib/ExtUtils/MakeMaker.pm for details of how to influence
 4 | # the contents of the Makefile that is written.
 5 | # AUTHOR needs perl >= 5.005; the `use 5.008001` above already guarantees that.
 6 | WriteMakefile(
 7 |     NAME         => 'TeraGrid::LSGW::Jobs',
 8 |     VERSION_FROM => 'Jobs.pm',    # file that $VERSION is read from
 9 |     PREREQ_PM    => {},           # no prerequisite modules declared
10 |     AUTHOR       => 'Rob Edwards <RobE@theFIG.info>',
11 | );
12 | 


--------------------------------------------------------------------------------
/perl/Teragrid-0.02/MANIFEST:
--------------------------------------------------------------------------------
1 | Control/Control.pm
2 | Jobs/Jobs.pm
3 | examples/jobs.pl
4 | examples/blast.pl
5 | Copying
6 | Changes
7 | README
8 | 


--------------------------------------------------------------------------------
/perl/Teragrid-0.02/Makefile.PL:
--------------------------------------------------------------------------------
 1 | use 5.008001;
 2 | use ExtUtils::MakeMaker;
 3 | # See lib/ExtUtils/MakeMaker.pm for details of how to influence
 4 | # the contents of the Makefile that is written.
 5 | # ABSTRACT and AUTHOR need perl >= 5.005; the `use 5.008001` above already guarantees that.
 6 | WriteMakefile(
 7 |     NAME         => 'TeraGrid::LSGW',
 8 |     VERSION_FROM => 'LSGW.pm',    # file that $VERSION is read from
 9 |     PREREQ_PM    => {},           # no prerequisite modules declared
10 |     ABSTRACT     => 'TeraGrid::LSGW, Interact with the TeraGrid Life Sciences Gateway',
11 |     AUTHOR       => 'Rob Edwards <RobE@theFIG.info>',
12 | );
13 | 


--------------------------------------------------------------------------------
/perl/Teragrid-0.02/README:
--------------------------------------------------------------------------------
 1 | README
 2 | 
 3 | TeraGrid::LSGW
 4 | 
 5 | The Life Sciences Gateway to the TeraGrid is being developed to allow biologists access to High Performance Computing. This series of modules is being developed by Rob Edwards and Ivan Judson to assist in submitting jobs to the LSGW.
 6 | 
 7 | To use these modules you'll need an account on the LSGW machine with access to BLAST. You will have to contact Ivan for that.
 8 | 
 9 | 
10 | 
11 | 
12 | 
13 | 


--------------------------------------------------------------------------------
/perl/Teragrid-0.02/examples/CVS/Entries:
--------------------------------------------------------------------------------
1 | /all_jobs.pl/1.1.1.1/Fri Sep 14 22:49:31 2007//
2 | /blast.pl/1.1.1.1/Fri Sep 14 22:49:31 2007//
3 | /job.pl/1.1.1.1/Fri Sep 14 22:49:31 2007//
4 | /job_data.pl/1.1.1.1/Fri Sep 14 22:49:31 2007//
5 | D
6 | 


--------------------------------------------------------------------------------
/perl/Teragrid-0.02/examples/CVS/Repository:
--------------------------------------------------------------------------------
1 | bioinformatics/Modules/Teragrid-0.02/examples
2 | 


--------------------------------------------------------------------------------
/perl/Teragrid-0.02/examples/CVS/Root:
--------------------------------------------------------------------------------
1 | :ext:linsalrob@edwards-sdsu.cvs.sourceforge.net:/cvsroot/edwards-sdsu
2 | 


--------------------------------------------------------------------------------
/perl/Teragrid-0.02/examples/all_jobs.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl -w
 2 | 
 3 | # just print out a list of all jobs
 4 | 
 5 | use strict;
 6 | use lib '/homes/redwards/perl/share/perl/5.8.4/';
 7 | use TeraGrid::LSGW;
 8 | 
 9 | 
10 | my $gateway  = TeraGrid::LSGW->new(-verbose => 2);
11 | my $all_jobs = $gateway->jobs();       # hash ref; only its key count is reported
12 | print STDERR "There are ", scalar(keys %{$all_jobs}), " jobs\n";
13 | my $job_list = $gateway->job_list;     # array ref, printed one element per line
14 | print join("\n", @{$job_list}), "\n";
15 | 


--------------------------------------------------------------------------------
/perl/Teragrid-0.02/examples/job.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl -w
 2 | 
 3 | # Test the jobs interface
 4 | 
 5 | use strict;
 6 | use lib '/homes/redwards/perl/share/perl/5.8.4/';
 7 | use TeraGrid::LSGW;
 8 | 
 9 | 
10 | my $job=shift || die "$0 <job number>\n";
11 | 
12 | my $tg=TeraGrid::LSGW->new(-verbose=>2);
13 | 
14 | # Dump the job's input, output and results into <job>.input, <job>.output and <job>.results.
15 | # Three-arg open with a lexical handle keeps characters in the job name (a leading '>',
16 | # surrounding whitespace) from being interpreted as part of the open mode.
17 | foreach my $part (qw(input output results)) {
18 |     my $file = "$job.$part";
19 |     open(my $out, '>', $file) || die "Can't write to $file: $!";
20 |     print $out join("\n", $tg->$part($job)), "\n";    # calls $tg->input / ->output / ->results
21 |     close($out) || die "Can't close $file: $!";
22 | }
23 | 
24 | 
25 | 
26 | 


--------------------------------------------------------------------------------
/perl/Teragrid-0.02/examples/job_data.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl -w
 2 | 
 3 | # Test the jobs interface
 4 | 
 5 | use strict;
 6 | use lib '/homes/redwards/perl/share/perl/5.8.4/';
 7 | use TeraGrid::LSGW;
 8 | 
 9 | 
10 | # Hard-coded example job id; the commented-out line below is an alternative.
11 | my $ajob="blastx-20060608-20153622";
12 | #my $ajob="blastx-20060612-01534283";
13 | 
14 | my $gateway   = TeraGrid::LSGW->new(-verbose => 2);
15 | my $job_table = $gateway->jobs();
16 | 
17 | # One tab-separated line per job: the key followed by the elements of its array ref.
18 | print join("\t", $_, @{ $job_table->{$_} }), "\n" foreach (keys %{$job_table});
19 | 
20 | print "For job $ajob\n";
21 | print "INPUT\n======\n", $gateway->input($ajob),
22 |       "\nOUTPUT\n======\n", $gateway->output($ajob),
23 |       "\nRESULTS\n======\n", $gateway->results($ajob), "\n";
24 | 
25 | 
26 | 


--------------------------------------------------------------------------------
/perl/alignment/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'redwards'
2 | 
3 | from .edit_distance import edit_distance
4 | 
5 | from .gapped_alignment import gap_alignment
6 | from .dna_alignment import dna_gapped_alignment, dna_score_alignment
7 | from .gapped_alignment2 import gapped_alignment, score_alignment
8 | from local_alignment import local_alignment
9 | 


--------------------------------------------------------------------------------
/phage/envs/phispy.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 |         - bioconda
3 |         - conda-forge
4 | dependencies:
5 |         - phispy
6 |         - rsync
7 |         - curl
8 | 
9 | 


--------------------------------------------------------------------------------
/phage/is_phage_function.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Note that this uses code from phispy!!
 3 | 
 4 | Test whether functions are phages .... or not!
 5 | """
 6 | 
 7 | import os
 8 | import sys
 9 | import argparse
10 | 
11 | from PhiSpyModules import is_phage_func, is_unknown_func
12 | 
13 | def is_phage_hypo(f):
14 |     with open(f, 'r') as fin:
15 |         for l in fin:
16 |             p = l.strip().split("\t")
17 |             if is_phage_func(p[0]):
18 |                 p.append(1)
19 |             else:
20 |                 p.append(0)
21 |             if is_unknown_func(p[0]):
22 |                 p.append(1)
23 |             else:
24 |                 p.append(0)
25 |             print("\t".join(map(str, p)))
26 | 
27 | if __name__ == '__main__':
28 |     parser = argparse.ArgumentParser(description=" ")
29 |     parser.add_argument('-f', help='file', required=True)
30 |     args = parser.parse_args()
31 | 
32 |     is_phage_hypo(args.f)
33 | 


--------------------------------------------------------------------------------
/phage/metagenomes/contig_mv_samples.txt.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:f78e9e819bc0093033ee52d0d26f673b4cb90baf4b36401f3c9af7ee0b920da0
3 | size 8185
4 | 


--------------------------------------------------------------------------------
/phage/metagenomes/contigs_gokushovirus.blastn.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:5395c12a95c71f34cb15b55da063459b9414613aceac190e3291cb1034a1dc37
3 | size 10073
4 | 


--------------------------------------------------------------------------------
/phage/metagenomes/count_contigs.pl:
--------------------------------------------------------------------------------
 1 | use strict;
 2 | use Getopt::Std;
 3 | use Data::Dumper;
 4 | use Rob;
 5 | 
 6 | 
 7 | 
 8 | # Pass 1: mv_samples.txt -- map column 1 to column 2.
 9 | my %sample;
10 | open(IN, "mv_samples.txt") || die "$! mv_samples.txt";
11 | while (my $row = <IN>) {
12 | 	chomp $row;
13 | 	my @col = split /\t/, $row;
14 | 	$sample{$col[0]} = $col[1];
15 | }
16 | close IN;
17 | 
18 | # Pass 2: mv_sequences.txt -- rows whose first column has a (true) entry in %sample are
19 | # echoed with that entry appended; rows starting with "contig" are held back for the
20 | # final report; anything else is reported on STDERR.
21 | my %contigs;
22 | open(IN, "mv_sequences.txt") || die "$! mv_sequences.txt";
23 | while (my $row = <IN>) {
24 | 	chomp $row;
25 | 	my @col = split /\t/, $row;
26 | 	if ($sample{$col[0]}) {
27 | 		print "$row\t$sample{$col[0]}\n";
28 | 	} elsif ($row =~ /^contig/) {
29 | 		$contigs{$col[0]} = $row;
30 | 	} else {
31 | 		print STDERR "Huh? $row\n";
32 | 	}
33 | }
34 | close IN;
35 | 
36 | # Pass 3: contig_count_table.tsv.gz -- for every row with a positive fourth column,
37 | # record that column 2 (looked up by contig id below) was seen with column 1.
38 | my $contsamps;
39 | open(IN, "gunzip -c contig_count_table.tsv.gz|") || die "$! contig_count_table.tsv.gz";
40 | while (my $row = <IN>) {
41 | 	chomp $row;
42 | 	my @col = split /\t/, $row;
43 | 	$contsamps->{$col[1]}->{$col[0]} = 1 if ($col[3] > 0);
44 | }
45 | close IN;
46 | 
47 | # Report: each held-back contig row plus the number of distinct column-1 values it was seen with.
48 | foreach my $id (keys %contigs) {
49 | 	my $seen = $contsamps->{$id};
50 | 	print "$contigs{$id}\t", ($seen ? scalar(keys %{$seen}) : "UNKNOWN"), "\n";
51 | }
52 | 


--------------------------------------------------------------------------------
/phage/metagenomes/count_contigs2.pl:
--------------------------------------------------------------------------------
 1 | use strict;
 2 | use Getopt::Std;
 3 | use Data::Dumper;
 4 | use Rob;
 5 | 
 6 | # The first version over-counted: a hit to several viruses was counted more than once.
 7 | # What we need instead is which samples contain each class (e.g. the gokushoviruses).
 8 | #
 9 | 
10 | # Load the contig information first; the phables information is handled separately.
11 | #
12 | 
13 | my %class;    # column 1 => column 2 of mv_sequences.txt
14 | open(IN, "mv_sequences.txt") || die "$! mv_sequences.txt";
15 | while (my $row = <IN>) {
16 | 	chomp $row;
17 | 	my @col = split /\t/, $row;
18 | 	$class{$col[0]} = $col[1];
19 | }
20 | close IN;
21 | 
22 | my $count;    # class => { column 1 of the count table => 1 }
23 | open(IN, "gunzip -c contig_count_table.tsv.gz|") || die "$! contig_count_table.tsv.gz";
24 | while (my $row = <IN>) {
25 | 	chomp $row;
26 | 	my @col = split /\t/, $row;
27 | 	next if (!$col[3] || !$class{$col[1]});
28 | 	$count->{$class{$col[1]}}->{$col[0]} = 1;
29 | }
30 | close IN;
31 | 
32 | # one "class<TAB>sample" line per distinct pair
33 | foreach my $group (keys %$count) {
34 | 	print "$group\t$_\n" foreach (keys %{$count->{$group}});
35 | }
36 | 


--------------------------------------------------------------------------------
/phage/metagenomes/count_phables.pl:
--------------------------------------------------------------------------------
 1 | use strict;
 2 | use Getopt::Std;
 3 | use Data::Dumper;
 4 | 
 5 | 
 6 | # column 1 => column 2 (the class) of mv_sequences.txt
 7 | my %class;
 8 | open(IN, "mv_sequences.txt") || die "$! mv_sequences.txt";
 9 | while (my $row = <IN>) {
10 | 	chomp $row;
11 | 	my @col = split /\t/, $row;
12 | 	$class{$col[0]} = $col[1];
13 | }
14 | close IN;
15 | 
16 | # class => { column header => 1 } for every true cell of a classified row
17 | my $count;
18 | my $header;    # array ref holding the fields of the first line
19 | open(IN, "sample_genome_read_counts.tsv") || die "$! sample_genome_read_counts.tsv";
20 | while (my $row = <IN>) {
21 | 	chomp $row;
22 | 	my @col = split /\t/, $row;
23 | 	if (!$header) {
24 | 		$header = \@col;
25 | 		next;
26 | 	}
27 | 	my $group = $class{$col[0]};
28 | 	next unless ($group);
29 | 	foreach my $i (1 .. $#col) {
30 | 		$count->{$group}->{$header->[$i]} = 1 if ($col[$i]);
31 | 	}
32 | }
33 | close IN;
34 | 
35 | foreach my $group (keys %$count) {
36 | 	print "$group\t$_\n" foreach (keys %{$count->{$group}});
37 | }
38 | 


--------------------------------------------------------------------------------
/phage/metagenomes/crass_contigs.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:064a7286f28153595450f0fed1ff5180cd518f37b37ab386f10f8f947d576974
3 | size 384917
4 | 


--------------------------------------------------------------------------------
/phage/metagenomes/crassphage_1percent.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/phage/metagenomes/crassphage_1percent.png


--------------------------------------------------------------------------------
/phage/metagenomes/crassus_results.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:b560b3f4b65d3bd19f50fcd9fd8014199b391709945391949591c6d78565a24f
3 | size 3528
4 | 


--------------------------------------------------------------------------------
/phage/metagenomes/find_pb_segment2.pl:
--------------------------------------------------------------------------------
 1 | =pod
 2 | 
 3 | contig_199070 is only present in these samples: '35536', '35613', '35634', '35658', '38046'
 4 | 
 5 | Can we find any more contigs that are only present in those samples, and not present elsewhere?
 6 | 
 7 | =cut
 8 | 
 9 | use strict;
10 | 
11 | open(IN, "gunzip -c contig_count_table.tsv.gz|") || die "$! contig_count_table.tsv.gz";
12 | my $h = 1;
13 | 
14 | # the samples that contig_199070 is restricted to
15 | my %w = map { $_ => 1 } qw(35536 35613 35634 35658 38046);
16 | 
17 | my %want;     # contig => rows with a positive count in one of the wanted samples
18 | my %other;    # contig => rows with a positive count in any other sample
19 | 
20 | while (<IN>) {
21 | 	if ($h) {$h=0; next}    # skip the header row
22 | 	chomp;                  # otherwise a final-column "0\n" is a true string
23 | 	my @a=split /\t/;
24 | 	next unless (defined $a[3] && $a[3] > 0);
25 | 	if ($w{$a[0]}) {
26 | 		$want{$a[1]}++;
27 | 	} else {
28 | 		$other{$a[1]}++;
29 | 	}
30 | }
31 | close IN;
32 | 
33 | # Report each contig counted once per wanted sample (assuming one row per sample/contig
34 | # pair), with its count in the other samples; 0 means it is exclusive to the wanted set.
35 | my $n_wanted = scalar(keys %w);
36 | print "Contig\tWAnted\tOther samples\n";
37 | foreach my $c (keys %want) {
38 | 	if ($want{$c} == $n_wanted) {
39 | 		my $elsewhere = $other{$c} || 0;
40 | 		print "$c\t$want{$c}\t$elsewhere\n";
41 | 	}
42 | }
43 | 
44 | 
45 | 


--------------------------------------------------------------------------------
/phage/metagenomes/freezer.txt.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:479ab0783ab0477523b812ba3b3e9c0dcd5c33b5942b515164363b80e7f86d19
3 | size 280
4 | 


--------------------------------------------------------------------------------
/phage/metagenomes/hist.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/phage/metagenomes/hist.png


--------------------------------------------------------------------------------
/phage/metagenomes/hist_1percent.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/phage/metagenomes/hist_1percent.png


--------------------------------------------------------------------------------
/phage/metagenomes/ibd_16s/RC2_16S_IBD_OTU.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:5b347228726cda60a3bef5ee13b8fc58a50615d0553bd8e0b06e735341e81e4b
3 | size 847840
4 | 


--------------------------------------------------------------------------------
/phage/metagenomes/ibd_16s/RC2_16S_IBD_metadata.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:e9c37a2d86b1ca3babcae3f1b5effed7ae322eb39bdffbf54c70749f29234927
3 | size 13113
4 | 


--------------------------------------------------------------------------------
/phage/metagenomes/ibd_16s/RC2_16S_IBD_taxadata.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:e6c7ccc8827c1312dadf3120fde5a4f80f5a39bbac00349c5d341220762df75b
3 | size 348542
4 | 


--------------------------------------------------------------------------------
/phage/metagenomes/join_vir.pl:
--------------------------------------------------------------------------------
 1 | use strict;
 2 | use Getopt::Std;
 3 | use Data::Dumper;
 4 | use Rob;
 5 | my %count;    # column 1 => column 2 of most_abundant.txt (presumably the output of most_abundant.pl)
 6 | open(IN, "most_abundant.txt") || die "$! most_abundant.txt";
 7 | while (<IN>) {
 8 | 	chomp;
 9 | 	my @a=split /\t/;
10 | 	$count{$a[0]}=$a[1];
11 | }
12 | close IN;
13 | 
14 | # Insert a "Samples" column after the first column of the annotation table; 0 when there is no (true) count.
15 | open(IN, "virus_contig_annotations.tsv") || die "$! virus_contig_annotations.tsv";
16 | while (<IN>) {
17 | 	my @a=split /\t/;    # no chomp: the newline stays on the last field and is printed back
18 | 	my $samples = ($a[0] eq "contigID") ? "Samples" : ($count{$a[0]} || 0);
19 | 	splice @a, 1, 0, $samples;
20 | 	print join("\t", @a);
21 | }
22 | close IN;
23 | 


--------------------------------------------------------------------------------
/phage/metagenomes/limit_contigs.pl:
--------------------------------------------------------------------------------
 1 | use strict;
 2 | use Getopt::Std;
 3 | use Data::Dumper;
 4 | 
 5 | my %w;    # values of the second column of crassus_results.tsv
 6 | open(IN, "crassus_results.tsv") || die "$! crassus_results.tsv";
 7 | while (<IN>) {
 8 | 	chomp;
 9 | 	my @a=split /\t/;
10 | 	$w{$a[1]}=1;
11 | }
12 | close IN;
13 | 
14 | open(IN, "gunzip -c contig_count_table.tsv.gz|") || die "$! contig_count_table.tsv.gz";
15 | my $h = 1;
16 | while (<IN>) {
17 | 	if ($h) {print; $h=0; next}    # pass the header row through unchanged
18 | 	my @a=split /\t/;
19 | 	# numeric test: the line is not chomped, so a final-column "0\n" would be a true string
20 | 	print if ($w{$a[1]} && defined $a[3] && $a[3] > 0);
21 | }
22 | close IN;


--------------------------------------------------------------------------------
/phage/metagenomes/microviridae.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/phage/metagenomes/microviridae.png


--------------------------------------------------------------------------------
/phage/metagenomes/microviridae58782.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/phage/metagenomes/microviridae58782.png


--------------------------------------------------------------------------------
/phage/metagenomes/microviridae_correlations.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:977e6c6dd4cb7c9686185c2e34df51c180dcad6383f7e78bfc10bd40642905c5
3 | size 1022545
4 | 


--------------------------------------------------------------------------------
/phage/metagenomes/microvirus_contig_count_table.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:0fbbbe8c199f278c95d58ef09a4af71fa4fe52c11491cf1704a7454032ba6544
3 | size 89605
4 | 


--------------------------------------------------------------------------------
/phage/metagenomes/microvirus_contigs.pl:
--------------------------------------------------------------------------------
 1 | use strict;
 2 | use Getopt::Std;
 3 | use Data::Dumper;
 4 | 
 5 | my %w;    # values of the first column of contigs_gokushovirus.blastn.gz
 6 | open(IN, "gunzip -c contigs_gokushovirus.blastn.gz|") || die "$! contigs_gokushovirus.blastn.gz";
 7 | while (<IN>) {
 8 | 	chomp;
 9 | 	my @a=split /\t/;
10 | 	$w{$a[0]}=1;
11 | }
12 | close IN;
13 | 
14 | open(IN, "gunzip -c contig_count_table.tsv.gz|") || die "$! contig_count_table.tsv.gz";
15 | my $h = 1;
16 | while (<IN>) {
17 | 	if ($h) {print; $h=0; next}    # pass the header row through unchanged
18 | 	my @a=split /\t/;
19 | 	# numeric test: the line is not chomped, so a final-column "0\n" would be a true string
20 | 	print if ($w{$a[1]} && defined $a[3] && $a[3] > 0);
21 | }
22 | close IN;


--------------------------------------------------------------------------------
/phage/metagenomes/most_abundant.pl:
--------------------------------------------------------------------------------
 1 | 
 2 | use strict;
 3 | use Getopt::Std;
 4 | use Data::Dumper;
 5 | use Rob;
 6 | 
 7 | # What are the most abundant contigs in terms of numbers of samples
 8 | # they are in
 9 | #
10 | 
11 | 
12 | my $count;    # column 2 (contig) => { column 1 (sample) => 1 }
13 | open(IN, "gunzip -c contig_count_table.tsv.gz|") || die "$! contig_count_table.tsv.gz";
14 | while (my $row = <IN>) {
15 | 	chomp $row;
16 | 	my @col = split /\t/, $row;
17 | 	$count->{$col[1]}->{$col[0]} = 1 if ($col[3]);
18 | }
19 | close IN;
20 | 
21 | # one line per contig: its id and the number of distinct samples it was seen in
22 | foreach my $id (keys %$count) {
23 | 	print "$id\t", scalar(keys %{$count->{$id}}), "\n";
24 | }
25 | 


--------------------------------------------------------------------------------
/phage/metagenomes/most_abundant.txt.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:1398cd70fafc725457a49f0c5c45fba3d0d168af79c92252fe35a1270fa7b85e
3 | size 1192386
4 | 


--------------------------------------------------------------------------------
/phage/metagenomes/mv_gen_cont_samples.txt.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:baf3dee4372cd84717f94ee59b0471ece0740694a61edf6f38f4fa63feeb7d1e
3 | size 3760
4 | 


--------------------------------------------------------------------------------
/phage/metagenomes/mv_samples.txt.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:8979d93cd22c47a4377006ba1437fde11748b3191af0b07014ef9acaa9379108
3 | size 7844
4 | 


--------------------------------------------------------------------------------
/phage/metagenomes/mv_sequences.txt.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:d85fc8224fea65b05351cabc4b79931f363c978c965bcb4180fad719cb606ed5
3 | size 2410
4 | 


--------------------------------------------------------------------------------
/phage/metagenomes/our_crassphage.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:f221f60ac80a7b344deddcf360e46933a79f23ba722b14f7c3140024594d6ab0
3 | size 17184
4 | 


--------------------------------------------------------------------------------
/phage/metagenomes/pb199070.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/phage/metagenomes/pb199070.png


--------------------------------------------------------------------------------
/phage/metagenomes/pb199070_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/phage/metagenomes/pb199070_2.png


--------------------------------------------------------------------------------
/phage/metagenomes/pb199070_both.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/phage/metagenomes/pb199070_both.png


--------------------------------------------------------------------------------
/phage/metagenomes/pb58328.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/phage/metagenomes/pb58328.png


--------------------------------------------------------------------------------
/phage/metagenomes/phables_mv_samples.txt.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:c9a0aa9043ddd6c5dbf1f9736bec82de7f0ef5b2a30df0455fc098b93a04263c
3 | size 6722
4 | 


--------------------------------------------------------------------------------
/phage/metagenomes/pharokka_top_hits_mash_inphared.nonone.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:5c995afbd162130b757b5845082d635a233c6273c0e93022ebf6a7604e9f0c6e
3 | size 192651
4 | 


--------------------------------------------------------------------------------
/phage/metagenomes/pharokka_top_hits_mash_inphared.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:7d580c96d99db3ebf4d90ecf5e1fa3036e8438ae8a7a26f2b2b8e04623d5c47d
3 | size 428184
4 | 


--------------------------------------------------------------------------------
/phage/metagenomes/picobirnaviridae edited.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/phage/metagenomes/picobirnaviridae edited.png


--------------------------------------------------------------------------------
/phage/metagenomes/picobirnaviridae_contig_count_table.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:7777c6af305699c3dfffbb249db9e051e5e94e0df2ffebe5e3619d2a26cee181
3 | size 14971
4 | 


--------------------------------------------------------------------------------
/phage/metagenomes/picobirnaviridae_contigs.pl:
--------------------------------------------------------------------------------
 1 | use strict;
 2 | use Getopt::Std;
 3 | use Data::Dumper;
 4 | 
 5 | my %w;
 6 | open(IN, "gunzip -c picobirnaviridae_rdrp.tblastn.gz |") || die "$! picobirnaviridae_rdrp.tblastn.gz";
 7 | while (<IN>) {
 8 | 	chomp;
 9 | 	my @a=split /\t/;
10 | 	$w{$a[1]}=1;
11 | }
12 | close IN;
13 | 
14 | open(IN, "gunzip -c contig_count_table.tsv.gz|") || die "$! contig_count_table.tsv.gz";
15 | my $h = 1;
16 | while (<IN>) {
17 | 	if ($h) {print; $h=0; next}
18 | 	my @a=split /\t/;
19 | 	print if ($w{$a[1]} && $a[3]);
20 | }
21 | close IN;
22 | 


--------------------------------------------------------------------------------
/phage/metagenomes/picobirnaviridae_correlations.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:fb327a9e4c8eda037dfffcc6ad9ceeb3f35887a59ce75ee78972be408c5a5171
3 | size 360246
4 | 


--------------------------------------------------------------------------------
/phage/metagenomes/picobirnaviridae_rdrp.tblastn.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:d7e4dca4db1a61ec441da41739ef0ff227675e8861fd5e3305863682a2485d90
3 | size 1194
4 | 


--------------------------------------------------------------------------------
/phage/metagenomes/rdrp.tblastn.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:b627e3bb791c31a3e057379a6fb517f6f9fa0efaa3e54504b0f375ee42ca883b
3 | size 1391
4 | 


--------------------------------------------------------------------------------
/phage/metagenomes/rdrp_contig_count_table.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:af44802f46c8312f0edeac6693312dd6c60785bdaf306d289349dea61ba1df9b
3 | size 16733
4 | 


--------------------------------------------------------------------------------
/phage/metagenomes/rdrp_contigs.pl:
--------------------------------------------------------------------------------
 1 | use strict;
 2 | use Getopt::Std;
 3 | use Data::Dumper;
 4 | 
 5 | my %w;
 6 | open(IN, "gunzip -c rdrp.tblastn.tsv.gz |") || die "$! rdrp.tblastn.tsv.gz";
 7 | while (<IN>) {
 8 | 	chomp;
 9 | 	my @a=split /\t/;
10 | 	$w{$a[1]}=1;
11 | }
12 | close IN;
13 | 
14 | open(IN, "gunzip -c contig_count_table.tsv.gz|") || die "$! contig_count_table.tsv.gz";
15 | my $h = 1;
16 | while (<IN>) {
17 | 	if ($h) {print; $h=0; next}
18 | 	my @a=split /\t/;
19 | 	print if ($w{$a[1]} && $a[3]);
20 | }
21 | close IN;
22 | 


--------------------------------------------------------------------------------
/phage/metagenomes/rdrp_wehave.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:443a327f46ad6857a7770b00ee320b93f334d8c1501c33ca285db1b392c02b0e
3 | size 757
4 | 


--------------------------------------------------------------------------------
/phage/metagenomes/sampleSeqCounts.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:b631e6bf9d4e71d5215c90f34f728918ac6aeeee7b54d5c2464ef2888eca7dfd
3 | size 6888
4 | 


--------------------------------------------------------------------------------
/phage/metagenomes/sample_genome_read_counts.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:6b8262dda1891f5df6db39c95a80c77c20039e61ce85b7fb966b8dec3832f82a
3 | size 248549
4 | 


--------------------------------------------------------------------------------
/phage/metagenomes/virus_contig_annotations.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:a8a99a180550f1efa60633903c3f106e6756a0515b087c50aa734283b30c0949
3 | size 1028088
4 | 


--------------------------------------------------------------------------------
/phage/metagenomes/virus_contig_annotations_samples.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:22eb3940f17e72b095804f8c4a25d39242a150b400a10491eadb5e220dc12f81
3 | size 1056205
4 | 


--------------------------------------------------------------------------------
/phage/phage_quality_config.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 | output_paths:
 3 |     contigs:   contigs
 4 |     orfs:      phanotate
 5 |     blast:     blast
 6 |     databases: /home3/redwards/phage/Sequencing/databases
 7 |     statistics: stats
 8 |     results: results
 9 | 
10 | executable_paths:
11 |     blast: /usr/local/blast+/bin/
12 | 
13 | databases:
14 |     phage_proteins: phages.faa
15 |     bacterial_proteins: bacteria.clusters.faa
16 |     nr: /home2/db/blast/nr/nr
17 |     phage_cluster_database: /home3/redwards/phage/genbank_phages/20200228/clusters.sql
18 | 


--------------------------------------------------------------------------------
/phage/submit_phispy_vogs_download.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | DATE=20220525
 4 | ASS=$DATE/assembly_summary_$DATE.txt.gz
 5 | VOGS=/home3/redwards/VOGs/VOGs.hmm 
 6 | 
 7 | NEED=0000$SGE_TASK_ID
 8 | NEED=${NEED:(-4)}
 9 | 
10 | snakemake -s ~/GitHubs/EdwardsLab/phage/phispy_vogs_download.snakefile --config filelist=$DATE/needed/x$NEED gbk=$DATE/gbk output=$DATE/phispy assembly=$ASS vogs=$VOGS --profile sge 
11 | 


--------------------------------------------------------------------------------
/phage_clustering/bit_score_by_len.py:
--------------------------------------------------------------------------------
 1 | """
 2 | 
 3 | """
 4 | 
 5 | import os
 6 | import sys
 7 | import argparse
 8 | from roblib import bcolors, stream_blast_results
 9 | __author__ = 'Rob Edwards'
10 | 
11 | 
12 | 
def bit_scores_len(blastf, verbose=False):
    """
    Print the query length and bit score of every self hit in a blast file.

    For each result yielded by stream_blast_results() whose query and db
    attributes are equal, one line "query_length<TAB>bitscore" is written to
    standard output. Nothing is collected or returned.

    :param blastf: the blast file, handed to stream_blast_results()
    :param verbose: passed through to stream_blast_results()
    """

    self_hits = (
        hit for hit in stream_blast_results(blastf, verbose)
        if hit.query == hit.db
    )
    for hit in self_hits:
        print(f"{hit.query_length}\t{hit.bitscore}")
21 | 
if __name__ == "__main__":
    # Command line entry point: -b (required) is the blast file and -v is the
    # verbose flag; both are handed unchanged to bit_scores_len().
    parser = argparse.ArgumentParser(description=' ')
    parser.add_argument('-b', help='blast input file', required=True)
    parser.add_argument('-v', help='verbose output', action='store_true')
    args = parser.parse_args()

    bit_scores_len(args.b, args.v)
29 | 


--------------------------------------------------------------------------------
/process_EK_metagenomes/bin/compress.slurm:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH --job-name=CmprssOutput
 3 | #SBATCH --time=5-0
 4 | #SBATCH --ntasks=1
 5 | #SBATCH --cpus-per-task=1
 6 | #SBATCH --mem=2G
 7 | #SBATCH -o cmprss-%j.out
 8 | #SBATCH -e cmprss-%j.err
 9 | 
10 | set -euo pipefail
11 | # here the brackets are required otherwise -exec will only do the last one
12 | find ReadAnnotations/ \( -name "*.tsv" -o -name "*.m8" -o -name "*.xls" \) -exec pigz {} \;
13 | 
14 | 


--------------------------------------------------------------------------------
/process_EK_metagenomes/bin/merge_counts.pl:
--------------------------------------------------------------------------------
 1 | use strict;
 2 | use Getopt::Std;
 3 | use Data::Dumper;
 4 | use Rob;
 5 | my $rob = new Rob;
 6 | 
 7 | my $reads;
 8 | my %alltypes;
 9 | my %allsamples;
10 | foreach my $f (@ARGV) {
11 | 
12 | 	open(IN, $f) || die "$! : $f";
13 | 	while (<IN>) {
14 | 		chomp;
15 | 		s/.001.fast..gz//;# replace fastq or fasta
16 | 		s/.fast..gz//;
17 | 		s#//#/#g;
18 | 		my @a=split /\t/;
19 | 		my @b = split /\//, $a[0];
20 | 		$reads->{$b[1]}->{$b[0]}=$a[1];
21 | 		$allsamples{$b[1]}++;
22 | 		$alltypes{$b[0]}++;
23 | 	}
24 | 	close IN;
25 | }
26 | 
27 | my @types = sort {$a cmp $b} keys %alltypes;
28 | my @samples = sort {$a cmp $b} keys %allsamples;
29 | 
30 | print("Sample\t", join("\t", @types), "\n");
31 | foreach my $s (@samples) {
32 | 	print $s;
33 | 	foreach my $t (@types) {
34 | 		print "\t";
35 | 		(defined $reads->{$s}->{$t}) ? print $reads->{$s}->{$t} : print "not found";
36 | 	}
37 | 	print "\n";
38 | }
39 | 
40 | 
41 | 


--------------------------------------------------------------------------------
/process_EK_metagenomes/count_fastq.slurm:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH --job-name=countfastq
 3 | #SBATCH --time=0-1
 4 | #SBATCH --ntasks=1
 5 | #SBATCH --cpus-per-task=1
 6 | #SBATCH --mem=4G
 7 | #SBATCH -o count_fastq-%j.out
 8 | #SBATCH -e count_fastq-%j.err
 9 | #SBATCH --partition=short
10 | 
11 | DIR=$1
12 | 
13 | if [[ -z $DIR ]]; then 
14 | 	echo "sbatch count_fastq.slurm <directory name>" >&2;
15 | 	echo "You can also use a single fastq file, I suppose" >&2;
16 | 	exit 1;
17 | fi
18 | 
19 | # This requires count_fastq from my EdwardsLab github to be installed
20 | 
21 | count_fastq $DIR
22 | 


--------------------------------------------------------------------------------
/process_EK_metagenomes/count_mmseqs.slurm:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH --job-name=CountMMSeqs
 3 | #SBATCH --time=0-10
 4 | #SBATCH --ntasks=1
 5 | #SBATCH --cpus-per-task=1
 6 | #SBATCH --mem=64G
 7 | #SBATCH -o count_mmseqs-%j.out
 8 | #SBATCH -e count_mmseqs-%j.err
 9 | #SBATCH --partition=short
10 | 
11 | eval "$(conda shell.bash hook)"
12 | conda activate bioinformatics
13 | 
14 | perl ~/GitHubs/EdwardsLab/process_EK_metagenomes/count_mmseqs.pl -d mmseqs 
15 | 
16 | 


--------------------------------------------------------------------------------
/process_EK_metagenomes/count_sharks.slurm:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH --job-name=count_sharks
 3 | #SBATCH --time=5-0
 4 | #SBATCH --ntasks=1
 5 | #SBATCH --cpus-per-task=1
 6 | #SBATCH --mem=128G
 7 | #SBATCH -o count_sharks-%j.out
 8 | #SBATCH -e count_sharks-%j.err
 9 | 
10 | for F in no_sharks/*; do echo -ne "$F\t"; gunzip -c $F  | perl -ne 'if (/^@/) {$s{$_}++} END {print scalar(keys %s), "\n"}'; done > count_nosharks.txt &
11 | for F in    sharks/*; do echo -ne "$F\t"; gunzip -c $F  | perl -ne 'if (/^@/) {$s{$_}++} END {print scalar(keys %s), "\n"}'; done > count_sharks.txt &
12 | 
13 | wait;
14 | 
15 | 


--------------------------------------------------------------------------------
/process_EK_metagenomes/count_subsystems.slurm:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH --job-name=CountSS
 3 | #SBATCH --time=0-1
 4 | #SBATCH --ntasks=1
 5 | #SBATCH --cpus-per-task=1
 6 | #SBATCH --mem=64G
 7 | #SBATCH --partition=short
 8 | #SBATCH -o count_ss-%j.out
 9 | #SBATCH -e count_ss-%j.err
10 | 
11 | perl /home/edwa0468/GitHubs/EdwardsLab/process_EK_metagenomes/count_subsystems.pl -d mmseqs
12 | 


--------------------------------------------------------------------------------
/process_EK_metagenomes/megahit_submit.slurm:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH --job-name=megahit_submit
 3 | #SBATCH --time=5-0
 4 | #SBATCH --ntasks=1
 5 | #SBATCH --cpus-per-task=1
 6 | #SBATCH --mem=4G
 7 | #SBATCH -o megahit_submit-%j.out
 8 | #SBATCH -e megahit_submit-%j.err
 9 | 
10 | set -euo pipefail
11 | eval "$(conda shell.bash hook)"
12 | conda activate bioinformatics
13 | 
14 | /home/edwa0468/slurm/stats.sh
15 | 
16 | 
17 | mkdir --parents megahit
18 | for R1 in $(cat R1_reads.txt); do
19 | 	R2=${R1/R1/R2};
20 | 	#FILEEND="_R1_001.fastq.gz";
21 | 	FILEEND="_R1.fastq.gz";
22 | 	O=${R1/$FILEEND/};
23 | 
24 | 
25 | 	if [[ ! -e megahit/$O ]]; then
26 | 		sbatch ~/GitHubs/EdwardsLab/process_EK_metagenomes/megahit.slurm no_sharks/$R1 no_sharks/$R2 megahit/$O
27 | 	fi;
28 | done
29 | 


--------------------------------------------------------------------------------
/process_EK_metagenomes/mmseqs_add_subsystems.slurm:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH --job-name=mmseqs_ss
 3 | #SBATCH --time=0-10
 4 | #SBATCH --ntasks=1
 5 | #SBATCH --cpus-per-task=2
 6 | #SBATCH --mem=16G
 7 | #SBATCH -o mmseqs_ss-%A_%a.out
 8 | #SBATCH -e mmseqs_ss-%A_%a.err
 9 | 
10 | eval "$(conda shell.bash hook)"
11 | conda activate bioinformatics
12 | 
13 | if [[ ! -e R1_reads.txt ]]; then 
14 | 	echo "Please make a file with the R1 reads using this command:" >&2
15 | 	echo "find fastq -name \*R1\* -printf "%f\n" > R1_reads.txt" >&2;
16 | 	exit 2;
17 | fi
18 | 
19 | cp /home/edwa0468/UniRef/uniref.sqlite $BGFS
20 | R1=$(head -n $SLURM_ARRAY_TASK_ID R1_reads.txt | tail -n 1)
21 | FILEEND="_R1.fastq.gz";
22 | #FILEEND="_R1_001.fastq.gz";
23 | BASE=${R1/$FILEEND/}
24 | 
25 | python ~/GitHubs/EdwardsLab/mmseqs/easy_taxonomy_to_function.py -f mmseqs/$BASE/${BASE}_tophit_report.gz -d $BGFS/uniref.sqlite | gzip -c >  mmseqs/$BASE/${BASE}_tophit_report_subsystems.gz
26 | 


--------------------------------------------------------------------------------
/process_EK_metagenomes/mmseqs_easy_taxonomy_submit.slurm:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH --job-name=mmseqs_submit
 3 | #SBATCH --time=5-0
 4 | #SBATCH --ntasks=1
 5 | #SBATCH --cpus-per-task=1
 6 | #SBATCH --mem=2G
 7 | #SBATCH -o mmseqs_submit-%j.out
 8 | #SBATCH -e mmseqs_submit-%j.err
 9 | 
10 | # this is so we can submit it as a slurm job :)
11 | 
12 | bash process_EK_metagenomes/mmseqs_easy_taxonomy_submit.sh UniRef50 mmseqs fasta
13 | 
14 | 


--------------------------------------------------------------------------------
/process_EK_metagenomes/vamb_create_fasta.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import argparse
 3 | import vamb
 4 | 
# Command line front end to vamb.vambtools.write_bins(): the clusters file and
# the FASTA file are opened with vamb's own readers and handed to write_bins()
# together with the output directory and the minimum bin size.
parser = argparse.ArgumentParser(
    description="""Command-line bin creator.
Will read the entire content of the FASTA file into memory - beware.""",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    # no -h/--help option is registered; help is only shown by the check below
    add_help=False,
)

# all four arguments are positional and must be given in this order
parser.add_argument("fastapath", help="Path to FASTA file")
parser.add_argument("clusterspath", help="Path to clusters.tsv")
# NOTE(review): no nargs is given, so this positional is required and argparse
# never falls back to default=0 -- confirm whether it was meant to be optional
parser.add_argument("minsize", help="Minimum size of bin", type=int, default=0)
parser.add_argument("outdir", help="Directory to create")

# called with no arguments at all: show the help and stop
if len(sys.argv) == 1:
    parser.print_help()
    sys.exit()

args = parser.parse_args()

with open(args.clusterspath) as file:
    clusters = vamb.vambtools.read_clusters(file)

with vamb.vambtools.Reader(args.fastapath) as file:
    vamb.vambtools.write_bins(
        args.outdir, clusters, file, maxbins=None, minsize=args.minsize
    )
30 | 


--------------------------------------------------------------------------------
/process_JCJ_metagenomes/megahit_submit.sh:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | mkdir --parents megahit
 4 | for R1 in $(cat R1_reads.txt); do
 5 | 	R2=${R1/R1/R2};
 6 | 	FILEEND="_R1_001.fastq.gz";
 7 | 	O=${R1/$FILEEND/};
 8 | 
 9 | 
10 | 	if [[ ! -e megahit/$O ]]; then
11 | 		sbatch ~/GitHubs/EdwardsLab/process_JCJ_metagenomes/megahit.slurm no_human/$R1 no_human/$R2 megahit/$O
12 | 	fi;
13 | done
14 | 


--------------------------------------------------------------------------------
/process_JCJ_metagenomes/vamb_create_fasta.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import argparse
 3 | import vamb
 4 | 
# Command line front end to vamb.vambtools.write_bins(): the clusters file and
# the FASTA file are opened with vamb's own readers and handed to write_bins()
# together with the output directory and the minimum bin size.
parser = argparse.ArgumentParser(
    description="""Command-line bin creator.
Will read the entire content of the FASTA file into memory - beware.""",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    # no -h/--help option is registered; help is only shown by the check below
    add_help=False,
)

# all four arguments are positional and must be given in this order
parser.add_argument("fastapath", help="Path to FASTA file")
parser.add_argument("clusterspath", help="Path to clusters.tsv")
# NOTE(review): no nargs is given, so this positional is required and argparse
# never falls back to default=0 -- confirm whether it was meant to be optional
parser.add_argument("minsize", help="Minimum size of bin", type=int, default=0)
parser.add_argument("outdir", help="Directory to create")

# called with no arguments at all: show the help and stop
if len(sys.argv) == 1:
    parser.print_help()
    sys.exit()

args = parser.parse_args()

with open(args.clusterspath) as file:
    clusters = vamb.vambtools.read_clusters(file)

with vamb.vambtools.Reader(args.fastapath) as file:
    vamb.vambtools.write_bins(
        args.outdir, clusters, file, maxbins=None, minsize=args.minsize
    )
30 | 


--------------------------------------------------------------------------------
/process_metagenomes/mmseqs_add_subsystems.slurm:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH --job-name=mmseqs_ss
 3 | #SBATCH --time=0-10
 4 | #SBATCH --ntasks=1
 5 | #SBATCH --cpus-per-task=2
 6 | #SBATCH --mem=16G
 7 | #SBATCH -o mmseqs_ss-%A_%a.out
 8 | #SBATCH -e mmseqs_ss-%A_%a.err
 9 | 
10 | eval "$(conda shell.bash hook)"
11 | conda activate bioinformatics
12 | 
13 | if [[ ! -e R1_reads.txt ]]; then 
14 | 	echo "Please make a file with the R1 reads using this command:" >&2
15 | 	echo "find fastq -name \*R1\* -printf "%f\n" > R1_reads.txt" >&2;
16 | 	exit 2;
17 | fi
18 | 
19 | 
20 | if [[ ! -e DEFINITIONS.sh ]]; then
21 | 	echo "Please create a DEFINITIONS.sh file with SOURCE, FILEEND, HOSTREMOVED" >&2
22 | 	exit 2;
23 | fi
24 | 
25 | source DEFINITIONS.sh
26 | 
27 | 
28 | cp /home/edwa0468/UniRef/uniref.sqlite $BGFS
29 | 
30 | R1=$(head -n $SLURM_ARRAY_TASK_ID R1_reads.txt | tail -n 1)
31 | BASE=${R1/$FILEEND/}
32 | 
33 | python ~/GitHubs/EdwardsLab/mmseqs/easy_taxonomy_to_function.py -f mmseqs/$BASE/${BASE}_tophit_report.gz -d $BGFS/uniref.sqlite | gzip -c >  mmseqs/$BASE/${BASE}_tophit_report_subsystems.gz
34 | 


--------------------------------------------------------------------------------
/process_metagenomes/mmseqs_taxonomy.slurm:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH --job-name=mmTax
 3 | #SBATCH --time=0-1
 4 | #SBATCH --ntasks=1
 5 | #SBATCH --cpus-per-task=1
 6 | #SBATCH --mem=12G
 7 | #SBATCH --partition=short
 8 | #SBATCH -o mmtax-%j.out
 9 | #SBATCH -e mmtax-%j.err
10 | 
11 | 
12 | set -euo pipefail
13 | 
14 | python /home/edwa0468/GitHubs/EdwardsLab/taxon/mmseqs_report_to_table.py -d mmseqs -o mmseqs_taxonomy -v
15 | 


--------------------------------------------------------------------------------
/process_metagenomes/vamb_concat.slurm:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH --job-name=VambConcat
 3 | #SBATCH --time=0-1
 4 | #SBATCH --ntasks=1
 5 | #SBATCH --cpus-per-task=1
 6 | #SBATCH --mem=32G
 7 | #SBATCH --partition=short
 8 | #SBATCH -o vamb_concat-%j.out
 9 | #SBATCH -e vamb_concat-%j.err
10 | 
11 | eval "$(conda shell.bash hook)"
12 | conda activate vamb
13 | 
14 | 
15 | mkdir -p vamb
16 | python /home/edwa0468/GitHubs/EdwardsLab/process_EK_metagenomes/vamb_concatenate.py vamb/contigs.fna.gz megahit/*/output/final.contigs.fa
17 | 
18 | 


--------------------------------------------------------------------------------
/prophages/run_phispy_snakemakes.sh:
--------------------------------------------------------------------------------
 1 | #  shell script so I can run all the snakemakes!
 2 | 
 3 | WD=$PWD
 4 | cd phispy_metrics
 5 | echo "Running phispy in phispy_metrics"
 6 | snakemake -s phispy_metrics.snakefile -j 12
 7 | snakemake -s phispy_no_metrics.snakefile -j 12
 8 | python3 summarize.py
 9 | cd $WD
10 | 
11 | cd phispy_tests
12 | echo "Running phispy in phispy_tests"
13 | snakemake -s /home3/redwards/GitHubs/EdwardsLab/prophages/phispy_training_vs_test.snakefile -j 12
14 | cd $WD
15 | 
16 | 
17 | cd phispy_training_set
18 | echo "Running phispy in phispy_training_set";
19 | snakemake -s /home3/redwards/GitHubs/EdwardsLab/prophages/phispy_with_training.snakefile -j 12
20 | cd $WD
21 | 
22 | cd phispy_phage_genes
23 | echo "Running phispy in phispy_phage_genes"
24 | snakemake -s /home3/redwards/GitHubs/EdwardsLab/prophages/phispy_phage_genes.snakefile -j 12
25 | cd $WD
26 | 
27 | 
28 | cd PhiSpy_SN
29 | echo "Running phispy in phispy_SN"
30 | snakemake -s /home3/redwards/GitHubs/EdwardsLab/snakemake/phispy.snakefile -j 12
31 | cd $WD
32 | 
33 | 


--------------------------------------------------------------------------------
/prophages/run_virsorter.snakefile:
--------------------------------------------------------------------------------
 1 | """
 2 | Run virsorter on a genbank file
 3 | 
 4 | """
 5 | 
 6 | 
 7 | import os
 8 | import sys
 9 | 
10 | 
11 | GBDIR = "genbank"
12 | FADIR = "fasta"
13 | 
14 | SAMPLES, =  glob_wildcards(os.path.join(GBDIR, '{sample}.gbf'))
15 | 
16 | 
17 | 
# Default target: one FADIR/{sample}.fna for every {sample}.gbf that
# glob_wildcards found in GBDIR. No virsorter output is requested here.
rule all:
    input:
        expand(os.path.join(FADIR, "{sample}.fna"), sample=SAMPLES)
21 | 
22 | 
# Convert GBDIR/{sample}.gbf to FADIR/{sample}.fna by redirecting the standard
# output of any2fasta into the output file.
rule genbank2fasta:
    input:
        os.path.join(GBDIR, "{sample}.gbf")
    output:
        os.path.join(FADIR, "{sample}.fna")
    shell:
        "any2fasta {input} > {output}"
30 | 
31 | 
# NOTE(review): this rule is unfinished. The output expression below is never
# closed, the rule has no shell/run section, and VIRDIR is not defined anywhere
# in this file, so the snakefile cannot be parsed as it stands.
rule run_virsorter:
    input:
        os.path.join(FADIR, "{sample}.fna")
    conda:
        "virsorter"
    output:
        os.path.join(VIRDIR, "{sample}", 
39 | 
40 | 
41 | 
42 | 
43 | 


--------------------------------------------------------------------------------
/proxymeta/README.md:
--------------------------------------------------------------------------------
 1 | # Proxymeta
 2 | 
These scripts are for analyzing the Hi-C data provided by the paper [_Hi-C deconvolution of a human gut microbiome yields high-quality draft genomes and reveals plasmid-genome interactions_](https://www.biorxiv.org/content/biorxiv/early/2017/10/05/198713.full.pdf).
 5 | 
 6 | They have [three samples](https://www.ncbi.nlm.nih.gov/Traces/study/?acc=SAMN07736353)
 7 | 
 8 | SRA ID | Library name | Type | Experiment | Library Selection | MBp | Reads
 9 | ---|---|---|---|---|---|---
10 | SRR6131124 | AD007 | OTHER | SRX3243492 | Restriction Digest | 12,019 | 5,602
11 | SRR6131123 | AD002 | WGS | SRX3243493 | RANDOM | 36,128 | 15,610  
12 | SRR6131122 | AD012 | OTHER | SRX3243494 | Restriction Digest | 12,728 | 5,584
13 | 
14 | As usual, they probably do neither what you expect nor what you need! 
15 | 
16 | 
17 | 


--------------------------------------------------------------------------------
/pymol/draw_images.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Draw images from a directory of PDB files
 3 | """
 4 | 
 5 | import os
 6 | import sys
 7 | import argparse
 8 | import pymol
 9 | 
10 | 
11 | __author__ = 'Rob Edwards'
12 | 
13 | if __name__ == "__main__":
14 |     parser = argparse.ArgumentParser(description=' ')
15 |     parser.add_argument('-d', help='input directory', required=True)
16 |     parser.add_argument('-o', help='output directory', required=True)
17 |     parser.add_argument('-v', help='verbose output', action='store_true')
18 |     args = parser.parse_args()
19 | 
20 |     for f in os.listdir(args.d):
21 |         


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | PhiSpy
 2 | bcbio-gff>=0.6.6
 3 | biopython
 4 | bs4
 5 | crc64iso
 6 | datrie
 7 | h5py
 8 | ipython
 9 | jupyter
10 | lxml
11 | matplotlib
12 | natsort
13 | networkx
14 | numpy
15 | openpyxl
16 | pandas
17 | plotly
18 | pybtex
19 | pymongo
20 | pyreadr
21 | pysam
22 | pytaxonkit
23 | python-dateutil
24 | pytz
25 | requests
26 | scikit-bio
27 | scikit-learn
28 | scipy
29 | seaborn
30 | sklearn
31 | suffix_trees
32 | tk
33 | xmltodict
34 | 


--------------------------------------------------------------------------------
/requirements_mini.txt:
--------------------------------------------------------------------------------
 1 | bcbio-gff>=0.6.6
 2 | bs4
 3 | crc64iso
 4 | datrie
 5 | h5py
 6 | ipython
 7 | jupyter
 8 | lxml
 9 | matplotlib
10 | networkx
11 | numpy
12 | openpyxl
13 | pandas
14 | python-dateutil
15 | pytz
16 | requests
17 | scikit-bio
18 | scikit-learn
19 | scipy
20 | seaborn
21 | 


--------------------------------------------------------------------------------
/rob_tests/test_stream_pair.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Test streaming pairs of sequences but not a unit (nose) test. Sorry
 3 | """
 4 | 
 5 | import os
 6 | import sys
 7 | import argparse
 8 | 
 9 | from roblib import bcolors
10 | from roblib import stream_paired_fastq
11 | 
12 | 
13 | 
if __name__ == '__main__':
    # -l and -r are the R1 and R2 fastq files; both are required
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-l', help='R1 file', required=True)
    parser.add_argument('-r', help='R2 file', required=True)
    # NOTE(review): -v is parsed but never used below
    parser.add_argument('-v', help='verbose output', action='store_true')
    args = parser.parse_args()


    # each item from stream_paired_fastq is unpacked into seven values; only
    # the two header values (h1, h2) are printed, side by side
    for seqid, h1, s1, q1, h2, s2, q2 in stream_paired_fastq(args.l, args.r):
        print(f"{h1} :: {h2}")


--------------------------------------------------------------------------------
/roblib/bcolors.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Colors that you can import and make the text look pretty
 3 | 
 4 | Source: https://stackoverflow.com/questions/287871/print-in-terminal-with-colors
 5 | """
 6 | 
 7 | __author__ = 'Rob Edwards'
 8 | 
 9 | 
class bcolors(object):
    """
    Terminal escape sequences, available both by colour name and by role.
    """

    # colours
    PINK = '\033[95m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    WHITE = '\033[0m'

    # text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # the same sequences again, named after the role they play in a message
    HEADER = PINK
    OKBLUE = BLUE
    OKGREEN = GREEN
    WARNING = YELLOW
    FAIL = RED
    ENDC = WHITE
28 | 
29 | 


--------------------------------------------------------------------------------
/roblib/files.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Helper functions for files
 3 | """
 4 | import binascii
 5 | 
 6 | 
 7 | def is_gzip(filename: str) -> bool:
 8 |     """
 9 |     Is this a gzip file?
10 |     """
11 | 
12 |     """
13 |     This is an elegant solution to test whether a file is gzipped by reading the first two characters.
14 |     I also use a version of this in fastq_pair if you want a C version :)
15 |     See https://stackoverflow.com/questions/3703276/how-to-tell-if-a-file-is-gzip-compressed for inspiration
16 |     :param f: the file to test
17 |     :return: True if the file is gzip compressed else false
18 |     """
19 |     with open(filename, 'rb') as i:
20 |         return binascii.hexlify(i.read(2)) == b'1f8b'
21 | 


--------------------------------------------------------------------------------
/roblib/rob_error.py:
--------------------------------------------------------------------------------
 1 | """
 2 | An Error Class so I can write my own errors
 3 | """
 4 | class Error(Exception):
 5 |     """
 6 |     Base class for exceptions in this module.
 7 |     """
 8 |     pass
 9 | 
10 | class SequencePairError(Error):
11 |     """
12 |     Exception raised for sequences not being paired properly.
13 | 
14 |     :param message: explanation of the error
15 |     """
16 | 
17 |     def __init__(self, message):
18 |         self.message = message
19 |         super().__init__(self.message)
20 | 
21 | class FastqFormatError(Error):
22 |     """
23 |     Exception raised for sequences not being paired properly.
24 | 
25 |     :param message: explanation of the error
26 |     """
27 | 
28 |     def __init__(self, message):
29 |         self.message = message
30 |         super().__init__(self.message)
31 | 
32 | class ColorNotFoundError(Error):
33 |     """
34 |     Exception raised for a color not being found.
35 | 
36 |     :param message: explanation of the error
37 |     """
38 | 
39 |     def __init__(self, message):
40 |         self.message = message
41 | 


--------------------------------------------------------------------------------
/roblib/strings.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | import string
 4 | __author__ = 'Rob Edwards'
 5 | 
 6 | 
 7 | 
 8 | 
 9 | def ascii_clean(s):
10 |     """Remove non-ascii characters from a string"""
11 |     return filter(lambda x: x in string.printable, s)
12 | 
13 | 


--------------------------------------------------------------------------------
/roblib_tk/__init__.py:
--------------------------------------------------------------------------------
1 | from .file_chooser import choose_a_file, write_a_file
2 | 
3 | __all__ = [
4 |     'choose_a_file', 'write_a_file'
5 | ]


--------------------------------------------------------------------------------
/roblib_tk/file_chooser.py:
--------------------------------------------------------------------------------
 1 | """
 2 | choose a file. can use this if one is not provided
 3 | """
 4 | import tkinter as tk
 5 | from tkinter import filedialog
 6 | 
 7 | def choose_a_file(dialog_title="Choose a file..."):
 8 | 
 9 |     root = tk.Tk()
10 | 
11 |     filetypes = (
12 |         ('Text files', '*.TXT'),
13 |         ('All files', '*.*'),
14 |     )
15 | 
16 |     filename = tk.filedialog.askopenfilename(
17 |         title=dialog_title,
18 |         filetypes=filetypes,
19 |     )
20 |     root.destroy()
21 | 
22 |     return filename
23 | 
24 | 
def write_a_file(dialog_title="Choose where to save the file..."):
    """
    Ask the user where to save a file with a Tk "save as" dialog.

    :param dialog_title: the title to show on the dialog
    :return: the value returned by asksaveasfilename for the user's choice
    """
    filetypes = (
        ('TSV files', '*.TSV'),
        ('XLS files', '*.XLS'),
        ('All files', '*.*'),
    )

    # create and destroy our own root window, as choose_a_file does, so that
    # no window is left behind once the dialog has closed
    root = tk.Tk()
    try:
        filename = tk.filedialog.asksaveasfilename(
            # use the caller's title; it was previously ignored
            title=dialog_title,
            filetypes=filetypes,
            defaultextension=".tsv",
        )
    finally:
        root.destroy()

    return filename


--------------------------------------------------------------------------------
/searchSRA/envs/samtools.yaml:
--------------------------------------------------------------------------------
1 | name: samtools
2 | channels:
3 |     - bioconda
4 | dependencies:
5 |     - samtools
6 | 


--------------------------------------------------------------------------------
/searchSRA/searchSRA_abstracts.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:431d7588717cfc6c514a337f127e101f31c9200c13615f310e097695ebf72d5f
3 | size 2715326
4 | 


--------------------------------------------------------------------------------
/seed_servers/RAST-alljobs.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl -w
 2 | #
 3 | 
 4 | use strict;
 5 | use Data::Dumper;
 6 | $ENV{SAS_SERVER}="PUBSEED";
 7 | print STDERR "SAS is $ENV{SAS_SERVER}\n";
 8 | use Term::ReadKey;
 9 | use RASTserver;
10 | 
11 | ## Use RAST test, not regular RAST
12 | # Now using regular RAST
13 | 
14 | 
15 | print "Please enter your RAST username:  ";
16 | my $user = ReadLine(0);
17 | chomp $user;
18 | 
19 | print "Please enter your RAST password:  ";
20 | ReadMode 2;
21 | my $password = ReadLine(0);
22 | chomp $password;
23 | ReadMode 1;
24 | print "\n";
25 | 
26 | 
27 | my $rast=new RASTserver($user, $password);
28 | unless (defined $rast) {die "Can't connect ot the rast server"}
29 | 
30 | print Dumper($rast->jobs());
31 | 


--------------------------------------------------------------------------------
/seed_servers/RAST-jobs.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl -w
 2 | #
 3 | #
 4 | 
 5 | use strict;
 6 | use RASTserver;
 7 | use Term::ReadKey;
 8 | use Data::Dumper;
 9 | $ENV{SAS_SERVER}="PUBSEED";
10 | 
11 | print "Please enter your RAST username:  ";
12 | my $user = ReadLine(0);
13 | chomp $user;
14 | 
15 | print "Please enter your RAST password:  ";
16 | ReadMode 2;
17 | my $password = ReadLine(0);
18 | chomp $password;
19 | ReadMode 1;
20 | print "\n";
21 | 
22 | my $rast=new RASTserver($user, $password);
23 | unless (defined $rast) {die "Can't connect ot the rast server"}
24 | 
25 | my $time = time; my $job = 0;
26 | my @jobs = $rast->jobs();
27 | 
28 | foreach my $j (@jobs) {
29 | 	print Dumper($j);
30 | 	print STDERR $job++, " : ", ($time-time), " seconds\n";
31 | }
32 | 


--------------------------------------------------------------------------------
/seed_servers/RAST-status.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | 
 3 | use strict;
 4 | use RASTserver;
 5 | $ENV{SAS_SERVER}="SEED";
 6 | use Term::ReadKey;
 7 | 
 8 | print "Please enter your RAST username:  ";
 9 | my $user = ReadLine(0);
10 | chomp $user;
11 | 
12 | print "Please enter your RAST password:  ";
13 | ReadMode 2;
14 | my $password = ReadLine(0);
15 | chomp $password;
16 | ReadMode 1;
17 | print "\n";
18 | 
19 | 
20 | my $rast=new RASTserver($user, $password);
21 | unless (defined $rast) {die "Can't connect ot the rast server"}
22 | 
23 | 
24 | 
25 | 
26 | 
27 | die "$0 <list of jobs>" unless (defined $ARGV[0]);
28 | my $stat = $rast->status_of_RAST_job({-job => \@ARGV});
29 | 
30 | foreach my $job (sort {$a <=> $b} keys %$stat) {
31 | 	print join("\t", $job, $stat->{$job}->{'status'}), "\n";
32 | }
33 | 


--------------------------------------------------------------------------------
/seed_servers/test_occ_roles.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Test the conversion of roles to pegs
 3 | """
 4 | 
 5 | import os
 6 | import sys
 7 | 
 8 | import argparse
 9 | from servers.SAP import SAPserver
10 | 
def occ_to_roles(roles):
    """
    Look up the given roles with the SAP server's occ_of_role method.

    :param roles: the roles to look up, passed to the server as the '-roles' argument
    :return: whatever occ_of_role returns for those roles
    """

    query = {'-roles' : roles}
    return SAPserver().occ_of_role(query)
24 | 
if __name__ == '__main__':
    # two example roles to look up
    roles = [
        'PTS system, N-acetylglucosamine-specific IIB component (EC 2.7.1.69)',
        'Glycerol-3-phosphate dehydrogenase [NAD+] (EC 1.1.1.8)',
    ]
    res = occ_to_roles(roles)
    # one entry per key: the key, a tab, and its values joined with new lines
    for r in res:
        values = "\n".join(res[r])
        print(r + "\t" + values)


--------------------------------------------------------------------------------
/snakemake/abricate.snakefile:
--------------------------------------------------------------------------------
 1 | """
 2 | Run [abricate](https://github.com/tseemann/abricate) with all 
 3 | options on a directory of sequence files
 4 | """
 5 | 
 6 | import os
 7 | import sys
 8 | import subprocess
 9 | 
configfile: "abricate.yaml"


indir = config['seq_files']
outdir = config['abricate_output']

# read the known databases from abricate
#
# The output of `abricate --list` is parsed as tab separated text: the first
# column of each line is taken as the database name, and the line whose first
# column is "DATABASE" (the header) is skipped. subprocess.run waits for the
# command to finish, and check=True stops the workflow with a clear error if
# abricate fails instead of silently continuing with no databases.
abricate_list = subprocess.run(
    ['abricate', '--list'],
    stdout=subprocess.PIPE,
    encoding='utf-8',
    check=True,
)
databases = []
for line in abricate_list.stdout.splitlines():
    name = line.strip().split("\t")[0]
    # ignore blank lines as well as the header
    if name and name != "DATABASE":
        databases.append(name)

SEQS, = glob_wildcards(os.path.join(indir, '{seq}'))
22 | 
23 | 
# Default target: one abricate output file for every sequence file in SEQS
# combined with every database in databases.
rule all:
    input:
        expand(os.path.join(outdir, "{sample}.{db}.abricate.tsv"), sample=SEQS, db=databases)
27 | 
# Run abricate on one sequence file against one database and redirect its
# output to {sample}.{db}.abricate.tsv in the output directory.
rule abricate:
    input:
        os.path.join(indir, "{sample}")
    output:
        os.path.join(outdir, "{sample}.{db}.abricate.tsv")
    params:
        # the db wildcard from the output file name, used as {params.db} in the command
        db = "{db}"
    shell:
        "abricate --noheader --nopath  --db {params.db} {input} > {output}"
37 | 


--------------------------------------------------------------------------------
/snakemake/annotate_phages.snakefile:
--------------------------------------------------------------------------------
 1 | """
 2 | Snakefile to upload all the genomes to PATRIC
 3 | """
 4 | 
 5 | 
# where is the data
# both locations are read from the snakemake config ('fasta' and 'output' keys)
FASTADIR = config['fasta']
OUTPUTDIR = config['output']


# one {fasta} value for every file in FASTADIR whose name ends .fasta
FASTA, = glob_wildcards(os.path.join(FASTADIR, '{fasta}.fasta'))
12 | 
# NOTE(review): no input targets are listed here and no other rule is defined
# in this file, so the workflow looks unfinished - confirm what it should build.
rule all:
    input:
15 |         
16 | 
17 | 


--------------------------------------------------------------------------------
/snakemake/cluster_phages.snakefile:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | 
 4 | import os
 5 | 
# directory names used by the rules below: genbank input, protein output, nucleotide output
gbk = "phage_100_genbank"
faa = "proteins"
fna = "nucleotides"
# one {phage} value for every file in the genbank directory named <phage>_phage.gbk
PHAGES, = glob_wildcards(os.path.join(gbk, '{phage}_phage.gbk'))
10 | 
11 | 
# Default target: the .fna file for every phage. Only the .fna files are
# requested here; gbk2faa declares the .faa file as an output of the same job.
rule all:
    input:
        expand(os.path.join(fna, '{phage}.fna'), phage=PHAGES)
15 | 
# Run genbank2sequences.py on one genbank file, passing the .faa path with -a
# and the .fna path with -n.
# NOTE(review): the script path is hard coded to ~/GitHubs/EdwardsLab/bin, so
# this only runs where the repository is checked out in that location.
rule gbk2faa:
    input:
        os.path.join(gbk, '{phage}_phage.gbk')
    output:
        faa = os.path.join(faa, '{phage}.faa'),
        fna = os.path.join(fna, '{phage}.fna')
    shell:
        """
        python3 ~/GitHubs/EdwardsLab/bin/genbank2sequences.py -g {input} -a {output.faa} -n {output.fna} -c
        """
26 | 
27 | 


--------------------------------------------------------------------------------
/snakemake/envs/bowtie.yaml:
--------------------------------------------------------------------------------
 1 | name: bowtie
 2 | channels:
 3 |     - conda-forge
 4 |     - bioconda
 5 |     - default
 6 | dependencies:
 7 |     # There is an issue with newer tbb libraries. This might go away at some point: https://www.biostars.org/p/494922/
 8 |     - tbb=2020.2
 9 |     - bowtie2
10 |     - samtools
11 | 


--------------------------------------------------------------------------------
/snakemake/envs/canu.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 |         - conda-forge
3 |         - bioconda
4 | dependencies:
5 |     - canu
6 | 


--------------------------------------------------------------------------------
/snakemake/envs/filtlong.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 |         - conda-forge
3 |         - bioconda
4 | dependencies:
5 |         - filtlong
6 | 


--------------------------------------------------------------------------------
/snakemake/envs/flye.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 |         - conda-forge
3 |         - bioconda
4 | dependencies:
5 |         - flye
6 | 


--------------------------------------------------------------------------------
/snakemake/envs/focus.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 |     - bioconda
3 | dependencies:
4 |     - focus
5 | 


--------------------------------------------------------------------------------
/snakemake/envs/kraken.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 |     - bioconda
3 | dependencies:
4 |     - kraken2
5 | 


--------------------------------------------------------------------------------
/snakemake/envs/megahit.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 |     - bioconda
3 | dependencies:
4 |     - megahit
5 | 


--------------------------------------------------------------------------------
/snakemake/envs/miniasmminipolish.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 |         - conda-forge
3 |         - bioconda
4 | dependencies:
5 |         - miniasm
6 |         - minipolish
7 |         - any2fasta
8 | 


--------------------------------------------------------------------------------
/snakemake/envs/minimap.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 |     - bioconda
3 | dependencies:
4 |     - minimap2
5 | 


--------------------------------------------------------------------------------
/snakemake/envs/prinseq.yaml:
--------------------------------------------------------------------------------
1 | name: prinseq-plus-plus
2 | channels:
3 |     - conda-forge
4 |     - bioconda
5 |     - default
6 | dependencies:
7 |     - prinseq-plus-plus
8 | 


--------------------------------------------------------------------------------
/snakemake/envs/raven.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 |         - conda-forge
3 |         - bioconda
4 | dependencies:
5 |     - raven-assembler
6 | 


--------------------------------------------------------------------------------
/snakemake/envs/seqtk.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 |     - bioconda
3 | dependencies:
4 |     - seqtk
5 | 


--------------------------------------------------------------------------------
/snakemake/envs/superfocus.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 |     - bioconda
3 | dependencies:
4 |     - super-focus
5 | 


--------------------------------------------------------------------------------
/snakemake/envs/trycycler.yaml:
--------------------------------------------------------------------------------
 1 | channels:
 2 |         - conda-forge
 3 |         - bioconda
 4 | dependencies:
 5 |         - miniasm
 6 |         - mash
 7 |         - minimap2
 8 |         - muscle
 9 |         - r-ape
10 |         - r-phangorn
11 | 


--------------------------------------------------------------------------------
/snakemake/phispy.yaml:
--------------------------------------------------------------------------------
1 | directories:
2 |         fasta_files: fasta
3 |         genbank_files: genbank
4 |         phispy_files: phispy
5 |         gto_files : gto
6 | number_of_genomes : 200
7 | 
8 | 


--------------------------------------------------------------------------------
/snakemake/process_metagenomes.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "tatabox_executables" : {
 3 |         "assembler" : "/usr/local/genome/megahit/current/bin/megahit",
 4 |         "mmseqs"  : "/usr/local/genome/mmseqs2/mmseqs/bin/mmseqs"
 5 |     },
 6 |     "anth_executables" : {
 7 |         "assembler" : "/home3/redwards/opt/megahit/current/bin/megahit",
 8 |         "mmseqs"  : "/home3/redwards/opt/mmseqs/current/bin/mmseqs",
 9 |         "bowtie2-build" : "/usr/local/bowtie2/bin/bowtie2-build",
10 |         "bowtie2" : "/usr/local/bowtie2/bin/bowtie2"
11 |     },
12 |     "directories" : {
13 |         "Reads" : "fastq",
14 |         "round1_assembly_output" : "assembly.1",
15 |         "round1_contig_read_mapping" : "reads.contigs.1",
16 |         "round2_unassembled_reads" : "unassembled_reads",
17 |         "round2_assembly_output" : "reassembled_reads",
18 |         "combined_contig_merging" : "final.combined_contigs"
19 |     },
20 |     "threads" : 8
21 | }
22 | 
23 | 


--------------------------------------------------------------------------------
/snakemake_tests/test.snakefile:
--------------------------------------------------------------------------------
 1 | """
 2 | 
 3 | A simple test
 4 | 
 5 | """
 6 | 
 7 | VAR = "hello world"
 8 | 
# Default target: the single file written by rule lsd.
rule all:
    input:
        "ls.txt"
12 | 
# Write the value of the module-level VAR to ls.txt with echo.
# The input is declared, but the active command does not read it; only the
# commented-out /bin/ls command used it.
rule lsd:
    input:
        "bac_giant_unique_species"
    output:
        "ls.txt"
    shell:
        #"/bin/ls {input} > {output}"
        "echo {VAR} > {output}"
21 | 


--------------------------------------------------------------------------------
/sra/SRA.partie.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:f8f55f7174d6960c8d182661c5c1346a053f12d594c8c9b8b8445377872e9607
3 | size 5169243
4 | 


--------------------------------------------------------------------------------
/sra/run_accession-experiment_lib.tsv.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:558b0ffdef5b29a3b9d0f130e5085f89c18c99b590ec6538e6ce422c0999be2b
3 | size 5365649
4 | 


--------------------------------------------------------------------------------
/taxon/Error.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Custom exceptions for taxnomy parsing
 3 | """
 4 | 
 5 | class Error(Exception):
 6 |     """Base class for other exceptions"""
 7 |     pass
 8 | 
 9 | 
10 | class EntryNotInDatabaseError(Exception):
11 |     """Entry not in the db. Obvs"""
12 | 
13 |     def __init__(self, message):
14 |         self.message = message
15 | 
16 | 
17 | class NoNameFoundError(Exception):
18 |     """No name was found for this entry"""
19 |     def __init__(self, message):
20 |         self.message = message


--------------------------------------------------------------------------------
/taxon/__init__.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | 
 4 | __author__ = 'Rob Edwards'
 5 | from .taxon import read_taxa, read_nodes, extended_names, read_names, read_divisions, read_gi_tax_id, read_tax_id_gi
 6 | from .config import get_db_dir
 7 | from .load_from_database import get_taxonomy_db, get_taxonomy, connect_to_db, get_taxid_for_name, taxonomy_hierarchy_as_list
 8 | from .load_from_database import all_ids, taxonomy_hierarchy, all_species_ids, taxonomy_ids_as_list, acc_to_taxonomy
 9 | from .taxonomy import TaxonNode, TaxonName, TaxonDivision
10 | from .Error import NoNameFoundError, EntryNotInDatabaseError
11 | from .read_accession_files import read_acc_tax_id
12 | 
# Every name imported into this package is listed, so that `from taxon import *`
# provides the same names as the explicit imports in this file.
__all__ = [
    'read_taxa', 'read_nodes', 'extended_names', 'read_names', 'read_divisions', 'read_gi_tax_id', 'read_tax_id_gi',
    'get_taxonomy_db', 'get_taxonomy', 'connect_to_db', 'get_db_dir', 'get_taxid_for_name', 'all_ids',
    'taxonomy_hierarchy', 'taxonomy_hierarchy_as_list', 'taxonomy_ids_as_list', 'read_acc_tax_id', 'acc_to_taxonomy',
    'all_species_ids', 'TaxonNode', 'TaxonName', 'TaxonDivision', 'NoNameFoundError', 'EntryNotInDatabaseError'
    ]
18 | 
19 | 
20 | 
21 | 
22 | 
23 | 


--------------------------------------------------------------------------------
/taxon/config.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Some settings for the config files
 3 | """
 4 | 
 5 | import os
 6 | import sys
 7 | 
 8 | 
 9 | def get_db_dir():
10 |     """
11 |     Just return the default dir listed above
12 |     :return: the default location for the sqllite database
13 |     """
14 | 
15 |     if 'NCBI_TAXONOMY' in os.environ:
16 |         if os.path.exists(os.environ['NCBI_TAXONOMY']):
17 |             return os.environ['NCBI_TAXONOMY']
18 |         else:
19 |             print(f"WARNING: NCBI_TAXONOMY variable is set but {os.environ['NCBI_TAXONOMY']} does not exist", file=sys.stderr)
20 |     if 'TAXONKIT_DB' in os.environ:
21 |         if os.path.exists(os.environ['TAXONKIT_DB']):
22 |             return os.environ['TAXONKIT_DB']
23 |         else:
24 |             print(f"WARNING: TAXONKIT_DB variable is set but {os.environ['TAXONKIT_DB']} does not exist", file=sys.stderr)
25 | 
26 |     return None
27 | 


--------------------------------------------------------------------------------
/taxon/taxonomy/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'Rob Edwards'
2 | from .taxonomy import TaxonNode, TaxonName, TaxonDivision
3 | __all__ = ["TaxonNode", "TaxonName", "TaxonDivision"]
4 | 
5 | 


--------------------------------------------------------------------------------
/testrepeatfinder/ROBTEST.repeatfinder:
--------------------------------------------------------------------------------
1 | 1	12	17	28
2 | 1	11	18	28
3 | 2	12	17	27
4 | 


--------------------------------------------------------------------------------
/testrepeatfinder/compare.pl:
--------------------------------------------------------------------------------
 1 | use strict;
 2 | 
 3 | my $f1 = shift || die "file 1?";
 4 | my $f2 = shift || die "file 2?";
 5 | 
 6 | my %data; my %first;
 7 | open(IN, $f1) || die "cant open $f1";
 8 | while (<IN>) {
 9 | 	$data{$_}=1;
10 | 	my @a=split /\t/;
11 | 	$first{$a[0]}=$_;
12 | }
13 | close IN;
14 | 
15 | open(IN, $f2) || die "cant open $f2";
16 | while (<IN>) {
17 | 	if (!$data{$_}) {
18 | 		my @a=split /\t/;
19 | 		if ($first{$a[0]}) {
20 | 			print "\n$first{$a[0]}$_\n";
21 | 		}
22 | 		else {
23 | 			print STDERR "NONE: $_";
24 | 		}
25 | 	}
26 | }
27 | 


--------------------------------------------------------------------------------
/testrepeatfinder/files.txt:
--------------------------------------------------------------------------------
1 | /home/redwards/.local/lib/python3.7/site-packages/repeatFinder.cpython-37m-x86_64-linux-gnu.so
2 | /home/redwards/.local/lib/python3.7/site-packages/repeatfinder-1.0.0.egg-info
3 | 


--------------------------------------------------------------------------------
/testrepeatfinder/repeatFinder.h:
--------------------------------------------------------------------------------
 1 | 
 2 | 
/*
 * Declarations for the Python module that wraps the repeat finder.
 *
 * NOTE(review): this header uses PyObject, PyMethodDef and PyModuleDef but
 * does not include Python.h itself; presumably the including source file
 * includes <Python.h> first - confirm.
 */
#ifndef PHISPYREPEATFINDER_H
#define PHISPYREPEATFINDER_H

/* The function registered below as the module's "repeatFinder" method. */
static PyObject * python_input(PyObject *self, PyObject *args);

/* Method table: a single method, "repeatFinder", followed by the NULL sentinel entry. */
static PyMethodDef PhiSpyRepeatFinderMethods[] = {
    {"repeatFinder", python_input, METH_VARARGS, "Python interface for C++ repeat finder for PhiSpy"},
    {NULL, NULL, 0, NULL}
};

/* Module definition: name, documentation string, size (-1) and the method table above. */
static struct PyModuleDef PhiSpyRepeatFinderModule = {
    PyModuleDef_HEAD_INIT,
    "repeatFinder",
    "Python for a C++ repeat finder used by PhiSpy to identify potential prophage ends",
    -1,
    PhiSpyRepeatFinderMethods
};

#endif //PHISPYREPEATFINDER_H


--------------------------------------------------------------------------------
/testrepeatfinder/setup.py:
--------------------------------------------------------------------------------
 1 | from distutils.core import setup, Extension
 2 | 
 3 | def main():
 4 |     setup(name="RobRepeatFinder",
 5 |           version="1.0.0",
 6 |           description="Python interface for repeatFinder",
 7 |           author="Rob Edwards",
 8 |           author_email="raedwards@gmail.com",
 9 |           ext_modules=[Extension("RobRepeatFinder", sources=["repeatFinder.cpp"], language='c++')])
10 | 
11 | if __name__ == "__main__":
12 |     main()
13 | 


--------------------------------------------------------------------------------
/testrepeatfinder/test.fasta:
--------------------------------------------------------------------------------
1 | >sequence
2 | AAAAAAAAAAATGCATGCATGCATCGTCAGCATCGACATGGCTACTTTTTTTTTTT
3 | 


--------------------------------------------------------------------------------
/testrepeatfinder/test.fasta.repeatfinder:
--------------------------------------------------------------------------------
1 | 1	11	56	46
2 | 


--------------------------------------------------------------------------------
/testrepeatfinder/test_repeatfinder.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Test the implementation of the repeatfinder extension
 3 | """
 4 | 
 5 | import os
 6 | import sys
 7 | import argparse
 8 | 
 9 | from roblib import bcolors
10 | 
11 | import RobRepeatFinder
12 | import pprint
13 | 
# run the repeat finder on a short test sequence and pretty print whatever it returns
s = "TTTTTTTTTTTTagcaTTTTTTTTTTTT"
print(f"s: {s}")
r = RobRepeatFinder.repeatFinder(s, 0)
pp = pprint.PrettyPrinter(indent=4)
print(pp.pformat(r))
19 | 
20 | 


--------------------------------------------------------------------------------
/text_matching/vfdb.txt.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:769e678ac6583edd63005c0a8eb5ce5f8611bd39e658d308ddf1ae3cd4766381
3 | size 147539
4 | 


--------------------------------------------------------------------------------