├── .gitattributes ├── .gitignore ├── .gitmodules ├── ANI ├── C │ ├── Version1.1 │ │ ├── Libraries │ │ │ ├── FNACharactersLib.c │ │ │ ├── FNACharactersLib.h │ │ │ ├── levenshteinDistanceLib.c │ │ │ ├── levenshteinDistanceLib.h │ │ │ ├── numberOfLinesLib.c │ │ │ └── numberOfLinesLib.h │ │ ├── Main │ │ │ └── main.c │ │ └── Tests │ │ │ ├── ANITestCase │ │ │ └── main.c │ │ │ ├── TESTS.txt │ │ │ ├── fnaFiles │ │ │ ├── 5mers.fna │ │ │ ├── 5mers2.fna │ │ │ ├── NC_021215.fna │ │ │ └── NC_022886.fna │ │ │ └── kmersTestCase │ │ │ └── main.c │ ├── Version1.2 │ │ ├── Libraries │ │ │ ├── FNACharactersLib.c │ │ │ ├── FNACharactersLib.h │ │ │ ├── levenshteinDistanceLib.c │ │ │ ├── levenshteinDistanceLib.h │ │ │ ├── numberOfLinesLib.c │ │ │ └── numberOfLinesLib.h │ │ ├── Main │ │ │ └── main.c │ │ └── Tests │ │ │ └── TESTS.txt │ └── Version1 │ │ ├── Libraries │ │ ├── FNACharactersLib │ │ │ ├── FNACharactersLib.c │ │ │ └── FNACharactersLib.h │ │ ├── levenshteinDistanceLib │ │ │ ├── levenshteinDistanceLib.c │ │ │ └── levenshteinDistanceLib.h │ │ ├── numberOfLinesLib │ │ │ ├── numberOfLinesLib.c │ │ │ └── numberOfLinesLib.h │ │ └── queryKmersLib │ │ │ ├── queryKmersLib.c │ │ │ └── queryKmersLib.h │ │ ├── Main │ │ ├── README │ │ └── main.c │ │ └── Tests │ │ ├── ANI │ │ ├── 5mers.fna │ │ ├── 5mers2.fna │ │ └── main.c │ │ ├── TESTS │ │ └── kmers │ │ └── 5mers │ │ ├── 5mers.fna │ │ └── main.c └── Python │ └── Version1.2 │ └── GUI │ └── Main │ ├── README │ └── aniGUI.py ├── Adapters └── IlluminaAdapters.fa ├── AlphaFold ├── best_scores.py └── ranking_debug.json ├── AustralianMetagenomes ├── README.md └── australian_metagenomes.txt ├── AuthorInformation ├── author.py └── parse_addresses.py ├── Bangers ├── JQ995537.faa.gz ├── README.md ├── amino_acids.txt ├── bangers.c ├── kseq.h └── test.out ├── CF ├── count_coverage.py └── filter_metagenomes.py ├── CommunityAssembly ├── README.md ├── count.py └── distribution.py ├── ENA └── parse_ENA_xml.py ├── Flinders ├── __init_.py ├── 
alignment_score.py ├── cd-hit-cluster-sizes.py ├── cd-hit-to-clusters.py ├── kmer_sim.py ├── plot_scores_3d.py ├── score_independently.py └── substitution_rules.py ├── GregFrederickson ├── parse_xls_to_taxonomy.py └── patric_to_tax.py ├── Jody └── blast2subsys.py ├── JohnMolkili ├── correlate_contigs.py ├── count_reads_to_contigs.py ├── join.py ├── separate_contig_by_group.py └── sequence_coverage.py ├── LICENSE ├── LizMetagenomes ├── README.md ├── envs │ ├── focus.yaml │ ├── prinseq.yaml │ └── superfocus.yaml ├── process_shark_metagenomes.snakefile └── read_destiny.py ├── ModelSEED ├── json_keys.py ├── json_keys_keys.py ├── json_list_keys.py └── parse_biochemistry.py ├── PAF └── summarize_hits.py ├── PythonClass ├── parse_genbank.py ├── plot_blast.py ├── random_sequence.py └── sequence.gb ├── RAST ├── RAST-alljobs.pl ├── RAST-jobs.pl ├── RAST-retrieve-jobs.pl ├── RAST-status.pl ├── RAST-submit-jobs.pl └── make_assigned_functions.pl ├── README.md ├── ViralBioinformaticsTools ├── README.md ├── git_hub_dates.tsv.gz ├── github_urls ├── proj_start_stop.tsv.gz └── viral_bioinformatics_tools.tsv.gz ├── VirusDiscoveryProject └── DataSelection │ ├── README.md │ ├── datasets.ipynb │ ├── phage_size_selection.txt │ ├── random_selection.txt │ ├── size_selection.txt │ └── wgs_datasets.tsv.gz ├── annotations ├── bacterial_pathogens.py └── singlem_reads_to_contigs.py ├── assembly ├── README.md ├── assemble.snakefile ├── mummerplot.snakefile ├── mummerplot.yaml ├── mummerplot_barcoded.snakefile ├── nanopore_assembly_barcoded.snakefile ├── nanopore_phage_assembly.snakefile ├── nanopore_phage_assembly_simple.snakefile └── nanopore_phage_assembly_simple_nohost.snakefile ├── bam ├── assign_mapped_reads.py ├── bam2fasta.py ├── bam2fastq.py ├── bam2fastq_paired.py ├── bam2reads.py ├── count_bam_hits.py ├── coverage_average.py ├── coverage_depth.py ├── fastq_not_in_bam.py ├── fastq_pairs.py ├── kurtosis.py ├── list_reads.py ├── read_differences.py └── samtools2table.pl ├── bin ├── 
Makefile ├── NSF_bibtex_by_year.py ├── NSF_conflicts.py ├── all_4mers.py ├── average_quality_scores.pl ├── blast2seq.py ├── cd-hit2fasta.py ├── checkR1R2.sh ├── check_fasta.py ├── cif2pdb.py ├── clustering.py ├── correlation_clustering.py ├── correlation_clusters_to_fasta.py ├── correlations.py ├── count_fasta.c ├── count_fastq.c ├── countfasta.py ├── countfastq.py ├── countgenbank.py ├── countgfa.py ├── cpgs.py ├── crAss_contig_correlations.py ├── crc64.py ├── create_newusers.py ├── distances.py ├── download_sra_lists.sh ├── dump_all_tables.py ├── embl_export.py ├── environment_violin_lot.py ├── expedition_xml2csv.py ├── extract.py ├── extract_fasta_sequence.pl ├── factorial.py ├── fake_fastq.py ├── fasta2sequence.pl ├── fasta_split.c ├── fastapercent.pl ├── fastg2gfa.c ├── fastq2fasta.c ├── fastq2fasta.cpp ├── fastq2fasta.py ├── fastq_average_qual.cpp ├── fastq_avqual.c ├── filter_fasta_length.py ├── filter_fastq.py ├── filter_seq_by_length.py ├── genbank2fasta.pl ├── genbank2flatfile.pl ├── genbank2fna.pl ├── genbank2sequences.py ├── genbank_count_motifs.py ├── genbank_list_features.py ├── genbanktable2fasta.pl ├── get_genbank.pl ├── get_genbank_batch.pl ├── get_genbank_batch_proteins.pl ├── get_lastlogs.sh ├── get_wgs_eutils.pl ├── getopt.cpp ├── gfa2fasta.sh ├── greedy_clustering.py ├── index_to_contig.py ├── joinlists.pl ├── json_validator.py ├── jsonl2tsv.py ├── kseq.h ├── latlon2km.py ├── longest_contig.py ├── merge_last_logs.py ├── merge_pdf.py ├── pair_fastq_bloom.py ├── pair_fastq_fast.py ├── pair_fastq_files.py ├── pair_fastq_lowmem.py ├── pairwise_percent_ids.py ├── parse_websites.py ├── parsebz2xml.py ├── pdb2fa.py ├── plot_pairwise_percents.py ├── print_taxonomy.py ├── rc.pl ├── rename_fasta.py ├── renumber_fasta.pl ├── renumber_fasta.py ├── renumber_merge_fasta.py ├── resample.py ├── riddler.py ├── samtools.pl ├── separate_multigenbank.py ├── separatemultifasta.pl ├── separatemultifasta.py ├── sge_summary.pl ├── sort_fasta_by_len.pl ├── 
sort_fasta_by_len_lengths_only.pl ├── stream_fasta.py ├── test.py ├── transpose.py ├── update_blastdb.sh ├── xml2csv.py ├── xml_print_all_attributes.py └── zotkill.pl ├── blast ├── blast2taxonomy.py ├── blast_to_network.py ├── blast_to_sequences.py ├── filter_fastq_by_blast.py ├── plot_blast.py ├── simple_blast_plot.py └── summarize_blast.py ├── bwt └── generate_table.py ├── cartopy ├── crAssphage_cophenetic.py ├── crAssphage_distance.py ├── crAssphage_ete.py └── example.py ├── cluster ├── split_blast_queries_edwards.pl ├── split_blast_queries_edwards_blastplus.pl └── submit2cluster_edwards ├── concoct ├── concoct_bins_to_reads.py └── concoct_csv_to_fasta.py ├── covid19 ├── README.md ├── nCoV-BarGraph.py └── nCoV-Viz.py ├── cpp ├── fastq │ ├── fastq.cbp │ ├── fastq2fasta.cpp │ ├── include │ │ └── stream_fastq.h │ ├── main.cpp │ └── src │ │ └── stream_fastq.cpp ├── two-bit-optimized.c └── two-bit.cpp ├── crAssphage ├── NCBI_SRA_Submission.py ├── NCBI_add_biosample_to_tsv.py ├── NCBI_submission.py ├── README.md ├── average_seq_dist.py ├── check_duplicates.py ├── check_gp.py ├── collapse_bam_variants.py ├── collectors_curve.py ├── compare2sra_all.sh ├── countries.py ├── coverage_heatmap.py ├── coverage_heatmap_orfs.py ├── dnadist2anova.py ├── extract_genotypes.py ├── extract_pcr_reads_from_fq.py ├── extract_pcr_regions.py ├── fastq2crassphage.sh ├── kmer_table.py ├── mutation_freqs.py ├── pcr_fastq_coverage.py ├── phylip2clustal.py ├── plot_contig_sizes.py ├── plot_coverage_ABC.py ├── plot_genotypes.py ├── print_ondrej_pcr_regions.py ├── runs_that_match.py ├── snp_frequency.py ├── tom_jeffries_data.py └── transpose_and_join.pl.py ├── deconvolute_minion_reads ├── README.md ├── fastq │ ├── __init__.py │ └── sequences.py └── split_fastq.py ├── django └── django_notes.md ├── dna └── randomise_dna.py ├── email └── extract_email_from_pst.py ├── fasta ├── extract_sequence.py ├── fasta_qual_to_fastq.py ├── length_filter.py ├── lengths.py ├── reservoir_sample_fasta.py ├── 
sequence_len_distributions.py ├── split_contigs.py ├── split_fasta_r1r2.py ├── subsample_fasta.py └── test.fasta ├── fastq ├── README.md ├── average_quality.py ├── change_fastq_pair_symbol.c ├── compare_directory_fastq_counts.py ├── deduplicate_fastq.py ├── filter_fastq.py ├── filter_fastq_length.c ├── index_fastq.py ├── index_in_fastq.py ├── percent_quality.py ├── predict_primers.py ├── print_fastq.c ├── random_split_paired_fastq.py ├── randomly_sample_fastq.py ├── randomly_sample_fastq_to_dir.py ├── split_by_tags.py ├── split_fastq_files.py ├── split_fastq_sequences.py ├── test.fastq.gz ├── trim_fastq.py └── trim_primers.py ├── fifo └── README.md ├── gfa └── find_complete_circles.py ├── github └── get_repo_dates.py ├── h5py ├── files_to_h5.py ├── files_to_h5_2d.py ├── matrix_to_h5.py ├── read_h5.ipynb ├── test_data.py └── tmp.h5 ├── hecatomb └── track_sequences.py ├── hmms └── run_hmmer.py ├── include └── kseq.h ├── isolation_sources ├── README.md └── genera-environment.py ├── jplacer ├── README.md ├── README2.md ├── add_metadata_to_matrix.py ├── color_based_on_fastq.py ├── count_metagenomes.py ├── create_colorstrip.py ├── create_multibar.py ├── explore_tree.py ├── fastq2color_strip.py ├── fastq2ids.py ├── generate_color_strip.py ├── parse_jplacer.py ├── parse_rename_write.py ├── rename_tree.py ├── test_taxonomy.py └── tree_to_cophenetic_matrix.py ├── jupyter ├── Bacteroides_prophage_lengths.json ├── Emma_subsystems │ ├── EagleRay_level1.tsv.gz │ ├── EagleRay_level1_norm_all.tsv.gz │ ├── EagleRay_level1_norm_ss.tsv.gz │ ├── NorfolkWater_level1.tsv.gz │ ├── NorfolkWater_level1_norm_all.tsv.gz │ ├── NorfolkWater_level1_norm_ss.tsv.gz │ ├── PortJackson_level1.tsv.gz │ ├── PortJackson_level1_norm_all.tsv.gz │ ├── PortJackson_level1_norm_ss.tsv.gz │ ├── TigerSharks_level1.tsv.gz │ ├── TigerSharks_level1_norm_all.tsv.gz │ ├── TigerSharks_level1_norm_ss.tsv.gz │ └── eagle_ray_types │ │ ├── level1_norm_all.tsv.gz │ │ ├── level1_norm_ss.tsv.gz │ │ ├── level1_raw.tsv.gz │ 
│ ├── level2_norm_all.tsv.gz │ │ ├── level2_norm_ss.tsv.gz │ │ ├── level2_raw.tsv.gz │ │ ├── ss_typed_norm_all.tsv.gz │ │ ├── ss_typed_norm_ss.tsv.gz │ │ └── ss_typed_raw.tsv.gz ├── IBD_Data_PCAs.ipynb ├── Lactobacillus_prophage_lengths.json ├── Lactobacillus_prophage_lengths2.json ├── SarahHeatmaps.ipynb ├── bacteroides_prophages.png ├── circles.ipynb ├── class │ └── phylum.tsv ├── data │ ├── get_headers.pl │ ├── rockart_subsystems │ │ ├── all_norm_all.tsv │ │ ├── all_norm_ss.tsv │ │ ├── all_raw.tsv │ │ ├── class_norm_all.tsv │ │ ├── class_norm_ss.tsv │ │ ├── class_raw.tsv │ │ ├── level1_idx.tsv │ │ ├── level1_norm_all.tsv │ │ ├── level1_norm_ss.r.tsv │ │ ├── level1_norm_ss.tsv │ │ ├── level1_raw.tsv │ │ ├── level2_idx.tsv │ │ ├── level2_norm_all.tsv │ │ ├── level2_norm_ss.r.tsv │ │ ├── level2_norm_ss.tsv │ │ ├── level2_raw.tsv │ │ ├── subsystems_norm_all.tsv │ │ ├── subsystems_norm_ss.r.tsv │ │ ├── subsystems_norm_ss.tsv │ │ ├── subsystems_norm_ss_idx.tsv │ │ └── subsystems_raw.tsv │ └── rockart_taxonomy │ │ ├── all_levels.tsv │ │ ├── class.tsv │ │ ├── family.tsv │ │ ├── genus.tsv │ │ ├── order.tsv │ │ ├── phylum.idx │ │ ├── phylum.r.tsv │ │ ├── phylum.sample.idx │ │ ├── phylum.sample.tsv │ │ ├── phylum.tsv │ │ ├── species.tsv │ │ └── superkingdom.tsv ├── example2.tsv ├── gfa_to_fasta.ipynb ├── heatmap.ipynb ├── histogram.ipynb ├── jess_countries.ipynb ├── jess_pca.ipynb ├── lactobacillus_prophages.png ├── liz_spreadsheets.ipynb ├── merged.ipynb ├── pca.ipynb ├── phyloseq2pandas.ipynb ├── rds_to_py.ipynb ├── reduced_protein_alphabet.ipynb ├── sarah_data │ ├── mmseqs_taxonomy │ │ ├── all_levels.tsv.gz │ │ ├── all_levels_renamed.tsv.gz │ │ ├── animation │ │ │ ├── img_10.png │ │ │ ├── img_11.png │ │ │ ├── img_12.png │ │ │ ├── img_13.png │ │ │ ├── img_14.png │ │ │ ├── img_15.png │ │ │ ├── img_16.png │ │ │ ├── img_2.png │ │ │ ├── img_3.png │ │ │ ├── img_4.png │ │ │ ├── img_5.png │ │ │ ├── img_6.png │ │ │ ├── img_7.png │ │ │ ├── img_8.png │ │ │ ├── img_9.png │ │ │ └── 
taxonomy.gif │ │ ├── class.tsv.gz │ │ ├── class_renamed.tsv.gz │ │ ├── family.tsv.gz │ │ ├── family_renamed.tsv.gz │ │ ├── genus.tsv.gz │ │ ├── genus_renamed.tsv.gz │ │ ├── order.tsv.gz │ │ ├── order_renamed.tsv.gz │ │ ├── pca_by_approach.png │ │ ├── pca_by_filter.png │ │ ├── pca_by_method.png │ │ ├── pca_by_replicate.png │ │ ├── pca_by_sample.png │ │ ├── pca_combined_replicates.png │ │ ├── phylum.tsv.gz │ │ ├── phylum_renamed.tsv.gz │ │ ├── species.tsv.gz │ │ ├── species_renamed.tsv.gz │ │ ├── superkingdom.tsv.gz │ │ └── superkingdom_renamed.tsv.gz │ ├── sarah_subsystems_pca.ipynb │ ├── sarah_taxonomy_pca.ipynb │ └── subsystems │ │ ├── README.md │ │ ├── all_norm_all.tsv.gz │ │ ├── all_norm_all_renamed.tsv.gz │ │ ├── all_norm_ss.tsv.gz │ │ ├── all_norm_ss_renamed.tsv.gz │ │ ├── all_raw.tsv.gz │ │ ├── all_raw_renamed.tsv.gz │ │ ├── animation │ │ ├── img_10.png │ │ ├── img_11.png │ │ ├── img_12.png │ │ ├── img_13.png │ │ ├── img_14.png │ │ ├── img_15.png │ │ ├── img_16.png │ │ ├── img_2.png │ │ ├── img_3.png │ │ ├── img_4.png │ │ ├── img_5.png │ │ ├── img_6.png │ │ ├── img_7.png │ │ ├── img_8.png │ │ ├── img_9.png │ │ └── subsystems.gif │ │ ├── class_norm_all.tsv.gz │ │ ├── class_norm_all_renamed.tsv.gz │ │ ├── class_norm_ss.tsv.gz │ │ ├── class_norm_ss_renamed.tsv.gz │ │ ├── class_raw.tsv.gz │ │ ├── class_raw_renamed.tsv.gz │ │ ├── level1_norm_all.tsv.gz │ │ ├── level1_norm_all_renamed.tsv.gz │ │ ├── level1_norm_ss.tsv.gz │ │ ├── level1_norm_ss_renamed.tsv.gz │ │ ├── level1_raw.tsv.gz │ │ ├── level1_raw_renamed.tsv.gz │ │ ├── level2.png │ │ ├── level2_norm_all.tsv.gz │ │ ├── level2_norm_all_renamed.tsv.gz │ │ ├── level2_norm_ss.tsv.gz │ │ ├── level2_norm_ss_renamed.tsv.gz │ │ ├── level2_raw.tsv.gz │ │ ├── level2_raw_renamed.tsv.gz │ │ ├── normalised_subsystem_level2.png │ │ ├── pca_by_approach.png │ │ ├── pca_by_filter.png │ │ ├── pca_by_method.png │ │ ├── pca_by_replicate.png │ │ ├── pca_by_sample.png │ │ ├── pca_combined_replicates.png │ │ ├── 
subsystems_norm_all.tsv.gz │ │ ├── subsystems_norm_all_renamed.tsv.gz │ │ ├── subsystems_norm_ss.tsv.gz │ │ ├── subsystems_norm_ss_renamed.tsv.gz │ │ ├── subsystems_raw.tsv.gz │ │ └── subsystems_raw_renamed.tsv.gz ├── subsystems_data.tsv.gz ├── subsystems_data_all.tsv.gz ├── taxonomy_data.tsv.gz ├── taxonomy_data_all.tsv.gz └── test2.ipynb ├── kbase ├── json_to_model.py └── parse_json.py ├── kmers ├── check_strand.py ├── compare_kmers_genbank.py ├── count_kmers.py ├── count_kmers_genbank.py ├── count_kmers_ordered_genbank.py ├── count_leading_kmers.py ├── count_trailing_kmers.py ├── find_kmers.py ├── hashcode.py ├── kmer_entropy.py ├── kmer_entropy3.3.py ├── kmer_entropy_sa.py ├── kmer_union_intersection.py ├── mash_env.py ├── plot_kmer_evenness.py └── reassemble_kmers.py ├── kyle └── kmerbias.py ├── manipulate_genomes ├── README.md ├── filter_from_blast.py ├── rotate_phage.py ├── trim_fasta.py └── upstream_regions.py ├── matplotlib graphs ├── 16S_smooth.py ├── 3d_scatter_plot.py ├── KernelDensityEstimator.ipynb ├── KernelRegression.ipynb ├── kde.py ├── plot_16S_coverage.py ├── plot_16S_coverage_kde.py ├── plot_16S_coverage_kernelregression.py ├── plot_16s_coverage_all.py ├── show_fig.py └── xy_scatter.py ├── mmseqs ├── dummy_database.sqlite └── easy_taxonomy_to_function.py ├── mongodb ├── find_biomasses.py ├── load_models.py ├── print_keys.py ├── search_mongo.py ├── simple_find.py └── simple_load_models.py ├── mummer ├── reverse_complement_fasta.py └── six_mers.py ├── nanopore └── split_fastq_by_barcode.py ├── ncbi ├── accession2taxonomy.py ├── all_taxid_by_rank.py ├── blast2taxonmy.py ├── blast2taxonomy_col.py ├── blast2taxonomy_sqlite.py ├── combine_gbff_fna.py ├── datasets │ ├── genome_information.py │ └── one_genome_information.py ├── filter_uniprot50_by_taxonomy.py ├── filter_uniprot50_precalculated.py ├── genbank_phages_via_ftp.py ├── get_protein_sequence.py ├── get_wgs_eutils.pl ├── mikes_taxonomy.py ├── name2animal_plant.py ├── parse_genbank.py ├── 
parse_sra.py ├── patric_add_taxonomy.py ├── phage_longest_gene.py ├── product_protein_seq.py ├── pubmed_to_csv.py ├── tax2spreadsheet.py ├── tax2spreadsheetdb.py ├── taxonomy.py ├── taxonomy_database.py ├── taxonomy_database_add_to_tsv.py ├── taxonomy_database_table.py ├── taxonomy_phylum_kingdom.py └── taxonomy_to_kingdom.py ├── patric └── parse_gto.py ├── percent_pairwise_identity ├── RecA_uniprot.faa.gz ├── RecA_uniprot_cdhit.aln ├── RecA_uniprot_cdhit.faa ├── average_pairwise.pl ├── identical_percent_ids.json.gz ├── list2matrix.pl ├── min_percent_counts.py ├── min_percent_counts_only.py ├── min_percent_counts_subsample.py ├── needleman_wunsch-0.3.5 │ ├── LICENSE │ ├── Makefile │ ├── README │ ├── README.md │ ├── libs │ │ ├── alignment_scoring │ │ │ ├── alignment.c │ │ │ ├── alignment.h │ │ │ ├── alignment_scoring.c │ │ │ ├── alignment_scoring.h │ │ │ ├── alignment_scoring_load.c │ │ │ └── alignment_scoring_load.h │ │ ├── bioinf │ │ │ ├── bioinf.c │ │ │ └── bioinf.h │ │ ├── string_buffer │ │ │ ├── string_buffer.c │ │ │ └── string_buffer.h │ │ └── utility_lib │ │ │ ├── utility_lib.c │ │ │ └── utility_lib.h │ ├── needleman_wunsch.c │ ├── needleman_wunsch.h │ ├── nw_cmdline.c │ ├── seq1.fna │ ├── seq2.fna │ └── uthash.h ├── pairwise_percent_ids.py ├── permute_fasta.py ├── plot_pairwise_percents.py ├── strain_taxonomy.txt.gz └── taxon_focus2.csv ├── perl ├── Clustal.pm ├── MinSeed.pm ├── OGD.pm ├── ParseTree.pm ├── RAEProtein.pm ├── RepeatFinder.pm ├── Rob.pm ├── SGE-0.02 │ ├── Changes │ ├── Control │ │ ├── CVS │ │ │ ├── Entries │ │ │ ├── Repository │ │ │ └── Root │ │ ├── Control.pm │ │ ├── MANIFEST │ │ └── Makefile.PL │ ├── Copying │ ├── MANIFEST │ ├── Makefile.PL │ ├── README │ ├── Run │ │ ├── CVS │ │ │ ├── Entries │ │ │ ├── Repository │ │ │ └── Root │ │ ├── MANIFEST │ │ ├── Makefile.PL │ │ └── Run.pm │ ├── SGE.pm │ ├── Status │ │ ├── CVS │ │ │ ├── Entries │ │ │ ├── Repository │ │ │ └── Root │ │ ├── MANIFEST │ │ ├── Makefile.PL │ │ └── Status.pm │ ├── examples │ │ 
├── CVS │ │ │ ├── Entries │ │ │ ├── Repository │ │ │ └── Root │ │ ├── README │ │ ├── submit2cluster.pl │ │ └── test.pl │ └── t │ │ ├── CVS │ │ ├── Entries │ │ ├── Repository │ │ └── Root │ │ └── SGE.t ├── Teragrid-0.02 │ ├── Changes │ ├── Control │ │ ├── CVS │ │ │ ├── Entries │ │ │ ├── Repository │ │ │ └── Root │ │ ├── Control.pm │ │ ├── MANIFEST │ │ └── Makefile.PL │ ├── Copying │ ├── Jobs │ │ ├── CVS │ │ │ ├── Entries │ │ │ ├── Repository │ │ │ └── Root │ │ ├── Jobs.pm │ │ ├── MANIFEST │ │ └── Makefile.PL │ ├── LSGW.pm │ ├── MANIFEST │ ├── Makefile.PL │ ├── README │ └── examples │ │ ├── CVS │ │ ├── Entries │ │ ├── Repository │ │ └── Root │ │ ├── all_jobs.pl │ │ ├── blast.pl │ │ ├── job.pl │ │ └── job_data.pl ├── alignment │ ├── __init__.py │ ├── dna_alignment.py │ ├── edit_distance.py │ ├── gapped_alignment.py │ ├── gapped_alignment2.py │ ├── local_alignment.py │ └── matrices.py └── raeseqlib.pm ├── phage ├── comapre_gpdb_mgv.py ├── envs │ └── phispy.yaml ├── genbank_has_phage.py ├── is_phage_function.py ├── metagenomes │ ├── contig_mv_samples.txt.gz │ ├── contigs_gokushovirus.blastn.gz │ ├── count_contigs.pl │ ├── count_contigs2.pl │ ├── count_phables.pl │ ├── crass_contigs.tsv.gz │ ├── crassphage_1percent.png │ ├── crassus.ipynb │ ├── crassus_results.tsv.gz │ ├── find_pb_segment2.pl │ ├── freezer.txt.gz │ ├── hist.png │ ├── hist_1percent.png │ ├── ibd_16s │ │ ├── RC2_16S_IBD_OTU.tsv.gz │ │ ├── RC2_16S_IBD_metadata.tsv.gz │ │ └── RC2_16S_IBD_taxadata.tsv.gz │ ├── join_vir.pl │ ├── limit_contigs.pl │ ├── microviridae.ipynb │ ├── microviridae.png │ ├── microviridae58782.png │ ├── microviridae_correlations.tsv.gz │ ├── microvirus_contig_count_table.tsv.gz │ ├── microvirus_contigs.pl │ ├── most_abundant.pl │ ├── most_abundant.txt.gz │ ├── mv_gen_cont_samples.txt.gz │ ├── mv_samples.txt.gz │ ├── mv_sequences.txt.gz │ ├── our_crassphage.tsv.gz │ ├── pb199070.png │ ├── pb199070_2.png │ ├── pb199070_both.png │ ├── pb58328.png │ ├── phables_mv_samples.txt.gz │ ├── 
pharokka_top_hits_mash_inphared.nonone.tsv.gz │ ├── pharokka_top_hits_mash_inphared.tsv.gz │ ├── picobirnaviridae edited.png │ ├── picobirnaviridae.ipynb │ ├── picobirnaviridae_contig_count_table.tsv.gz │ ├── picobirnaviridae_contigs.pl │ ├── picobirnaviridae_correlations.tsv.gz │ ├── picobirnaviridae_rdrp.tblastn.gz │ ├── rdrp.tblastn.tsv.gz │ ├── rdrp_contig_count_table.tsv.gz │ ├── rdrp_contigs.pl │ ├── rdrp_wehave.tsv.gz │ ├── sampleSeqCounts.tsv.gz │ ├── sample_genome_read_counts.tsv.gz │ ├── virus_contig_annotations.tsv.gz │ └── virus_contig_annotations_samples.tsv.gz ├── phage_functions.py ├── phage_functions_gbk.py ├── phage_quality.snakefile ├── phage_quality_assessment_scripts │ ├── av_protein_lengths.py │ ├── check_phage_functions.py │ ├── coding_vs_noncoding.py │ ├── count_adjacent_orfs.py │ └── phage_quality_cluster.snakefile ├── phage_quality_config.yaml ├── phispy_download_conda.snakefile ├── phispy_vogs_download.snakefile ├── phispy_vogs_download_submit.sh ├── plot_phage_tsne.py ├── prophage_from_genbank.py ├── read_ends.py ├── remove_prophages.py ├── remove_prophages_sequences.py ├── separate_prophages.py ├── separate_prophages_coordinates.py ├── submit_phispy_vogs_download.sh └── write_prophages.py ├── phage_clustering ├── bit_score.py └── bit_score_by_len.py ├── phage_protein_blast_genera ├── README.md ├── blast_tax_to_genera.py ├── genera_per_phage_protein.py ├── num_best_hits.py ├── num_prots_vs_taxa.py ├── phage_host_location.txt ├── plot_best_hits.py └── tax_violin_plots.py ├── primers ├── count_primers.pl ├── find_adapter_sequences.pl ├── find_dangling_adapters.py └── match_primers.py ├── process_EK_metagenomes ├── IlluminaAdapters.fa ├── README.md ├── bin │ ├── compress.slurm │ ├── merge_counts.pl │ ├── normalize_data.pl │ └── sankey_matic.pl ├── count_fastq.slurm ├── count_mmseqs.pl ├── count_mmseqs.slurm ├── count_sharks.slurm ├── count_subsystems.pl ├── count_subsystems.slurm ├── fastp.slurm ├── fastq2fasta.slurm ├── 
join_sagc_lanes.slurm ├── kraken2otu.py ├── megahit.slurm ├── megahit_submit.slurm ├── mmseqs_add_subsystems.slurm ├── mmseqs_easy_taxonomy.slurm ├── mmseqs_easy_taxonomy_submit.sh ├── mmseqs_easy_taxonomy_submit.slurm ├── sharks.slurm ├── vamb.slurm ├── vamb_concatenate.py ├── vamb_create_fasta.py └── vamb_minimap.slurm ├── process_JCJ_metagenomes ├── README.md ├── bin │ └── sankey_matic.pl ├── fastp.slurm ├── fastq2fasta.slurm ├── humans.slurm ├── megahit.slurm ├── megahit_submit.sh ├── mmseqs_easy_taxonomy.slurm ├── mmseqs_easy_taxonomy_submit.slurm ├── vamb.slurm ├── vamb_concatenate.py ├── vamb_create_fasta.py └── vamb_minimap.slurm ├── process_metagenomes ├── README.md ├── fastp.slurm ├── fastq2fasta.slurm ├── host_removal.slurm ├── megahit.slurm ├── mmseqs_add_subsystems.slurm ├── mmseqs_easy_taxonomy.slurm ├── mmseqs_easy_taxonomy_submit.slurm ├── mmseqs_taxonomy.slurm ├── vamb.slurm ├── vamb_concat.slurm ├── vamb_create_fasta_clusters.py └── vamb_minimap.slurm ├── prophages ├── download_phage_slices.py ├── phage_finder_tests.snakefile ├── phageboost_genbank.py ├── phageboost_tests.snakefile ├── phigaro_tests.snakefile ├── phispy_phage_genes.snakefile ├── phispy_training_vs_test.snakefile ├── phispy_with_training.snakefile ├── prophage_proteins.snakefile ├── run_phispy_snakemakes.sh ├── run_virsorter.snakefile └── virsorter_tests.snakefile ├── proteins ├── md5_to_ncbi_taxonomy.py ├── protein_md5.py ├── protein_md5_fast.py └── unique_protein_ids.py ├── proxymeta ├── README.md └── find_mates.py ├── pymol └── draw_images.py ├── rc2 └── compare_fastq_files.py ├── refs_and_citations ├── altmetric_one.py ├── altmetrics.py ├── compare_titles.py ├── gs_download_cites.py ├── orcid_vs_google.py ├── refs2csv.py ├── refs2csv_tk.py └── summarise_pubs_counts.py ├── requirements.txt ├── requirements_mini.txt ├── rob_tests ├── hashes.c ├── hashing.c ├── sequences.py └── test_stream_pair.py ├── roblib ├── __init__.py ├── alignments.py ├── bcolors.py ├── blast.py ├── 
colours.py ├── date_parsing.py ├── dna.py ├── dnadist.py ├── files.py ├── functions.py ├── genbank.py ├── geography.py ├── newick.py ├── rob_error.py ├── seqio_filter.py ├── sequences.py ├── stats.py ├── strings.py └── translate.py ├── roblib_tk ├── __init__.py └── file_chooser.py ├── sankey └── sankey_plot.ipynb ├── sdsu ├── average_grade.pl ├── parse_pos.py └── thesis_parse.py ├── searchSRA ├── envs │ └── samtools.yaml ├── filter_reads.sh ├── merge_counts_abstracts.py ├── process.smk ├── process_expand.smk └── searchSRA_abstracts.tsv.gz ├── seed └── pegs_in_order.py ├── seed_servers ├── RAST-alljobs.pl ├── RAST-jobs.pl ├── RAST-retrieve-jobs.pl ├── RAST-status.pl ├── RAST-submit-jobs.pl └── test_occ_roles.py ├── silva └── parse_silva_act.py ├── snakemake ├── EnvBiotec_process_metagenomes.snakefile ├── SAGC_process_metagenomes.snakefile ├── abricate.snakefile ├── annotate_phages.snakefile ├── cluster.snakefile ├── cluster_phages.snakefile ├── deconseq.snakefile ├── envs │ ├── bowtie.yaml │ ├── canu.yaml │ ├── filtlong.yaml │ ├── flye.yaml │ ├── focus.yaml │ ├── kraken.yaml │ ├── megahit.yaml │ ├── miniasmminipolish.yaml │ ├── minimap.yaml │ ├── prinseq.yaml │ ├── raven.yaml │ ├── seqtk.yaml │ ├── superfocus.yaml │ └── trycycler.yaml ├── kraken.snakefile ├── patric_complete_genomes_proteins.snakefile ├── phispy.snakefile ├── phispy.yaml ├── process_metagenomes.json └── process_metagenomes.snakefile ├── snakemake_tests └── test.snakefile ├── sra ├── README.md ├── SRA.partie.tsv.gz ├── bigquery_json.py ├── bigquery_json2csv.py ├── filter.py ├── phage_bacteria.ipynb ├── plot_3d.ipynb ├── plot_partie_3d.py ├── plot_partie_boxes.py ├── run_accession-experiment_lib.tsv.gz ├── runs_to_abstracts.pl ├── sra_by_date.py ├── sra_file_sizes.py ├── sra_status.py ├── sra_xml.py ├── sra_xml_dir.py ├── sra_xml_print_all_attributes.py └── study_types.py ├── superfocus_all ├── join_output.py ├── summarize_hits.py └── superfocus_to_taxonomy.py ├── taxon ├── Error.py ├── README.md ├── 
__init__.py ├── config.py ├── load_from_database.py ├── mmseqs_report_to_table.py ├── read_accession_files.py ├── sqlite_taxon.py ├── taxon.py └── taxonomy │ ├── __init__.py │ └── taxonomy.py ├── testrepeatfinder ├── README.md ├── ROBTEST.repeatfinder ├── compare.pl ├── errors ├── files.txt ├── fna_repeats_to_seq.pl ├── fna_to_repeats.py ├── pp1.test ├── pp2.test ├── repeatFinder.cpp ├── repeatFinder.h ├── setup.py ├── tempRepeatDNA.99620.pp.1.fasta ├── tempRepeatDNA.99620.pp.1.fasta.repeatfinder ├── tempRepeatDNA.99620.pp.2.fasta ├── tempRepeatDNA.99620.pp.2.fasta.repeatfinder ├── test.fasta ├── test.fasta.repeatfinder ├── test │ ├── pp1.fasta │ ├── pp3.fasta │ ├── pp4.fasta │ ├── pp5.fasta │ ├── pp6.fasta │ └── pp7.fasta └── test_repeatfinder.py ├── text_matching ├── vfdb.txt.gz └── virulence_matching.py ├── thea ├── add_source_to_rapsearch.py ├── count_hits.cpp ├── count_lastal_hits.cpp ├── lastal_abund_ubiq.cpp ├── load_rapsearch_sqlite.py ├── locate_orfs.py ├── normalize_hits.cpp ├── orf_evidence.py ├── overlapping_orfs.py └── rapsearch_check_translation.py └── trees ├── dist_matrix.py ├── negative_branch_lengths.py ├── rename_trees.ori.py ├── rename_trees.py ├── rename_trees_crassphage.py ├── tree_to_cophenetic_matrix.py ├── tree_to_pairwisedistance.py └── trim_alignment.py /.gitattributes: -------------------------------------------------------------------------------- 1 | *.gz filter=lfs diff=lfs merge=lfs -text 2 | *.sqlite filter=lfs diff=lfs merge=lfs -text 3 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "ProphageGenomics"] 2 | path = ProphageGenomics 3 | url = git@github.com:hkang408/ProphageGenomics.git 4 | -------------------------------------------------------------------------------- /ANI/C/Version1.1/Libraries/FNACharactersLib.h: 
-------------------------------------------------------------------------------- 1 | #ifndef _FNACHARACTERSLIB_H_ 2 | #define _FNACHARACTERSLIB_H_ 3 | 4 | extern char** fnaCharactersOf(const char* fnaFileNameAndLocation, const int LINESIZE, const int numberOfLines); 5 | 6 | #endif -------------------------------------------------------------------------------- /ANI/C/Version1.1/Libraries/levenshteinDistanceLib.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "levenshteinDistanceLib.h" 4 | //Levenshtein edit distance 5 | int editDistance(const char* S, const char* T, int sLength, int tLength) { 6 | int minimum, first, second, third, conditional; 7 | if(sLength == 0) { 8 | return tLength; 9 | } 10 | if(tLength == 0) { 11 | return sLength; 12 | } 13 | if(S[sLength-1] != T[tLength - 1]) { 14 | conditional = 1; 15 | } else { 16 | conditional = 0; 17 | } 18 | first = editDistance(S, T, sLength - 1, tLength) + 1; 19 | second = editDistance(S, T, sLength, tLength - 1) + 1; 20 | third = editDistance(S, T, sLength - 1, tLength - 1) + conditional; 21 | minimum = first; 22 | if(first > second) { 23 | minimum = second; 24 | } 25 | if (second > third) { 26 | minimum = third; 27 | } 28 | return minimum; 29 | } -------------------------------------------------------------------------------- /ANI/C/Version1.1/Libraries/levenshteinDistanceLib.h: -------------------------------------------------------------------------------- 1 | #ifndef _LEVENSHTEINDISTANCELIB_H_ 2 | #define _LEVENSHTEINDISTANCELIB_H_ 3 | 4 | extern int editDistance(const char* S, const char* T, int sLength, int tLength); 5 | 6 | #endif -------------------------------------------------------------------------------- /ANI/C/Version1.1/Libraries/numberOfLinesLib.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "numberOfLinesLib.h" 5 | 6 | int 
numberOfLines(const int LINESIZE, const char *FNANAME) { 7 | FILE *fnaPointer; 8 | char singleCharLine[100]; 9 | fnaPointer = fopen(FNANAME, "r"); 10 | if(fnaPointer == NULL) { 11 | printf("%s", "Error: fnaPointer is null\n"); 12 | exit(EXIT_FAILURE); 13 | } 14 | int numberOfLines = 0; 15 | while (fgets(singleCharLine, 100, fnaPointer) != NULL) { 16 | numberOfLines++; 17 | } 18 | if(numberOfLines == 0) { 19 | printf("%s", "Error: No Lines in .fna file\n"); 20 | exit(EXIT_FAILURE); 21 | } 22 | fclose(fnaPointer); 23 | return numberOfLines; 24 | } -------------------------------------------------------------------------------- /ANI/C/Version1.1/Libraries/numberOfLinesLib.h: -------------------------------------------------------------------------------- 1 | #ifndef _NUMBEROFLINESLIB_H_ 2 | #define _NUMBEROFLINESLIB_H_ 3 | 4 | extern int numberOfLines(const int LINESIZE, const char *FNANAME); 5 | 6 | #endif -------------------------------------------------------------------------------- /ANI/C/Version1.1/Tests/fnaFiles/5mers.fna: -------------------------------------------------------------------------------- 1 | Expected ANI: 21 2 | CTGATCGATC -------------------------------------------------------------------------------- /ANI/C/Version1.1/Tests/fnaFiles/5mers2.fna: -------------------------------------------------------------------------------- 1 | Expected ANI: 21 2 | TCAGCTGCTA -------------------------------------------------------------------------------- /ANI/C/Version1.2/Libraries/FNACharactersLib.h: -------------------------------------------------------------------------------- 1 | #ifndef _FNACHARACTERSLIB_H_ 2 | #define _FNACHARACTERSLIB_H_ 3 | 4 | extern char** fnaCharactersOf(const char* fnaFileNameAndLocation, const int LINESIZE, const int numberOfLines); 5 | 6 | #endif -------------------------------------------------------------------------------- /ANI/C/Version1.2/Libraries/levenshteinDistanceLib.c: 
-------------------------------------------------------------------------------- 1 | #include 2 | int editDistance(const char* S, const char* T, int kmerLength) { 3 | int matrix[kmerLength + 1][kmerLength + 1]; 4 | for (int i = 0; i <= kmerLength; i++) { 5 | matrix[i][0] = i; 6 | matrix[0][i] = i; 7 | } 8 | for (int j = 1; j <= kmerLength; j++) { 9 | for (int k = 1; k <= kmerLength; k++) { 10 | if (S[j-1] == T[k-1]) 11 | matrix[j][k] = matrix[j-1][k-1]; 12 | else 13 | matrix[j][k] = (matrix[j][k-1] + 1) > (matrix[j-1][k] + 1) ? ((matrix[j-1][k-1] + 1) > (matrix[j-1][k] + 1) ? (matrix[j-1][k] + 1) : (matrix[j-1][k-1] + 1)) : ((matrix[j-1][k-1] + 1) > (matrix[j][k-1] + 1) ? (matrix[j][k-1] + 1) : (matrix[j-1][k-1] + 1)); 14 | } 15 | } 16 | return matrix[kmerLength][kmerLength]; 17 | } -------------------------------------------------------------------------------- /ANI/C/Version1.2/Libraries/levenshteinDistanceLib.h: -------------------------------------------------------------------------------- 1 | #ifndef _LEVENSHTEINDISTANCELIB_H_ 2 | #define _LEVENSHTEINDISTANCELIB_H_ 3 | 4 | extern int editDistance(const char* S, const char* T, int sLength, int tLength); 5 | 6 | #endif -------------------------------------------------------------------------------- /ANI/C/Version1.2/Libraries/numberOfLinesLib.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "numberOfLinesLib.h" 5 | 6 | int numberOfLines(const int LINESIZE, const char *FNANAME) { 7 | FILE *fnaPointer; 8 | char singleCharLine[256]; 9 | fnaPointer = fopen(FNANAME, "r"); 10 | if(fnaPointer == NULL) { 11 | printf("%s", "Error: fnaPointer is null\n"); 12 | exit(EXIT_FAILURE); 13 | } 14 | int numberOfLines = 0; 15 | while (fgets(singleCharLine, 256, fnaPointer) != NULL) { 16 | numberOfLines++; 17 | } 18 | if(numberOfLines == 0) { 19 | printf("%s", "Error: No Lines in .fna file\n"); 20 | exit(EXIT_FAILURE); 21 | } 22 | 
fclose(fnaPointer); 23 | return numberOfLines; 24 | } -------------------------------------------------------------------------------- /ANI/C/Version1.2/Libraries/numberOfLinesLib.h: -------------------------------------------------------------------------------- 1 | #ifndef _NUMBEROFLINESLIB_H_ 2 | #define _NUMBEROFLINESLIB_H_ 3 | 4 | extern int numberOfLines(const int LINESIZE, const char *FNANAME); 5 | 6 | #endif -------------------------------------------------------------------------------- /ANI/C/Version1/Libraries/FNACharactersLib/FNACharactersLib.h: -------------------------------------------------------------------------------- 1 | #ifndef _FNACHARACTERSLIB_H_ 2 | #define _FNACHARACTERSLIB_H_ 3 | 4 | extern char** fnaCharactersOf(const char* fnaFileNameAndLocation, const int LINESIZE, const int numberOfLines); 5 | 6 | #endif -------------------------------------------------------------------------------- /ANI/C/Version1/Libraries/levenshteinDistanceLib/levenshteinDistanceLib.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "levenshteinDistanceLib.h" 4 | //Levenshtein edit distance (plain recursion): returns the cheapest of dropping the last char of S, dropping the last char of T, or pairing the two last chars 5 | int editDistance(const char* S, const char* T, int sLength, int tLength) { 6 | int minimum, first, second, third, conditional; 7 | if(sLength == 0) { 8 | return tLength; 9 | } 10 | if(tLength == 0) { 11 | return sLength; 12 | } 13 | if(S[sLength-1] != T[tLength - 1]) { 14 | conditional = 1; 15 | } else { 16 | conditional = 0; 17 | } 18 | first = editDistance(S, T, sLength - 1, tLength) + 1; 19 | second = editDistance(S, T, sLength, tLength - 1) + 1; 20 | third = editDistance(S, T, sLength - 1, tLength - 1) + conditional; 21 | minimum = first; 22 | if(first > second) { 23 | minimum = second; 24 | } 25 | if (minimum > third) { // compare against the running minimum so the smallest of all three costs is kept 26 | minimum = third; 27 | } 28 | return minimum; 29 | } --------------------------------------------------------------------------------
/ANI/C/Version1/Libraries/levenshteinDistanceLib/levenshteinDistanceLib.h: -------------------------------------------------------------------------------- 1 | #ifndef _LEVENSHTEINDISTANCELIB_H_ 2 | #define _LEVENSHTEINDISTANCELIB_H_ 3 | 4 | extern int editDistance(const char* S, const char* T, int sLength, int tLength); 5 | 6 | #endif -------------------------------------------------------------------------------- /ANI/C/Version1/Libraries/numberOfLinesLib/numberOfLinesLib.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "numberOfLinesLib.h" 5 | 6 | int numberOfLines(const int LINESIZE, const char *FNANAME) { 7 | FILE *fnaPointer; 8 | char singleCharLine[100]; 9 | fnaPointer = fopen(FNANAME, "r"); 10 | if(fnaPointer == NULL) { 11 | printf("%s", "Error: fnaPointer is null\n"); 12 | exit(EXIT_FAILURE); 13 | } 14 | int numberOfLines = 0; 15 | while (fgets(singleCharLine, 100, fnaPointer) != NULL) { 16 | numberOfLines++; 17 | } 18 | if(numberOfLines == 0) { 19 | printf("%s", "Error: No Lines in .fna file\n"); 20 | exit(EXIT_FAILURE); 21 | } 22 | fclose(fnaPointer); 23 | return numberOfLines; 24 | } -------------------------------------------------------------------------------- /ANI/C/Version1/Libraries/numberOfLinesLib/numberOfLinesLib.h: -------------------------------------------------------------------------------- 1 | #ifndef _NUMBEROFLINESLIB_H_ 2 | #define _NUMBEROFLINESLIB_H_ 3 | 4 | extern int numberOfLines(const int LINESIZE, const char *FNANAME); 5 | 6 | #endif -------------------------------------------------------------------------------- /ANI/C/Version1/Libraries/queryKmersLib/queryKmersLib.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "queryKmersLib.h" 5 | // 2 <= kmerSize <= 66 6 | //67+ will return kmers of 1000+ characters for some reason(lineSize = 70?) 
7 | char** queryKmersIntoArray(const char* dnaChars, const int KMERSIZE, const int numberOfLines, const int LINESIZE) { 8 | const int numberOfKmers = numberOfLines*LINESIZE; 9 | char** kmersArrayMalloc = (char**) malloc(numberOfKmers*sizeof(char*)); 10 | if(kmersArrayMalloc == NULL) { 11 | printf("%s", "Error: kmersArrayMalloc\n"); 12 | exit(EXIT_FAILURE); 13 | } 14 | for(int i = 0; i < numberOfKmers; i++) { 15 | kmersArrayMalloc[i] = (char *) malloc(sizeof(char)*KMERSIZE*10); 16 | if(kmersArrayMalloc[i] == NULL) { 17 | printf("%s", "Error: kmersArrayMalloc\n"); 18 | exit(EXIT_FAILURE); 19 | } 20 | } 21 | for(int i = 0; i < strlen(dnaChars) - KMERSIZE + 1; i++) { 22 | memcpy(kmersArrayMalloc[i], &dnaChars[i], KMERSIZE); 23 | } 24 | return (char**) kmersArrayMalloc; 25 | 26 | } -------------------------------------------------------------------------------- /ANI/C/Version1/Libraries/queryKmersLib/queryKmersLib.h: -------------------------------------------------------------------------------- 1 | #ifndef _QUERYKMERSLIB_H_ 2 | #define _QUERYKMERSLIB_H_ 3 | 4 | extern char **queryKmersIntoArray(const char* dnaChars, int kmerSize, const int numberOfLines, const int LINESIZE); 5 | 6 | #endif -------------------------------------------------------------------------------- /ANI/C/Version1/Tests/ANI/5mers.fna: -------------------------------------------------------------------------------- 1 | Expected ANI: 21 2 | CTGATCGATC -------------------------------------------------------------------------------- /ANI/C/Version1/Tests/ANI/5mers2.fna: -------------------------------------------------------------------------------- 1 | Expected ANI: 21 2 | TCAGCTGCTA -------------------------------------------------------------------------------- /ANI/C/Version1/Tests/kmers/5mers/5mers.fna: -------------------------------------------------------------------------------- 1 | Expected kmer array output: CTGAT TGATC GATCG ATCGA TCGAT CGATC 2 | CTGATCGATC 
-------------------------------------------------------------------------------- /AlphaFold/best_scores.py: -------------------------------------------------------------------------------- 1 | """ 2 | Read all the ranking_debug.json files and report the best model and its pLDDT score 3 | """ 4 | 5 | import os 6 | import sys 7 | import argparse 8 | import json 9 | 10 | __author__ = 'Rob Edwards' 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser(description=' ') 14 | parser.add_argument('-d', help='directory with ranking_debug.json', required=True) 15 | parser.add_argument('-v', help='verbose output', action='store_true') 16 | args = parser.parse_args() 17 | 18 | rdj = os.path.join(args.d, "ranking_debug.json") 19 | if not os.path.exists(rdj): 20 | sys.stderr.write(f"Error: {rdj} not found\n") 21 | sys.exit(1) 22 | 23 | f = open(rdj, 'r') 24 | d = json.load(f) 25 | bm = d['order'][0] 26 | pl = d['plddts'][bm] 27 | print("\t".join(map(str, [args.d, bm, pl]))) -------------------------------------------------------------------------------- /AlphaFold/ranking_debug.json: -------------------------------------------------------------------------------- 1 | { 2 | "plddts": { 3 | "model_1": 87.45812934795958, 4 | "model_2": 84.52258185860828, 5 | "model_3": 87.78593692844834, 6 | "model_4": 87.99449366375426, 7 | "model_5": 85.32976078502992 8 | }, 9 | "order": [ 10 | "model_4", 11 | "model_3", 12 | "model_1", 13 | "model_5", 14 | "model_2" 15 | ] 16 | } -------------------------------------------------------------------------------- /Bangers/JQ995537.faa.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:f1e079a13647da7641cb9fb2417984df8a7b70655dfc1d166a2a3baa84cf2aeb 3 | size 19527 4 | -------------------------------------------------------------------------------- /Bangers/README.md: 
-------------------------------------------------------------------------------- 1 | # Bangers 2 | 3 | `B`eyond `A` `N`ucleotide `G`enerated `E`valuation of `R`elationships between `S`equences. 4 | 5 | This is a project of Mike and Rob, and you should probably ignore it. 6 | 7 | If you want to play, compile like so: 8 | 9 | ``` 10 | gcc -Wall -o bangers bangers.c -lz 11 | ``` 12 | 13 | and run like so: 14 | 15 | ``` 16 | ./bangers JQ995537.faa.gz 17 | ``` 18 | 19 | It will convert amino acid strings into something else 20 | -------------------------------------------------------------------------------- /Flinders/__init_.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | __author__ = 'Rob Edwards' 5 | 6 | from .substitution_rules import score 7 | 8 | __all__ = [ 9 | 'scores' 10 | ] 11 | -------------------------------------------------------------------------------- /LizMetagenomes/envs/focus.yaml: -------------------------------------------------------------------------------- 1 | name: focus 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - default 6 | dependencies: 7 | - focus 8 | -------------------------------------------------------------------------------- /LizMetagenomes/envs/prinseq.yaml: -------------------------------------------------------------------------------- 1 | name: prinseq-plus-plus 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - default 6 | dependencies: 7 | - prinseq-plus-plus 8 | -------------------------------------------------------------------------------- /LizMetagenomes/envs/superfocus.yaml: -------------------------------------------------------------------------------- 1 | name: superfocus 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - default 6 | dependencies: 7 | - super-focus 8 | -------------------------------------------------------------------------------- /ModelSEED/json_keys.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | Read some json files and print all keys 3 | """ 4 | 5 | import os 6 | import sys 7 | import argparse 8 | import json 9 | 10 | 11 | __author__ = 'Rob Edwards' 12 | 13 | if __name__ == "__main__": 14 | parser = argparse.ArgumentParser(description=' ') 15 | parser.add_argument('-f', help='json file', required=True) 16 | parser.add_argument('-v', help='verbose output', action='store_true') 17 | args = parser.parse_args() 18 | 19 | akeys = set() 20 | t = json.load(open(args.f, 'r')) 21 | 22 | print("{}".format("\n".join(t.keys()))) 23 | -------------------------------------------------------------------------------- /ModelSEED/json_keys_keys.py: -------------------------------------------------------------------------------- 1 | """ 2 | Read some json files and print all keys 3 | """ 4 | 5 | import os 6 | import sys 7 | import argparse 8 | 9 | from roblib import bcolors 10 | import json 11 | 12 | 13 | __author__ = 'Rob Edwards' 14 | 15 | if __name__ == "__main__": 16 | parser = argparse.ArgumentParser(description=' ') 17 | parser.add_argument('-f', help='json file', required=True) 18 | parser.add_argument('-v', help='verbose output', action='store_true') 19 | args = parser.parse_args() 20 | 21 | akeys = set() 22 | t = json.load(open(args.f, 'r')) 23 | for k in t: 24 | akeys.update(t[k].keys()) 25 | 26 | print("{}".format("\n".join(akeys))) 27 | -------------------------------------------------------------------------------- /ModelSEED/json_list_keys.py: -------------------------------------------------------------------------------- 1 | """ 2 | Read some json files and print all keys 3 | """ 4 | 5 | import os 6 | import sys 7 | import argparse 8 | 9 | from roblib import bcolors 10 | import json 11 | 12 | 13 | __author__ = 'Rob Edwards' 14 | 15 | if __name__ == "__main__": 16 | parser = argparse.ArgumentParser(description=' ') 17 | parser.add_argument('-f', help='json file', 
required=True) 18 | parser.add_argument('-v', help='verbose output', action='store_true') 19 | args = parser.parse_args() 20 | 21 | akeys = set() 22 | t = json.load(open(args.f, 'r')) 23 | for k in t: 24 | akeys.update(k.keys()) 25 | 26 | print("{}".format("\n".join(akeys))) 27 | -------------------------------------------------------------------------------- /PythonClass/parse_genbank.py: -------------------------------------------------------------------------------- 1 | 2 | from Bio import SeqIO 3 | 4 | seq = SeqIO.read('sequence.gb', 'genbank') 5 | print(seq.id) 6 | 7 | 8 | with open('features.tsv', 'w') as out: 9 | for feature in seq.features: 10 | if 'locus_tag' in feature.qualifiers: 11 | lt = feature.qualifiers['locus_tag'][0] 12 | if 'product' in feature.qualifiers: 13 | out.write(lt + "\t" + feature.qualifiers['product'][0] + "\n") -------------------------------------------------------------------------------- /PythonClass/random_sequence.py: -------------------------------------------------------------------------------- 1 | """ 2 | Generate a random sequence 3 | """ 4 | 5 | import os 6 | import sys 7 | import argparse 8 | from random import randint 9 | __author__ = 'Rob Edwards' 10 | 11 | 12 | def random_sequence(maxlen): 13 | """ 14 | Generate a random DNA sequence less than maxlen size 15 | """ 16 | 17 | bases = ["A", "G", "C", "T"] 18 | for i in range(maxlen): 19 | print(bases[randint(0,3)], end="") 20 | print() 21 | 22 | 23 | 24 | 25 | 26 | if __name__ == "__main__": 27 | parser = argparse.ArgumentParser(description=' ') 28 | parser.add_argument('-n', type=int, default=1000, 29 | help='maximum sequence length (default=1000)') 30 | parser.add_argument('-v', help='verbose output', action='store_true') 31 | args = parser.parse_args() 32 | 33 | random_sequence(args.n) 34 | 35 | 36 | 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /RAST/RAST-alljobs.pl: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # 3 | 4 | use strict; 5 | use Data::Dumper; 6 | $ENV{SAS_SERVER}="PUBSEED"; 7 | print STDERR "SAS is $ENV{SAS_SERVER}\n"; 8 | use Term::ReadKey; 9 | use RASTserver; 10 | 11 | ## Use RAST test, not regular RAST 12 | # Now using regular RAST 13 | 14 | 15 | print "Please enter your RAST username: "; 16 | my $user = ReadLine(0); 17 | chomp $user; 18 | 19 | print "Please enter your RAST password: "; 20 | ReadMode 2; 21 | my $password = ReadLine(0); 22 | chomp $password; 23 | ReadMode 1; 24 | print "\n"; 25 | 26 | 27 | my $rast=new RASTserver($user, $password); 28 | unless (defined $rast) {die "Can't connect ot the rast server"} 29 | 30 | print Dumper($rast->jobs()); 31 | -------------------------------------------------------------------------------- /RAST/RAST-jobs.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # 3 | # 4 | 5 | use strict; 6 | use RASTserver; 7 | use Term::ReadKey; 8 | use Data::Dumper; 9 | $ENV{SAS_SERVER}="PUBSEED"; 10 | 11 | print "Please enter your RAST username: "; 12 | my $user = ReadLine(0); 13 | chomp $user; 14 | 15 | print "Please enter your RAST password: "; 16 | ReadMode 2; 17 | my $password = ReadLine(0); 18 | chomp $password; 19 | ReadMode 1; 20 | print "\n"; 21 | 22 | my $rast=new RASTserver($user, $password); 23 | unless (defined $rast) {die "Can't connect ot the rast server"} 24 | 25 | my $time = time; my $job = 0; 26 | my @jobs = $rast->jobs(); 27 | 28 | foreach my $j (@jobs) { 29 | print Dumper($j); 30 | print STDERR $job++, " : ", (time() - $time), " seconds\n"; # elapsed = now - start, so the value is non-negative 31 | } 32 | -------------------------------------------------------------------------------- /RAST/RAST-status.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use RASTserver; 5 | $ENV{SAS_SERVER}="SEED"; 6 | use Term::ReadKey;
7 | 8 | die "$0 [-u username] [-p password] " unless (defined $ARGV[0]); 9 | 10 | my @ids; 11 | my $user; my $password; 12 | while (@ARGV) { 13 | my $t=shift @ARGV; 14 | if ($t eq "-u") {$user = shift @ARGV} 15 | elsif ($t eq "-p") {$password = shift @ARGV} 16 | else {push @ids, $t} 17 | } 18 | 19 | if (!$user) { 20 | print "Please enter your RAST username: "; 21 | $user = ReadLine(0); 22 | chomp $user; 23 | } 24 | 25 | if (!$password) { 26 | print "Please enter your RAST password: "; 27 | ReadMode 2; 28 | $password = ReadLine(0); 29 | chomp $password; 30 | ReadMode 1; 31 | print "\n"; 32 | } 33 | 34 | 35 | my $rast=new RASTserver($user, $password); 36 | unless (defined $rast) {die "Can't connect ot the rast server"} 37 | 38 | my $stat = $rast->status_of_RAST_job({-job => \@ids}); 39 | 40 | foreach my $job (sort {$a <=> $b} keys %$stat) { 41 | print join("\t", $job, $stat->{$job}->{'status'}), "\n"; 42 | } 43 | -------------------------------------------------------------------------------- /RAST/make_assigned_functions.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | # 3 | 4 | use strict; 5 | 6 | my $dir = shift || die "seed directory"; 7 | my $odir = shift; 8 | unless ($odir) {$odir = $dir} 9 | 10 | if (-e "$dir/assigned_functions") { 11 | print STDERR "Backing up assigned_functions\n"; 12 | `cp -f $dir/assigned_functions $dir/assigned_functions.bak`; 13 | } 14 | 15 | my %fn; 16 | open(IN, "$dir/proposed_non_ff_functions") ||die "$! proposed_non_ff_functions"; 17 | while() { 18 | chomp; 19 | my @a=split /\t/; 20 | $fn{$a[0]}=$a[1]; 21 | } 22 | close IN; 23 | 24 | open(IN, "$dir/proposed_functions") ||die "$! proposed_functions"; 25 | while() { 26 | chomp; 27 | my @a=split /\t/; 28 | $fn{$a[0]}=$a[1]; 29 | } 30 | close IN; 31 | 32 | 33 | 34 | open(OUT, ">$odir/assigned_functions") || die "$! 
assigned_functions"; 35 | map {print OUT "$_\t$fn{$_}\n"} sort {$a cmp $b} keys %fn; 36 | close OUT; 37 | 38 | 39 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Edwards Lab](https://img.shields.io/badge/Bioinformatics-EdwardsLab-03A9F4)](https://edwards.sdsu.edu/research) 2 | 3 | EdwardsLab 4 | ========== 5 | 6 | Code from the Edwards lab, including bioinformatics, image analysis and more. All this code is created and maintained by folks at Rob Edwards' bioinformatics lab at SDSU. 7 | 8 | For more information about the Edwards' lab see http://edwards.sdsu.edu/research. 9 | 10 | We use this repository to share our code and data with each other, and with the world. 11 | 12 | We make all this code available to everyone to use. If you find errors or bugs please let Rob Edwards know ... see the above URL for contact information. 13 | 14 | The [bin](bin) directory contains general scripts that we use on a day to day basis 15 | -------------------------------------------------------------------------------- /ViralBioinformaticsTools/README.md: -------------------------------------------------------------------------------- 1 | # Viral Bioinformatics Tools 2 | 3 | See [the Google Form](https://forms.gle/BaWcsAf6iqB7gkNGA) that you can fill in or the [submitted tools](https://docs.google.com/spreadsheets/d/1ClNgip08olKK-oBMMlPHBwIcilqSxsan8MEaYphUei4/edit?usp=sharing) 4 | 5 | This data was generated from those forms. 6 | 7 | See also the [Google Colab](https://colab.research.google.com/drive/1nsyMjnbjm_8AMR1FCubuTvNwU4KeqVQg?usp=sharing) notebook that uses this data. 
8 | 9 | 10 | -------------------------------------------------------------------------------- /ViralBioinformaticsTools/git_hub_dates.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:f405738a0a23e7b44db74810ffcc20005b999fe20b83c34d85acfd63f685328a 3 | size 4725 4 | -------------------------------------------------------------------------------- /ViralBioinformaticsTools/proj_start_stop.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:85aaa5789b4c2e2e13f07661bcf7b0f29b15ac897dfb49fdc646ccf81f3216c2 3 | size 2522 4 | -------------------------------------------------------------------------------- /ViralBioinformaticsTools/viral_bioinformatics_tools.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:b8658d0fa7f99b9f3f205bd9aa2d9fd2f452f037c77edbae994b6eff242fee4f 3 | size 13631 4 | -------------------------------------------------------------------------------- /VirusDiscoveryProject/DataSelection/README.md: -------------------------------------------------------------------------------- 1 | # Data Selection 2 | 3 | First, this is a temporary location, these files should be in the VirusDiscoveryProject repo. 4 | 5 | Take a look at the [datasets](datasets.ipynb) jupyter notebook that summarizes everything. 6 | 7 | The [WGS datasets](wgs_datasets.tsv.gz) file (*Note*: This is gzip compressed) has all the raw data. This is a `.tsv` file and so you can load it into a spreadsheet program. 8 | 9 | The [random selection](random_selection.txt) file has a set of 1,000 metagenomes chosen at random. 10 | 11 | The [size selection](size_selection.txt) file has 999 metagenome IDs chosen as small, medium, or large data sets. 
12 | 13 | The [phage selection](phage_size_selection.txt) file has 999 metagenome IDs chosen as small, medium, or large data sets but that have the most number of phage fragments. 14 | 15 | 16 | We should use the data sets listed in the [phage selection](phage_size_selection.txt) file. 17 | -------------------------------------------------------------------------------- /VirusDiscoveryProject/DataSelection/wgs_datasets.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:7187ab967e1b6b4f141a3028b839d5850f9163b8eadd7225b92aa96c03aaab8c 3 | size 5003956 4 | -------------------------------------------------------------------------------- /assembly/mummerplot.yaml: -------------------------------------------------------------------------------- 1 | # YAML config file for the mummerplot snakefile 2 | 3 | #directory where the fasta files reside 4 | fasta: fasta 5 | 6 | # the directory for reverse complemented fasta files as required 7 | fasta_rc: fasta_rc 8 | 9 | # where to write the mummer output (*.mums) 10 | mummer_output: mummer 11 | 12 | # where to write the png files 13 | mummer_png: mummer_png 14 | 15 | # final montage output plot 16 | montage: "mummer_montage.png" 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /bam/bam2fasta.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import pysam 4 | 5 | __author__ = 'Rob Edwards' 6 | 7 | 8 | def qual2fastq(quals): 9 | """ 10 | Convert a list of quality scores to a single fastq line 11 | 12 | :param quals: A list of quality scores 13 | :type quals: list 14 | :return: A fastq quality string 15 | :rtype: str 16 | """ 17 | quality = [chr(q) for q in quals] 18 | return "".join(quality) 19 | 20 | if __name__ == '__main__': 21 | parser = argparse.ArgumentParser(description='Convert bam to fastq') 22 | 
parser.add_argument('-b', help='bam file', required=True) 23 | parser.add_argument('-v', help='verbose output') 24 | args = parser.parse_args() 25 | 26 | bamfile = pysam.AlignmentFile(args.b, "rb") 27 | for read in bamfile.fetch(until_eof=True): 28 | print(">{}\n{}".format(read.query_name, read.query_sequence)) 29 | -------------------------------------------------------------------------------- /bam/bam2fastq.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import pysam 4 | 5 | __author__ = 'Rob Edwards' 6 | 7 | 8 | def qual2fastq(quals): 9 | """ 10 | Convert a list of quality scores to a single fastq line 11 | 12 | :param quals: A list of quality scores 13 | :type quals: list 14 | :return: A fastq quality string 15 | :rtype: str 16 | """ 17 | quality = [chr(q + 33) for q in quals] 18 | return "".join(quality) 19 | 20 | if __name__ == '__main__': 21 | parser = argparse.ArgumentParser(description='Convert bam to fastq') 22 | parser.add_argument('-b', help='bam file', required=True) 23 | parser.add_argument('-v', help='verbose output') 24 | args = parser.parse_args() 25 | 26 | bamfile = pysam.AlignmentFile(args.b, "rb") 27 | for read in bamfile.fetch(until_eof=True): 28 | if read.query_qualities: 29 | print("@{}\n{}\n+\n{}".format(read.query_name, read.query_sequence, qual2fastq(read.query_qualities))) 30 | else: 31 | print("@{}\n{}\n+\n".format(read.query_name, read.query_sequence)) 32 | -------------------------------------------------------------------------------- /bam/list_reads.py: -------------------------------------------------------------------------------- 1 | """ 2 | List all the reads that map to a bam file 3 | """ 4 | 5 | import os 6 | import sys 7 | import argparse 8 | import pysam 9 | 10 | parser = argparse.ArgumentParser(description="List all the reads in a bam file") 11 | parser.add_argument('-b', help='bam file', required=True) 12 | parser.add_argument('-v', help='verbose output', 
action="store_true") 13 | args = parser.parse_args() 14 | 15 | bamfile = pysam.AlignmentFile(args.b, "rb") 16 | for read in bamfile.fetch(until_eof=True): 17 | print(read.query_name) 18 | -------------------------------------------------------------------------------- /bin/Makefile: -------------------------------------------------------------------------------- 1 | # A simple Makefile to compile the C code here 2 | # 3 | 4 | FLAGS := $(FLAGS) -Wall -O3 5 | override CFLAGS += $(shell pkg-config --cflags --libs python3-embed) 6 | 7 | all: 8 | gcc $(FLAGS) -o fastq_avqual fastq_avqual.c -lz 9 | gcc $(FLAGS) -o count_fasta count_fasta.c -lz 10 | # c++ $(FLAGS) -o fq2fa fastq2fasta.cpp 11 | gcc $(FLAGS) -o fastq2fasta fastq2fasta.c -lz 12 | gcc $(FLAGS) -o count_fastq count_fastq.c -lz 13 | gcc $(FLAGS) -o fastg2gfa fastg2gfa.c -lz 14 | gcc $(FLAGS) -o fasta_split fasta_split.c -lz 15 | 16 | clean: 17 | rm -f fastq_avqual count_fasta fastq2fasta count_fastq fastg2gfa fasta_split 18 | 19 | -------------------------------------------------------------------------------- /bin/all_4mers.py: -------------------------------------------------------------------------------- 1 | """ 2 | print all 4 mers 3 | """ 4 | 5 | import os 6 | import sys 7 | import argparse 8 | 9 | from roblib import bcolors 10 | import itertools 11 | 12 | alphabet = ['A', 'C', "G", 'T'] 13 | c=0 14 | for a in itertools.product(alphabet, repeat=4): 15 | if (c == 16): 16 | print() 17 | c=0 18 | print("".join(a), end=' ') 19 | c+=1 20 | print() -------------------------------------------------------------------------------- /bin/average_quality_scores.pl: -------------------------------------------------------------------------------- 1 | #__perl__ 2 | # 3 | # Calculate the average quality score of one or more files 4 | # 5 | # Usage: average_quality_scores.pl [-l] 6 | # 7 | # -l initiates printing all scores for each file otherwise 8 | # a summary is produced 9 | 10 | use strict; 11 | use Rob; 12 | my $rob 
= new Rob; 13 | my ($total, $n)=(0,0); 14 | my ($min, $max)=(10000, 0); 15 | 16 | my $printall = 0; 17 | 18 | foreach my $f (@ARGV) { 19 | if ($f eq "-l") {$printall = 1; next} 20 | my $qu = $rob->read_fasta($f, 1); 21 | foreach my $id (keys %$qu) { 22 | my @qual = split /\s+/, $qu->{$id}; 23 | my $t=0; 24 | map {$t+=$_} @qual; 25 | my $av = $t/($#qual+1); 26 | $printall && print "$id\t$av\n"; 27 | $total+=$t; 28 | $n+=$#qual+1; 29 | $av > $max ? $max = $av : 1; 30 | $av < $min ? $min = $av : 1; 31 | } 32 | } 33 | print "TOTAL: $total NBASES: $n MINIMUM: $min MAXIMUM: $max AVERAGE: ", $total/$n, "\n"; 34 | 35 | -------------------------------------------------------------------------------- /bin/blast2seq.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from roblib import read_fasta 5 | import argparse 6 | __author__ = 'Rob Edwards' 7 | 8 | 9 | -------------------------------------------------------------------------------- /bin/checkR1R2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | if [[ $# != 1 ]]; then echo "$0 " >&2; exit; fi 5 | 6 | # Check that we have an R2 for every R1 and vice-versa 7 | 8 | for R1 in $(find $1 -name \*R1\*); do 9 | R2=${R1/R1/R2}; 10 | if [[ ! -e $R2 ]]; then echo "Not found: $R2 for associated R1: $R1" >&2; fi 11 | done 12 | 13 | 14 | for R2 in $(find $1 -name \*R2\*); do 15 | R1=${R2/R2/R1}; 16 | if [[ ! 
-e $R1 ]]; then echo "Not found: $R1 for associated R2: $R2" >&2; fi 17 | done 18 | 19 | -------------------------------------------------------------------------------- /bin/cpgs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Read a fastq file and count CpGs 3 | """ 4 | 5 | import os 6 | import sys 7 | import argparse 8 | 9 | from roblib import bcolors 10 | from roblib import stream_fastq 11 | 12 | def countcpgs(fqfile): 13 | """ 14 | Count the CpGs in a file 15 | :param fqfile: the fastq file 16 | :return: 17 | """ 18 | 19 | count = {} 20 | for seqid, header, seq, qual in stream_fastq(fqfile): 21 | cg = seq.count('CG') 22 | count[cg] = count.get(cg, 0) + 1 23 | return count 24 | 25 | 26 | if __name__ == '__main__': 27 | parser = argparse.ArgumentParser(description='Count CGs in a fastq file') 28 | parser.add_argument('-f', help='fastq file', required=True) 29 | parser.add_argument('-v', help='verbose output', action='store_true') 30 | args = parser.parse_args() 31 | 32 | count = countcpgs(args.f) 33 | for c in sorted(list(count.keys())): 34 | print(f"{c}\t{count[c]}") 35 | -------------------------------------------------------------------------------- /bin/crc64.py: -------------------------------------------------------------------------------- 1 | """ 2 | Calculate the crc64 checksum of a fasta sequence. 
3 | """ 4 | 5 | import os 6 | import sys 7 | import argparse 8 | from crc64iso.crc64iso import crc64 9 | from roblib import stream_fasta 10 | 11 | __author__ = 'Rob Edwards' 12 | 13 | if __name__ == "__main__": 14 | parser = argparse.ArgumentParser(description=' ') 15 | parser.add_argument('-f', help='fasta file', required=True) 16 | parser.add_argument('-o', help='output file (default -)', default=sys.stdout) 17 | parser.add_argument('-v', help='verbose output', action='store_true') 18 | args = parser.parse_args() 19 | 20 | for seqid, seq in stream_fasta(args.f): 21 | print(f"{seqid}\t{crc64(seq)}") -------------------------------------------------------------------------------- /bin/dump_all_tables.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sqlite3 3 | import pandas as pd 4 | 5 | 6 | def to_csv(filename): 7 | db = sqlite3.connect(filename) 8 | cursor = db.cursor() 9 | cursor.execute("SELECT name FROM sqlite_master WHERE type='table';") 10 | tables = cursor.fetchall() 11 | for table_name in tables: 12 | table_name = table_name[0] 13 | table = pd.read_sql_query("SELECT * from %s" % table_name, db) 14 | table.to_csv(table_name + '.csv', index_label='index', encoding='utf-8') 15 | 16 | 17 | if __name__ == '__main__': 18 | parser = argparse.ArgumentParser(description='Dump the contents of an SQL file to CSV. 
This was taken from http://stackoverflow.com/questions/305378/get-list-of-tables-db-schema-dump-etc-in-sqlite-databases') 19 | parser.add_argument('-d', help='SQLlite database file', required=True) 20 | args = parser.parse_args() 21 | to_csv(args.d) 22 | -------------------------------------------------------------------------------- /bin/extract.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import sys 4 | 5 | __author__ = 'Rob Edwards' 6 | 7 | print('CWD' + os.getcwd()) 8 | 9 | fname = os.path.join(os.environ['HOME'], 'Dropbox/Metagenomics/51.hits_small.fa.lrz') 10 | 11 | f = subprocess.Popen(['/usr/bin/lrunzip', '-q', '-d', '-f', '-o-', fname], stdout=subprocess.PIPE).stdout 12 | 13 | for l in f: 14 | print("READ: {}".format(l)) -------------------------------------------------------------------------------- /bin/extract_fasta_sequence.pl: -------------------------------------------------------------------------------- 1 | =pod 2 | 3 | Extract one or more sequences from a fasta file, but not using much memory 4 | 5 | =cut 6 | 7 | use strict; 8 | 9 | unless ($#ARGV >= 1) { 10 | die "$0 [list of sequences to extract]\n"; 11 | } 12 | 13 | my $faf = shift; 14 | my %want; 15 | map {$want{$_}=1} @ARGV; 16 | 17 | if ($faf =~ /.gz$/) { 18 | open(IN, "gunzip -c $faf|") || die "Can't open a pipe to $faf"; 19 | } else { 20 | open(IN, $faf) || die "$! 
def factorial(n):
    """
    Calculate n! recursively.

    :param n: a non-negative integer
    :return: the factorial of n (0! and 1! are both 1)
    :raises ValueError: if n is negative
    """
    if n < 0:
        raise ValueError("factorial is not defined for negative numbers")
    # The base case must cover 0 and 1 as well as 2: stopping only at n == 2
    # sends factorial(0) and factorial(1) into unbounded recursion.
    if n <= 1:
        return 1
    return n * factorial(n-1)
eval {$s=$c} : 1; 14 | $c++; 15 | } 16 | 17 | print "$tag is at $s, and the file is $c. We have done ", int(($s/$c)*100000)/1000, " percent\n"; 18 | -------------------------------------------------------------------------------- /bin/fastq2fasta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Convert a fastq file to a fasta file. Note in this case I just ignore the quailty scores! 3 | """ 4 | 5 | import os 6 | import sys 7 | import argparse 8 | from roblib import stream_fastq 9 | 10 | __author__ = 'Rob Edwards' 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser(description=' ') 14 | parser.add_argument('-f', help='input fastq file', required=True) 15 | parser.add_argument('-o', help='output fasta file', required=True) 16 | args = parser.parse_args() 17 | 18 | with open(args.o, 'w') as out: 19 | for (sid, label, seq, qual) in stream_fastq(args.f): 20 | out.write(">{}\n{}\n".format(sid, seq)) 21 | -------------------------------------------------------------------------------- /bin/fastq_avqual.c: -------------------------------------------------------------------------------- 1 | /* Average the quality scores in a fastq file 2 | * 3 | * Rob Edwards, 10/12/21 4 | * 5 | */ 6 | 7 | 8 | #include 9 | #include 10 | #include 11 | #include "kseq.h" 12 | 13 | KSEQ_INIT(gzFile, gzread) 14 | 15 | #define table_size 10000 16 | 17 | int main(int argc, char *argv[]) { 18 | 19 | if ( argc < 2) { 20 | printf("Usage: %s \n", argv[0]); 21 | return 1; 22 | } 23 | gzFile fp; 24 | kseq_t *seq; 25 | int c=0; 26 | long total=0; 27 | long n=0; 28 | 29 | fp = gzopen(argv[1], "r"); 30 | seq = kseq_init(fp); 31 | int l; 32 | while ((l = kseq_read(seq)) >= 0) { 33 | c++; 34 | for (int i = 0; i < strlen(seq->qual.s); i++) { 35 | total+= (int) seq->qual.s[i]; 36 | n++; 37 | } 38 | } 39 | kseq_destroy(seq); 40 | gzclose(fp); 41 | printf("File\tNumber of sequences\tTotal bp\tTotal quality\tAverage quality\n"); 42 | 
printf("%s\t%d\t%ld\t%ld\t%ld\n", argv[1], c, n, total, total/n); 43 | 44 | } 45 | 46 | -------------------------------------------------------------------------------- /bin/filter_fasta_length.py: -------------------------------------------------------------------------------- 1 | """ 2 | Stream a fasta file and print it out 3 | """ 4 | 5 | import os 6 | import sys 7 | import argparse 8 | from roblib import sequences 9 | if __name__ == '__main__': 10 | parser = argparse.ArgumentParser(description="stream the contents of a fasta file") 11 | parser.add_argument('-f', help='file to stream', required=True) 12 | parser.add_argument('-m', help='minimum sequence length', type=int, default=1000) 13 | args = parser.parse_args() 14 | 15 | for (seqid, seq) in sequences.stream_fasta(args.f): 16 | if len(seq) > args.m: 17 | print(">{}\n{}".format(seqid, seq)) 18 | -------------------------------------------------------------------------------- /bin/filter_seq_by_length.py: -------------------------------------------------------------------------------- 1 | """ 2 | Filter a fasta file for sequences longer than a specified length 3 | """ 4 | 5 | import os 6 | import sys 7 | import argparse 8 | from roblib import sequences 9 | 10 | __author__ = 'Rob Edwards' 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser(description='Filter a fasta file for sequences longer than a specified length') 14 | parser.add_argument('-f', help='fasta file', required=True) 15 | parser.add_argument('-l', help='Minimum length to filter on (seq >= this number)', type=int) 16 | args = parser.parse_args() 17 | 18 | for seqid, seq in sequences.stream_fasta(args.f): 19 | if len(seq) >= args.l: 20 | print(">{}\n{}".format(seqid, seq)) -------------------------------------------------------------------------------- /bin/genbank2fna.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | 3 | # get DNA out of a genbank file 4 | 5 | 
if __name__ == "__main__":
    # Count how often a motif, and its reverse complement, occurs in each
    # sequence of a genbank file, and print one tab separated line per sequence.
    parser = argparse.ArgumentParser(description='Count the motif and its occurrence in a sequence')
    parser.add_argument('-f', help='input genbank file', required=True)
    # The motif is mandatory: without it args.m is None and .upper() fails
    # with an AttributeError instead of a usage message.
    parser.add_argument('-m', help='motif to look for', type=str, required=True)
    parser.add_argument('-v', help='verbose output', action='store_true')
    args = parser.parse_args()

    motif = args.m.upper()
    # The reverse complement does not change between records, so compute it once.
    rcmotif = rc(motif)

    print("File\tContig\tLength\tNumber matches")
    for seq in genbank_seqio(args.f, args.v):
        dna = seq.seq.upper()
        # NOTE: if the motif is its own reverse complement both counts are the
        # same, so every occurrence is counted twice.
        count = dna.count(motif)
        count += dna.count(rcmotif)
        print(f"{args.f}\t{seq.id}\t{len(dna)}\t{count}")
#!/usr/bin/perl
# Download a single nucleotide record from NCBI in genbank format and save it
# as <id>_sequences.gbk in the current directory. The URL that was requested
# is printed to STDOUT.
use strict;
use LWP::Simple;
use Data::Dumper;

my $id = shift || die "$0 ";

my $url='http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&retmode=text&rettype=gb&id=' . $id;

# Fetch before opening the output file: get() returns undef when the request
# fails, and we do not want to leave an empty .gbk file behind in that case.
my $gbk = get($url);
defined $gbk || die "Can't retrieve $url";

open(OUT, ">${id}_sequences.gbk") || die "Can't write to ${id}_sequences.gbk";
print OUT $gbk;
# closing explicitly lets us notice a failed write (e.g. a full disk)
close(OUT) || die "Can't close ${id}_sequences.gbk";
print "$url\n";
exit 0;
join(",", @pieces); 25 | while (time-$time < 1.5) {sleep 1} 26 | print STDERR "Getting upto $total\n"; 27 | $time=time; 28 | print OUT get($urlid); 29 | } 30 | close OUT; 31 | 32 | exit 0; 33 | 34 | -------------------------------------------------------------------------------- /bin/get_lastlogs.sh: -------------------------------------------------------------------------------- 1 | DATE=`date +'%Y%m%d'` 2 | for i in anthill.sdsu.edu edwards-data.sdsu.edu rambox phantome.org edwards-dna; do 3 | echo $i; 4 | ssh $i 'lastlog' > $i.$DATE.lastlog 5 | done 6 | 7 | 8 | python2.7 ~/EdwardsLab/bin/merge_last_logs.py -l anthill.sdsu.edu.$DATE.lastlog -l edwards-data.sdsu.edu.$DATE.lastlog -l rambox.$DATE.lastlog -l phantome.org.$DATE.lastlog -l edwards-dna.$DATE.lastlog > lastlog.$DATE 9 | 10 | -------------------------------------------------------------------------------- /bin/getopt.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | using namespace std; 7 | 8 | int main (int argc, char* argv[]) { 9 | if ( argc < 3 ) { 10 | cerr << "Please add some options\n"; 11 | return 1; 12 | } 13 | 14 | int replace = 0; 15 | for (;;) { 16 | switch(getopt(argc, argv, "r")) { 17 | default: 18 | cerr << " fount " << optarg << "\n"; 19 | case -1: 20 | cerr << "Found -1\n"; 21 | break; 22 | case 'r': 23 | cout << "Flag r set"; 24 | replace = 1; 25 | continue; 26 | } 27 | break; 28 | } 29 | cerr << "argc: " << argc << "\n"; 30 | cerr << "Optind: " << optind << "\n"; 31 | if (optind +2 != argc) { 32 | cerr << " "; 33 | return 1; 34 | } 35 | char* fqf = argv[optind]; 36 | char* faf = argv[optind+1]; 37 | cout << "Fastq: " << fqf << " Fasta: " << faf << "\n"; 38 | } 39 | 40 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /bin/gfa2fasta.sh: -------------------------------------------------------------------------------- 1 | if [ -z $1 ]; 
def primaryId(tag):
    """
    Filter for tags that are the primary BioSample identifier.

    :param tag: an object with a dict-style ``get`` method for its attributes
    :return: the value of the ``is_primary`` attribute when the ``db``
        attribute is ``BioSample``, otherwise a false value
    """
    # Use .get() rather than indexing: indexing raises KeyError on any tag
    # that has no db (or is_primary) attribute instead of rejecting it.
    return tag.get('db') == 'BioSample' and tag.get('is_primary')
if __name__ == "__main__":
    # Print every sequence in the fasta file with a new identifier made from
    # a label and a running number that starts at 1.
    argp = argparse.ArgumentParser(description='Rename the sequences in a fasta file')
    argp.add_argument('-f', help='input fasta file', required=True)
    argp.add_argument('-r', help='string to rename the seqids to. Default=use the file name')
    argp.add_argument('-v', help='verbose output', action='store_true')
    args = argp.parse_args()

    # fall back to the input file name when no label was given
    ren = args.r if args.r else args.f

    for counter, (seqid, seq) in enumerate(stream_fasta(args.f), start=1):
        print(f">{ren}_{counter}\n{seq}")
if __name__ == '__main__':
    # Simulation: two running totals each start with one random value from
    # 1..5 and both get another random value added until they are equal.
    # The value at which they meet is recorded, and the mean over all the
    # trials is printed.
    durations = (1, 2, 3, 4, 5)
    trials = 1000
    grand_total = 0
    for _ in range(trials):
        me = choice(durations)
        sister = choice(durations)
        while me != sister:
            me += choice(durations)
            sister += choice(durations)
        grand_total += me
    print(1.0 * grand_total / trials)
pipe to qstat"; 4 | my $job; 5 | my %type; 6 | while () { 7 | chomp; 8 | next if (/^job-ID/ || /^\-/); 9 | my @a=split /\s+/; 10 | $job->{$a[2]}->{$a[4]}++; 11 | $type{$a[4]}=1; 12 | } 13 | 14 | my @t = sort {lc($a) cmp lc($b)} keys %type; 15 | print join("\t", "Job ", @t), "\n"; 16 | 17 | foreach my $j (sort {lc($a) cmp lc($b)} keys %{$job}) { 18 | print $j; 19 | foreach my $t (@t) { 20 | print $job->{$j}->{$t} ? "\t". $job->{$j}->{$t} : "\t0"; 21 | } 22 | print "\n"; 23 | } 24 | 25 | 26 | -------------------------------------------------------------------------------- /bin/sort_fasta_by_len.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | =pod 4 | 5 | Just sort a fasta file by the length of the sequences. 6 | 7 | Default is longest -> shortest. 8 | 9 | =cut 10 | 11 | use strict; 12 | use Getopt::Std; 13 | use Data::Dumper; 14 | use Rob; 15 | my $rob = new Rob; 16 | 17 | my %opts; 18 | getopts('f:vr', \%opts); 19 | unless ($opts{f}) { 20 | die <read_fasta($opts{f}); 29 | my @keys; 30 | if ($opts{r}) {@keys = sort {length($fa->{$a}) <=> length($fa->{$b})} keys %$fa} 31 | else {@keys = sort {length($fa->{$b}) <=> length($fa->{$a})} keys %$fa} 32 | 33 | map {print ">$_\n", $fa->{$_}, "\n"} @keys; 34 | 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /bin/sort_fasta_by_len_lengths_only.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | =pod 4 | 5 | Just sort a fasta file by the length of the sequences and print out the sequence lengths 6 | 7 | Default is longest -> shortest. 
8 | 9 | =cut 10 | 11 | use strict; 12 | use Getopt::Std; 13 | use Data::Dumper; 14 | use Rob; 15 | my $rob = new Rob; 16 | 17 | my %opts; 18 | getopts('f:vr', \%opts); 19 | unless ($opts{f}) { 20 | die <read_fasta($opts{f}); 29 | my @keys; 30 | if ($opts{r}) {@keys = sort {length($fa->{$a}) <=> length($fa->{$b})} keys %$fa} 31 | else {@keys = sort {length($fa->{$b}) <=> length($fa->{$a})} keys %$fa} 32 | 33 | map {print length($fa->{$_}), "\t$_\n"} @keys; 34 | 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /bin/stream_fasta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Stream a fasta file and print it out 3 | """ 4 | 5 | import os 6 | import sys 7 | import argparse 8 | from roblib import sequences 9 | if __name__ == '__main__': 10 | parser = argparse.ArgumentParser(description="stream the contents of a fasta file") 11 | parser.add_argument('-f', help='file to stream', required=True) 12 | args = parser.parse_args() 13 | 14 | for (seqid, seq) in sequences.stream_fasta(args.f): 15 | print("{}\t{}".format(seqid, seq)) -------------------------------------------------------------------------------- /bin/test.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test stuff! 
# Download the NCBI nr and nt BLAST databases into date stamped directories
# under /home2/db/blast and point the nr and nt symlinks at the new copies.

# Stop at the first failure. Without this a failed cd, download or checksum
# would still reach the rm/ln below and replace a working database link.
set -eu

DATE=$(date +%Y%m%d)
BLASTDIR=/home2/db/blast

# update_db <name>: fetch, verify and unpack one database, then relink it
update_db() {
	cd "$BLASTDIR"
	# -p so that rerunning on the same day reuses the directory
	mkdir -p "$1_$DATE"
	cd "$1_$DATE"
	ncftpget "ftp://ftp.ncbi.nlm.nih.gov/blast/db/$1*"
	cat *md5 > all.md5
	md5sum -c all.md5
	for t in *.tar.gz; do echo "$t"; tar xf "$t"; done
	cd "$BLASTDIR"
	rm -f "$1"
	ln -s "$1_$DATE" "$1"
}

update_db nr
update_db nt
27 | plt.plot([sdlon, brislon], [sdlat, brislat], color='blue', linewidth=2, transform=ccrs.Geodetic()) 28 | 29 | 30 | 31 | plt.show() 32 | 33 | 34 | if __name__ == '__main__': 35 | main() 36 | -------------------------------------------------------------------------------- /cluster/submit2cluster_edwards: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | 3 | # submit a job using my new method 4 | 5 | use strict; 6 | use lib '/home/redwards/perl/lib/perl5/site_perl/5.8.8/'; 7 | use Schedule::SGE; 8 | 9 | my $usage=< 12 | 13 | OPTIONS: 14 | -N name (default=first word of command) 15 | -P project (default=redwards) 16 | EOF 17 | 18 | my ($name, $project, $command)=('', 'redwards', ''); 19 | while (@ARGV) { 20 | my $test=shift @ARGV; 21 | if ($test eq "-N") {$name=shift @ARGV} 22 | elsif ($test eq "-P") {$project=shift @ARGV} 23 | else {$command .= " ". $test} 24 | } 25 | 26 | die $usage unless $command; 27 | 28 | unless ($name) { 29 | $command =~ m/^(\S+)/; $name=$1; 30 | } 31 | 32 | 33 | my $sge=Schedule::SGE->new( 34 | -executable => {qsub=>'/usr/local/bin/qsub', qstat=>'/usr/local/bin/qstat'}, 35 | -name => $name, 36 | -verbose => 0, 37 | -notify => 1, 38 | -mailto => 'rob@salmonella.org', 39 | ); 40 | 41 | $sge->command($command); 42 | 43 | my $pid=$sge->execute(); 44 | exit(0); 45 | 46 | 47 | -------------------------------------------------------------------------------- /cpp/fastq/include/stream_fastq.h: -------------------------------------------------------------------------------- 1 | #ifndef STREAM_FASTQ_H 2 | #define STREAM_FASTQ_H 3 | 4 | 5 | class stream_fastq 6 | { 7 | public: 8 | stream_fastq(); 9 | virtual ~stream_fastq(); 10 | protected: 11 | private: 12 | }; 13 | 14 | #endif // STREAM_FASTQ_H 15 | -------------------------------------------------------------------------------- /cpp/fastq/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 
| 3 | using namespace std; 4 | 5 | int main() 6 | { 7 | cout << "Hello world!" << endl; 8 | return 0; 9 | } 10 | -------------------------------------------------------------------------------- /cpp/fastq/src/stream_fastq.cpp: -------------------------------------------------------------------------------- 1 | #include "stream_fastq.h" 2 | 3 | stream_fastq::stream_fastq() 4 | { 5 | //ctor 6 | } 7 | 8 | stream_fastq::~stream_fastq() 9 | { 10 | //dtor 11 | } 12 | -------------------------------------------------------------------------------- /crAssphage/collapse_bam_variants.py: -------------------------------------------------------------------------------- 1 | """ 2 | Collapse variants from a bamfile and try and make as few variants as possible 3 | """ 4 | 5 | import os 6 | import sys 7 | import argparse 8 | from roblib import read_fasta 9 | 10 | if __name__ == '__main__': 11 | parser = argparse.ArgumentParser(description="Collapse all variants from a bam file and make as few options as possible") 12 | parser.add_argument('-b', help='bam file', required=True) 13 | parser.add_argument('-r', help='reference fasta sequence', required=True) 14 | parser.add_argument('-s', help='start of the region to look at (default = whole sequence)', default=0, type=int) 15 | parser.add_argument('-e', help='end of sequence to look at (default = whole sequence)', type=int) 16 | parser.add_argument('-v', help='verbose output', action="store_true") 17 | args = parser.parse_args() 18 | 19 | fa = read_fasta(args.r) 20 | 21 | if not args.e: 22 | args.e = max([len(fa[f]) for f in fa]) 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /crAssphage/phylip2clustal.py: -------------------------------------------------------------------------------- 1 | """ 2 | A simple BioPython converter to move from phylip to clustal formats for the alignments 3 | """ 4 | 5 | import os 6 | import sys 7 | 8 | import argparse 9 | from Bio import SeqIO 10 | 11 | 
def column_with_total(path):
    """
    Read the second tab separated column of a file and add the column total.

    The first value read is kept as it is and is not part of the total. The
    sum of all the remaining values (converted to int) is inserted directly
    after it. Lines with fewer than two columns, such as blank lines, are
    skipped.

    :param path: the file to read
    :return: a list of the first value, the total, and then the other values
    :raises ValueError: if a value after the first is not an integer
    """
    data = []
    with open(path, 'r') as fin:
        for l in fin:
            p = l.strip().split("\t")
            if len(p) < 2:
                # nothing in the second column: indexing p[1] would fail
                continue
            data.append(p[1])
    data.insert(1, sum(map(int, data[1:])))
    return data


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Read a directory of files and join them as a single file")
    parser.add_argument('-d', help='directory of output files from coverage_depth.py', required=True)
    args = parser.parse_args()

    # sorted() makes the order of the output rows reproducible; os.listdir
    # returns the entries in arbitrary order.
    for f in sorted(os.listdir(args.d)):
        path = os.path.join(args.d, f)
        if not os.path.isfile(path):
            # subdirectories cannot be opened as data files
            continue
        print("\t".join(map(str, column_with_total(path))))
if __name__ == '__main__':
    # Print the sequences from a fasta file whose IDs were given with -i.
    parser = argparse.ArgumentParser(description=" ")
    parser.add_argument('-f', help='fasta file', required=True)
    # At least one ID is required: without -i args.i is None and the
    # membership test below raises a TypeError.
    parser.add_argument('-i', help='sequence id, (multiple allowed)', nargs='+', required=True)
    parser.add_argument('-v', help='verbose output', action='store_true')
    args = parser.parse_args()

    # a set gives constant time lookups however many IDs were requested
    wanted = set(args.i)

    for seqid, seq in stream_fasta(args.f):
        if seqid in wanted:
            print(f">{seqid}\n{seq}\n")
| continue 22 | print(">{}\n{}".format(seqid, seq)) 23 | 24 | 25 | if __name__ == '__main__': 26 | parser = argparse.ArgumentParser(description="Filter a file based on length") 27 | parser.add_argument('-f', help='the fasta file to filter', required=True) 28 | parser.add_argument('-l', help='minimum length (default=1000)', default=1000, type=int) 29 | parser.add_argument('-v', help='verbose output', action="store_true") 30 | args = parser.parse_args() 31 | 32 | length_filter(args.f, args.l, args.v) -------------------------------------------------------------------------------- /fasta/lengths.py: -------------------------------------------------------------------------------- 1 | """ 2 | Print the IDs and lengths of sequences in a fasta file 3 | """ 4 | 5 | import os 6 | import sys 7 | import argparse 8 | from roblib import stream_fasta 9 | 10 | parser = argparse.ArgumentParser(description="Print the lengths of sequences in a fasta file") 11 | parser.add_argument('-f', help='fasta file', required=True) 12 | parser.add_argument('-w', help='whole sequence ID. 
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Print the average quality score of each sequence in a fastq file")
    parser.add_argument('-f', help='file', required=True)
    parser.add_argument('-o', help='output file (default: stdout)')
    parser.add_argument('-v', help='verbose output', action='store_true')
    args = parser.parse_args()

    # Honour -o, which used to be parsed but ignored; fall back to stdout.
    out = open(args.o, 'w') if args.o else sys.stdout
    try:
        out.write("SeqID\tLength\tAverage Qual\n")
        for sid, seqid, seq, qual in stream_fastq(args.f):
            q2n = list(qual_to_numbers(qual))
            # A record with an empty quality string would otherwise raise
            # ZeroDivisionError and abort the whole run; report 0 instead.
            av = sum(q2n) / len(q2n) if q2n else 0
            out.write(f"{sid}\t{len(seq)}\t{av}\n")
    finally:
        # never close stdout, only a file we opened ourselves
        if out is not sys.stdout:
            out.close()
-------------------------------------------------------------------------------- /fastq/filter_fastq_length.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "kseq.h" 6 | 7 | KSEQ_INIT(gzFile, gzread); 8 | 9 | // compile with: 10 | // gcc -I../include -o filter_fastq_length ./filter_fastq_length.c -lz 11 | 12 | 13 | int main(int argc, char* argv[]) { 14 | if (argc < 2) { 15 | fprintf(stderr, "%s \n", argv[0]); 16 | exit(1); 17 | } 18 | 19 | gzFile fp; 20 | kseq_t *seq; 21 | fp = gzopen(argv[1], "r"); 22 | seq = kseq_init(fp); 23 | int l; 24 | int kept = 0; 25 | int dropped = 0; 26 | int minlen = atoi(argv[2]); 27 | // fprintf(stderr, "Filtering %s. Reads shorter than %d will be ignored\n", argv[1], minlen); 28 | while ((l = kseq_read(seq)) >= 0) { 29 | if (seq->seq.l > minlen) { 30 | printf("@%s %s\n%s\n+\n%s\n", seq->name.s, seq->comment.s, seq->seq.s, seq->qual.s); 31 | kept++; 32 | } else{ 33 | dropped++; 34 | } 35 | } 36 | fprintf(stderr, "Kept: %d Dropped: %d\n", kept, dropped); 37 | return 0; 38 | } 39 | 40 | 41 | -------------------------------------------------------------------------------- /fastq/test.fastq.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:b526c90ab96ca8ec6bacfbfec56bc760991c7e0c38ed055f8776cccebd647df4 3 | size 78966 4 | -------------------------------------------------------------------------------- /fastq/trim_primers.py: -------------------------------------------------------------------------------- 1 | """ 2 | Please note: This is a practice for code that will end up in primer-trimming github (that we should rename). 
# this is taken from SO: https://stackoverflow.com/questions/18683821/generating-random-correlated-x-and-y-points-using-numpy

# Two value ranges; only their mean and standard deviation are used below.
x_range = np.array([-0.51, 51.2])
y_range = np.array([0.33, 51.6])

centre = [x_range.mean(), y_range.mean()]
sd_x = x_range.std() / 3
sd_y = y_range.std() / 3
rho = 0.9  # correlation

# 2x2 covariance matrix built from the two standard deviations and rho
off_diagonal = sd_x * sd_y * rho
covariance = [[sd_x**2, off_diagonal],
              [off_diagonal, sd_y**2]]

# 1000 draws, transposed so that each variable is one row of 1000 values
samples = np.random.multivariate_normal(centre, covariance, 1000).T

with h5py.File('correlated.h5', 'a') as h5:
    if 'data' not in h5:
        # first run: create a dataset that can grow along its first axis
        h5.create_dataset("data", data=samples, maxshape=(None, 1000), chunks=True)
    else:
        # later runs: append the two new rows to the existing dataset
        dataset = h5['data']
        dataset.resize(dataset.shape[0] + 2, axis=0)
        dataset[-2:] = samples

# sns.scatterplot(m[0], m[1])
# plt.show()
def genera(envF):
    """
    Read genera per environment

    NOTE(review): the original ``with open(...)`` block had no body, so the
    module could not even be parsed. The file layout read here is an
    assumption and must be confirmed against the real data: one record per
    line, tab-separated, with the environment in the first column and one or
    more genera in the remaining columns. Lines with an empty first column
    (including blank lines) are skipped.

    :param envF: file of environments and genera
    :return: a dict mapping each environment to the set of genera listed for it
    """

    env_genera = {}
    with open(envF, 'r') as f:
        for line in f:
            fields = line.rstrip('\r\n').split('\t')
            env = fields[0].strip()
            if not env:
                continue
            # the same environment may appear on several lines; merge them
            found = env_genera.setdefault(env, set())
            found.update(g.strip() for g in fields[1:] if g.strip())
    return env_genera
-------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:bc056f5111cf33f4aeb49f9a17e289bbf0d39fd378ef2356c43bac975710808d 3 | size 10814 4 | -------------------------------------------------------------------------------- /jupyter/Emma_subsystems/NorfolkWater_level1.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:b3e768aa89233898b08b131c9eda6ce955cc403130f0fdb91b3f7772832b5b1a 3 | size 20425 4 | -------------------------------------------------------------------------------- /jupyter/Emma_subsystems/NorfolkWater_level1_norm_all.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:e8b3d3133227c0714e97fb1f147d714ea97dc660563b583ad349069ef4e44146 3 | size 20434 4 | -------------------------------------------------------------------------------- /jupyter/Emma_subsystems/NorfolkWater_level1_norm_ss.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:932f8f94d2401c32380cc722c5141f6b647c5843ed95d57cdad87ef808810247 3 | size 20586 4 | -------------------------------------------------------------------------------- /jupyter/Emma_subsystems/PortJackson_level1.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:92b8639efee2e5f7c8584b5d8420dfcac6708bf429607bb68acf123f4a4eaf83 3 | size 11498 4 | -------------------------------------------------------------------------------- /jupyter/Emma_subsystems/PortJackson_level1_norm_all.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid 
sha256:1ebd7e6f6b34282c612807173699148329ca5749a2c637e1bb0cdefe9a3a0b26 3 | size 11507 4 | -------------------------------------------------------------------------------- /jupyter/Emma_subsystems/PortJackson_level1_norm_ss.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:cde6a00307bcf3e0c9d7734131fa430d93baf416a5c55206d982f20e25a60dd8 3 | size 11512 4 | -------------------------------------------------------------------------------- /jupyter/Emma_subsystems/TigerSharks_level1.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:3d9815154bab584826ca4a99e6a60fc251995228f5effc1cbaf511d542831758 3 | size 20971 4 | -------------------------------------------------------------------------------- /jupyter/Emma_subsystems/TigerSharks_level1_norm_all.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:5edb1d45808bca5f177e2fd2f3bb600a6085af670e3de5ce883df784bdd621b4 3 | size 20980 4 | -------------------------------------------------------------------------------- /jupyter/Emma_subsystems/TigerSharks_level1_norm_ss.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:957c447aa3cecf547cc6010f477c01a9eb5ef6b5168f97bb8402e88512ab5633 3 | size 21068 4 | -------------------------------------------------------------------------------- /jupyter/Emma_subsystems/eagle_ray_types/level1_norm_all.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:15cf6cf7c75c7eafdebedcced60e162c7ccbd4d39f9a107aee7fb27f3b840e31 3 | size 10724 4 | 
-------------------------------------------------------------------------------- /jupyter/Emma_subsystems/eagle_ray_types/level1_norm_ss.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:cce4747eb366ab9b852d231b1f696a8e2a0b8c3545a5929f2e515fd6a6b8dd4b 3 | size 10770 4 | -------------------------------------------------------------------------------- /jupyter/Emma_subsystems/eagle_ray_types/level1_raw.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:0fa890683819d6b9e91f72e2ba1e05cf4a9bb7b1a0f5011bec5aafb453251600 3 | size 5413 4 | -------------------------------------------------------------------------------- /jupyter/Emma_subsystems/eagle_ray_types/level2_norm_all.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:b4a3f5d5fa237a3fec88e848110316ca78d81985d26377da71922a208127e8fa 3 | size 43265 4 | -------------------------------------------------------------------------------- /jupyter/Emma_subsystems/eagle_ray_types/level2_norm_ss.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:045384a1fd95a31501ee32e651c2f46431b57cd390e2f718f8d1e98140a96c6a 3 | size 43436 4 | -------------------------------------------------------------------------------- /jupyter/Emma_subsystems/eagle_ray_types/level2_raw.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:ab8ffe777679dbee1512b3c5e3275d89d05cae5feb9136bf7ffd384d75e4e11f 3 | size 16711 4 | -------------------------------------------------------------------------------- /jupyter/Emma_subsystems/eagle_ray_types/ss_typed_norm_all.tsv.gz: 
# Print the column names from the header (first) line of a tab-separated file,
# one per line, each followed by a running number. Columns whose name matches
# /chip/ are listed first, then every other non-empty column name; each group
# is sorted alphabetically and the numbering continues across both groups.

my $file = shift || die "file to process";
open(my $in, '<', $file) || die "$! : $file";
# only the header line is needed
my $header = <$in>;
close($in);
die "no header line in $file" unless defined $header;

# strip the line ending (including a DOS carriage return) before splitting
$header =~ s/[\r\n]+\z//;
my @columns = split /\t/, $header;
# drop the first column (the original stored it as $idx and never used it)
shift @columns;

my @first  = sort {$a cmp $b} grep { m/chip/ } @columns;
my @second = sort {$a cmp $b} grep { !m/chip/ && $_ } @columns;

my $c=0;
foreach my $l (@first, @second) {
	$c++;
	print "$l\t$c\n";
}
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/mmseqs_taxonomy/animation/img_11.png -------------------------------------------------------------------------------- /jupyter/sarah_data/mmseqs_taxonomy/animation/img_12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/mmseqs_taxonomy/animation/img_12.png -------------------------------------------------------------------------------- /jupyter/sarah_data/mmseqs_taxonomy/animation/img_13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/mmseqs_taxonomy/animation/img_13.png -------------------------------------------------------------------------------- /jupyter/sarah_data/mmseqs_taxonomy/animation/img_14.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/mmseqs_taxonomy/animation/img_14.png -------------------------------------------------------------------------------- /jupyter/sarah_data/mmseqs_taxonomy/animation/img_15.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/mmseqs_taxonomy/animation/img_15.png -------------------------------------------------------------------------------- /jupyter/sarah_data/mmseqs_taxonomy/animation/img_16.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/mmseqs_taxonomy/animation/img_16.png -------------------------------------------------------------------------------- /jupyter/sarah_data/mmseqs_taxonomy/animation/img_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/mmseqs_taxonomy/animation/img_2.png -------------------------------------------------------------------------------- /jupyter/sarah_data/mmseqs_taxonomy/animation/img_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/mmseqs_taxonomy/animation/img_3.png -------------------------------------------------------------------------------- /jupyter/sarah_data/mmseqs_taxonomy/animation/img_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/mmseqs_taxonomy/animation/img_4.png -------------------------------------------------------------------------------- /jupyter/sarah_data/mmseqs_taxonomy/animation/img_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/mmseqs_taxonomy/animation/img_5.png -------------------------------------------------------------------------------- /jupyter/sarah_data/mmseqs_taxonomy/animation/img_6.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/mmseqs_taxonomy/animation/img_6.png -------------------------------------------------------------------------------- /jupyter/sarah_data/mmseqs_taxonomy/animation/img_7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/mmseqs_taxonomy/animation/img_7.png -------------------------------------------------------------------------------- /jupyter/sarah_data/mmseqs_taxonomy/animation/img_8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/mmseqs_taxonomy/animation/img_8.png -------------------------------------------------------------------------------- /jupyter/sarah_data/mmseqs_taxonomy/animation/img_9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/mmseqs_taxonomy/animation/img_9.png -------------------------------------------------------------------------------- /jupyter/sarah_data/mmseqs_taxonomy/animation/taxonomy.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/mmseqs_taxonomy/animation/taxonomy.gif -------------------------------------------------------------------------------- /jupyter/sarah_data/mmseqs_taxonomy/class.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:5aef8a6a4af7ff80dcc84915274bcb7a6f71e9005c659d79966f278127479774 3 | size 15934 4 | 
-------------------------------------------------------------------------------- /jupyter/sarah_data/mmseqs_taxonomy/class_renamed.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:8058cabad4699fadd0747fd66fd0eb77087c4afc030a7613e20c8efc45e6b5de 3 | size 15902 4 | -------------------------------------------------------------------------------- /jupyter/sarah_data/mmseqs_taxonomy/family.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:b50225e7f78de7055fa908373a631b75e9f5ff465ca93f47ce7e60489718e92e 3 | size 62056 4 | -------------------------------------------------------------------------------- /jupyter/sarah_data/mmseqs_taxonomy/family_renamed.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:fabe353af584179e6a7a2973f796991257a75287940da39c62b5957e2af4d368 3 | size 62017 4 | -------------------------------------------------------------------------------- /jupyter/sarah_data/mmseqs_taxonomy/genus.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:e57dcaf0a670ccbeba5ac4c8fcf0971ea4e897c34f5369f5b21ed2615543e7db 3 | size 171315 4 | -------------------------------------------------------------------------------- /jupyter/sarah_data/mmseqs_taxonomy/genus_renamed.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:825923ecbd31fef4514e0b030e1515b757b0db975d2f62ff53cb6fa568a6ca1c 3 | size 171283 4 | -------------------------------------------------------------------------------- /jupyter/sarah_data/mmseqs_taxonomy/order.tsv.gz: 
-------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:2a91d16a4a12a245d005221c8004da7f2f09cf0f6b37d9396d5a7b72db6c864f 3 | size 32319 4 | -------------------------------------------------------------------------------- /jupyter/sarah_data/mmseqs_taxonomy/order_renamed.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:4a290a9420f5f9085790cdf23f78847ebd2a7182bdb9d176386670f1d7262500 3 | size 32283 4 | -------------------------------------------------------------------------------- /jupyter/sarah_data/mmseqs_taxonomy/pca_by_approach.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/mmseqs_taxonomy/pca_by_approach.png -------------------------------------------------------------------------------- /jupyter/sarah_data/mmseqs_taxonomy/pca_by_filter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/mmseqs_taxonomy/pca_by_filter.png -------------------------------------------------------------------------------- /jupyter/sarah_data/mmseqs_taxonomy/pca_by_method.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/mmseqs_taxonomy/pca_by_method.png -------------------------------------------------------------------------------- /jupyter/sarah_data/mmseqs_taxonomy/pca_by_replicate.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/mmseqs_taxonomy/pca_by_replicate.png -------------------------------------------------------------------------------- /jupyter/sarah_data/mmseqs_taxonomy/pca_by_sample.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/mmseqs_taxonomy/pca_by_sample.png -------------------------------------------------------------------------------- /jupyter/sarah_data/mmseqs_taxonomy/pca_combined_replicates.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/mmseqs_taxonomy/pca_combined_replicates.png -------------------------------------------------------------------------------- /jupyter/sarah_data/mmseqs_taxonomy/phylum.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:0096fb49275e35c2ca5fb12de5ec5411980d16f2eb7c9aa01f768f510c34d3e6 3 | size 13142 4 | -------------------------------------------------------------------------------- /jupyter/sarah_data/mmseqs_taxonomy/phylum_renamed.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:29aaaf1f49cebb44a6540c46cd23f8a3c1574a3f46ad7514c1ee8c3cfd6d5ef0 3 | size 13118 4 | -------------------------------------------------------------------------------- /jupyter/sarah_data/mmseqs_taxonomy/species.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:ec0a283c3e57c47b6c3de2635f87b52ab99604bfeee0ac5ea60c34b05a46e43b 3 | size 920431 4 | 
-------------------------------------------------------------------------------- /jupyter/sarah_data/mmseqs_taxonomy/species_renamed.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:8004e4bbd9f8af70ba304b330156f60a2f9ab122e7aad405d4c91087954db821 3 | size 920367 4 | -------------------------------------------------------------------------------- /jupyter/sarah_data/mmseqs_taxonomy/superkingdom.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:07d848069a728613a45d9d3be4de3460b7c2f31a079606776421d179cbdd3e47 3 | size 857 4 | -------------------------------------------------------------------------------- /jupyter/sarah_data/mmseqs_taxonomy/superkingdom_renamed.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:eef8e2ca8e33e73ffec79d9a83468dda871f9bc77fba00d584cd5c8e7df6d629 3 | size 838 4 | -------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/README.md: -------------------------------------------------------------------------------- 1 | # NORMALIZATIONS 2 | 3 | Currently we perform three normalizations: 4 | 5 | 1. \*\_raw.tsv 6 | 7 | This is the non-normalised data, so just the raw counts. For each sequence, if it appears in one subsystem we increment that count by 1, but if it occurs in more than one subsystem, we increment that count by 1/n (1/2 for 2 subsystems, 1/3 for 3 subsystems, etc). 8 | 9 | 2. \*\_norm_all.tsv 10 | 11 | This data is normalised for _all_ reads, regardless of whether they are in a subsystem or not. This makes smaller numbers. 12 | 13 | 3. 
\*\_norm_ss.tsv 14 | 15 | This data is normalised only to the number of reads that match to subsystems, so if there is a lot of other stuff we ignore it. 16 | 17 | 18 | -------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/all_norm_all.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:df1101333f95f325366b8807d1caaae0a0cc4a7cd71812b7296d10f8dc44cf7e 3 | size 230533 4 | -------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/all_norm_all_renamed.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:0f9d76e22baba6dfbc17b46a0c5bc9017d385f8bfb3ef3bd0f542e7f703f1e30 3 | size 230577 4 | -------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/all_norm_ss.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:d4e1595695e2a67ab48d69e4c69aa3ccf6b27a07a6b2c0374677bb9830c90fa4 3 | size 231419 4 | -------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/all_norm_ss_renamed.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:27df5af57de582b8acd880d39ae3363b5394dcded57d7271724cf8bf35ca2f00 3 | size 231463 4 | -------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/all_raw.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:e7d1ebf57b9e220840468ded0b1fbf85614371022269a0cfdbe7c7053a50389c 3 | size 83793 4 | 
-------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/all_raw_renamed.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:8c33f5f97ca06e83a40b7f21bdd915dcad602066168ff6c76c4b50c9b7ba8576 3 | size 83610 4 | -------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/animation/img_10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/subsystems/animation/img_10.png -------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/animation/img_11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/subsystems/animation/img_11.png -------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/animation/img_12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/subsystems/animation/img_12.png -------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/animation/img_13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/subsystems/animation/img_13.png -------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/animation/img_14.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/subsystems/animation/img_14.png -------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/animation/img_15.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/subsystems/animation/img_15.png -------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/animation/img_16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/subsystems/animation/img_16.png -------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/animation/img_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/subsystems/animation/img_2.png -------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/animation/img_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/subsystems/animation/img_3.png -------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/animation/img_4.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/subsystems/animation/img_4.png -------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/animation/img_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/subsystems/animation/img_5.png -------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/animation/img_6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/subsystems/animation/img_6.png -------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/animation/img_7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/subsystems/animation/img_7.png -------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/animation/img_8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/subsystems/animation/img_8.png -------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/animation/img_9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/subsystems/animation/img_9.png 
-------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/animation/subsystems.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/subsystems/animation/subsystems.gif -------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/class_norm_all.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:fb6e04772566d1276a3680e25f00ef74ecef9b11b4be9823b55ef3db175f77c7 3 | size 4546 4 | -------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/class_norm_all_renamed.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:ea4e2ac3cc4ba80f4ab2f961c9412249af8e913a3c5dc32446ec161966f0d0e3 3 | size 4578 4 | -------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/class_norm_ss.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:8ae2abf953ef5d2044cdabc4674c1f8f688baec58038a091a490e23bf605003d 3 | size 4548 4 | -------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/class_norm_ss_renamed.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:eb63cc354a1a1c3c3961971d4d75a1cbaa126a18f5fc80cb064cb2885a625d58 3 | size 4578 4 | -------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/class_raw.tsv.gz: 
-------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:3f4125bc09e5b15cdcb1b3f6cf9471f485a583b906b7393e66d68f881895f25a 3 | size 3153 4 | -------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/class_raw_renamed.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:3bf2f6cb66c3ef3689b83955077fb4967f8eee2e0a931fb67cb0e9d5cc1c5acf 3 | size 3171 4 | -------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/level1_norm_all.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:498aab50552e37c89cd29109718c4b6d9905c23e8bcc285fee4e2f2122bc1e05 3 | size 11863 4 | -------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/level1_norm_all_renamed.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:3a1f53cba5dd1aee5cec3fcc2a87b414a8fc6dee2dc959e2f2c027294cddb61d 3 | size 11897 4 | -------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/level1_norm_ss.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:26d50896610e8ade7ad7168e55e5685716a5b8d341952cb86ed64f4320b73dc6 3 | size 11844 4 | -------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/level1_norm_ss_renamed.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid 
sha256:09e3d6f9086406c040574a01176caa1af841f6dbe2421142453048ce8ed98cb8 3 | size 11878 4 | -------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/level1_raw.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:abe36aba85f88a9587ec3dc1ae9e1cce323c23fded8d04f70b8f5f973617164b 3 | size 6837 4 | -------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/level1_raw_renamed.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:18f7651b1b8dd08a956262605d85cfaaa029593ca1b6d86a5a91b27ab5772e98 3 | size 6856 4 | -------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/level2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/subsystems/level2.png -------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/level2_norm_all.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:ee134eecf83b7eec83ca1c5e87cbe7ebee1ad853716b63e9724613331f80249a 3 | size 49983 4 | -------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/level2_norm_all_renamed.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:c50aff3a6902a98a1b9bc5e66e28e57a591ad2d14c1c19f4ba5b39a874c04f96 3 | size 50030 4 | -------------------------------------------------------------------------------- 
/jupyter/sarah_data/subsystems/level2_norm_ss.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:2601d0e377ebf2119a0a43e21147b58f75d86e3dd83b34da55ce5f32022304e0 3 | size 50114 4 | -------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/level2_norm_ss_renamed.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:d2e6ab18718b9dd843f26420187ca772c505ded6636a6c2173555e16bc5166da 3 | size 50153 4 | -------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/level2_raw.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:91df8242cb4bd16ad574b4f20b4cd3e00a0c7b675d3745370e64d287f76c0451 3 | size 21481 4 | -------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/level2_raw_renamed.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:39632352c97256c26af5de894a68d7f4d6bebabd66c075479c6613e9dd49759f 3 | size 21500 4 | -------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/normalised_subsystem_level2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/subsystems/normalised_subsystem_level2.png -------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/pca_by_approach.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/subsystems/pca_by_approach.png -------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/pca_by_filter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/subsystems/pca_by_filter.png -------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/pca_by_method.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/subsystems/pca_by_method.png -------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/pca_by_replicate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/subsystems/pca_by_replicate.png -------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/pca_by_sample.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/subsystems/pca_by_sample.png -------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/pca_combined_replicates.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/jupyter/sarah_data/subsystems/pca_combined_replicates.png 
-------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/subsystems_norm_all.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:5157a6de41c6c9ad430076ab1772079bb943a17c579ffa9940231c69afefc31c 3 | size 225791 4 | -------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/subsystems_norm_all_renamed.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:88facbf9b39609a93c7f83e448b57e2c07560f8486269fa55bfbee0b4f448d89 3 | size 225818 4 | -------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/subsystems_norm_ss.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:b76508435fa599cd36b9c7f86c7b2b89af92160084d9e649bb349b81bb8a478a 3 | size 226313 4 | -------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/subsystems_norm_ss_renamed.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:b90fdf723679ebd12ba035d121a6906dbec200b81d3bd03c8bac8cbe11b987f4 3 | size 226359 4 | -------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/subsystems_raw.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:5794567819778d270611d5b2b2a683dd457c77041296044ff1631c468b470624 3 | size 79893 4 | -------------------------------------------------------------------------------- /jupyter/sarah_data/subsystems/subsystems_raw_renamed.tsv.gz: 
-------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:0210849eaf89ba559b6f10738be5787c96bfcf6dc9b02cffc2d71d79ed79481b 3 | size 79688 4 | -------------------------------------------------------------------------------- /jupyter/subsystems_data.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:db34618a583198795f2624f590d004843af584530c9c22d82e977f0615e50fae 3 | size 656607 4 | -------------------------------------------------------------------------------- /jupyter/subsystems_data_all.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:0ead7593972d777acce4b91d5cfad642f6edeedfdd6f25d7d941b9194e14ef34 3 | size 656559 4 | -------------------------------------------------------------------------------- /jupyter/taxonomy_data.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:8df2e6cf456fdda362f72c5518c6e2a657fc59a03e7a12937507a0832c7bd3ee 3 | size 3293678 4 | -------------------------------------------------------------------------------- /jupyter/taxonomy_data_all.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:af7b5210010a96b006021d5222ba380ba38e18f0a73f7619a8c560b210fd2a24 3 | size 3292856 4 | -------------------------------------------------------------------------------- /jupyter/test2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import os" 12 | ] 13 | } 14 | ], 15 | 
"metadata": { 16 | "kernelspec": { 17 | "display_name": "Python 3", 18 | "language": "python", 19 | "name": "python3" 20 | }, 21 | "language_info": { 22 | "codemirror_mode": { 23 | "name": "ipython", 24 | "version": 2 25 | }, 26 | "file_extension": ".py", 27 | "mimetype": "text/x-python", 28 | "name": "python", 29 | "nbconvert_exporter": "python", 30 | "pygments_lexer": "ipython2", 31 | "version": "2.7.6" 32 | } 33 | }, 34 | "nbformat": 4, 35 | "nbformat_minor": 0 36 | } -------------------------------------------------------------------------------- /kbase/parse_json.py: -------------------------------------------------------------------------------- 1 | """ 2 | Parse the JSON file and print some stuff out 3 | """ 4 | 5 | import os 6 | import sys 7 | 8 | import argparse 9 | import json 10 | 11 | import re 12 | 13 | if __name__ == '__main__': 14 | parser = argparse.ArgumentParser(description="Parse the JSON file downloaded from KBase") 15 | parser.add_argument('-f', help='JSON file', required=True) 16 | args = parser.parse_args() 17 | 18 | data = json.load(open(args.f)) 19 | 20 | # print all the keys 21 | print("\n".join(data.keys())) 22 | 23 | # print keys associated with features 24 | feats = data['features'] 25 | 26 | print("There are " + str(len(data['contig_lengths'])) + " contigs") 27 | 28 | sys.exit(0) 29 | 30 | for f in feats: 31 | #print("\t".join([f['id'], f['function']])) 32 | for p in f['function'].split(' ; '): 33 | m = re.match('\s*[\d\-\.]+$', p) 34 | if m and m.end() == len(p): 35 | print("\t".join([f['id'], 'EC ' + p])) 36 | # else: 37 | # print("\t".join([f['id'], p])) 38 | 39 | 40 | -------------------------------------------------------------------------------- /kmers/count_kmers.py: -------------------------------------------------------------------------------- 1 | """ 2 | Count the 11mers in a sequence 3 | """ 4 | 5 | import os 6 | import sys 7 | import argparse 8 | from itertools import product 9 | from roblib import read_fasta 10 | from 
"""
Count the kmers in a genbank file
"""


import os
import sys
import argparse
from itertools import product
from roblib import median
from Bio import SeqIO

parser = argparse.ArgumentParser(description='Count the kmers in a genbank file')
parser.add_argument('-g', help='genbank file', required=True)
parser.add_argument('-s', help='K-mer size, (default=11)', type=int, default=11)
args = parser.parse_args()

# One summary line is printed per record in the GenBank file.
for record in SeqIO.parse(args.g, 'genbank'):
    # Upper-case the sequence once per record rather than once per k-mer.
    seq = record.seq.upper()

    # Enumerate every possible k-mer over ATGC lazily (4**k of them) and
    # count its occurrences in this record; k-mers that never occur
    # contribute a 0, so they are included in the average and median.
    count = [seq.count("".join(k)) for k in product("ATGC", repeat=args.s)]

    # Report record.id -- the bare name `id` is the Python builtin function,
    # which is what the previous version printed by mistake.
    print("id: {} len(seq): {} sum: {} n: {} average: {} median: {} max: {}".format(
        record.id, len(record.seq), sum(count), len(count), (1.0 * sum(count) / len(count)), median(count), max(count)
    ))
def generate_random_seq(length):
    """
    Generate a random DNA sequence.

    :param length: the number of bases to generate
    :return: str of `length` characters, each one of A, G, C or T
    """

    bases = {1: "A", 2: "G", 3: "C", 4: "T"}
    # Build the string with a single join instead of repeated `+=`, which can
    # degrade to quadratic time on long sequences. randint() is still called
    # exactly once per base, in order, so a seeded run yields the same sequence.
    return "".join(bases[randint(1, 4)] for _ in range(length))
2 | 3 | figx = pickle.load(open('3dfig.pickle', 'rb')) 4 | figx.show() -------------------------------------------------------------------------------- /mmseqs/dummy_database.sqlite: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:c0238af1292f9b9971da51b632ec5d7f2a75ff20fe039552385e76396010c96a 3 | size 24576 4 | -------------------------------------------------------------------------------- /mongodb/find_biomasses.py: -------------------------------------------------------------------------------- 1 | """ 2 | Find biomasses greater than a value 3 | """ 4 | 5 | import os 6 | import sys 7 | 8 | from pymongo import MongoClient 9 | 10 | databasename = 'fba_models' 11 | collectionname = 'citrobacter' 12 | 13 | value = 40 14 | 15 | 16 | client = MongoClient() 17 | 18 | coll = client[databasename][collectionname] 19 | 20 | for cursor in coll.find({"content.biomasses.biomasscompounds.coefficient": { '$gt': 35 }}): 21 | print(cursor['file_name']) -------------------------------------------------------------------------------- /mongodb/print_keys.py: -------------------------------------------------------------------------------- 1 | """ 2 | Print the keys from a file 3 | """ 4 | 5 | import os 6 | import sys 7 | 8 | import argparse 9 | import json 10 | 11 | if __name__ == '__main__': 12 | parser = argparse.ArgumentParser(description="Print all the keys in a file") 13 | parser.add_argument('-f', help='Files to print keys from', required=True) 14 | args = parser.parse_args() 15 | 16 | data = json.load(open(args.f, 'r')) 17 | for k in data: 18 | print(k) -------------------------------------------------------------------------------- /mongodb/search_mongo.py: -------------------------------------------------------------------------------- 1 | """ 2 | Search a mongo database loaded with load_models.py 3 | 'file_name' : 
"""
Create a mongo database if it doesn't exist and load a bunch of data into it.

We need a directory with one or more JSON files in it. We look for JSON on the end of the filename.

This simple version takes no command line options: the directory, database
name and collection name are fixed below. The equivalent call to the full
loader would be:

e.g. python load_models.py -d /data/Genotype-Phenotype-Modeling/models/Citrobacter/Citrobacter/models/ -n fba_models -c citrobacter
"""


import sys
import os

import json
from pymongo import MongoClient

# Directory that holds the JSON model files. The previous version listed this
# path but then joined file names onto an undefined `args.d`, which raised a
# NameError on the first JSON file.
model_dir = '/data/Genotype-Phenotype-Modeling/models/Citrobacter/Citrobacter/models/'

client = MongoClient()

db = client['fba_models']
coll = db['citrobacter']


for f in os.listdir(model_dir):
    # only files whose name ends in .json (case-insensitive) are loaded
    if f.lower().endswith('.json'):
        sys.stderr.write("Loading file " + f + "\n")
        path = os.path.join(model_dir, f)
        # close each file as soon as it has been parsed
        with open(path) as json_in:
            text = json.load(json_in)
        # store the parsed model together with the path it was read from
        obj = {'file_name': path, 'content': text}
        # insert_one() is the replacement for Collection.insert(), which
        # PyMongo deprecated in 3.0 and removed in 4.0.
        coll.insert_one(obj)
"""
Get a set of protein sequences from NCBI
"""

import os
import sys

import argparse

from Bio import Entrez, SeqIO
from time import sleep
from random import randint

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Get a set of protein sequences from NCBI")
    parser.add_argument('-f', help='File of IDs to get', required=True)
    parser.add_argument('-o', help='Output file', required=True)
    args = parser.parse_args()

    # retrieve a GI number from GenBank
    Entrez.email = 'raedwards@gmail.com'  # set this so NCBI knows who to complain to

    # Context managers close both files even if a fetch fails part way
    # through, so whatever was already written is flushed to disk.
    with open(args.o, 'w') as out, open(args.f, 'r') as f:
        for l in f:
            # the ID is the first tab-separated column; other columns are ignored
            p = l.strip().split("\t")
            if not p[0]:
                # blank line: nothing to request
                continue
            handle = Entrez.efetch(db="protein", id=p[0], rettype="gbwithparts", retmode="text")
            try:
                out.write(handle.read())
            finally:
                # release the network handle whether or not the read succeeded
                handle.close()
            # pause a random 0-5 seconds between requests
            sleep(randint(0, 5))
def printtaxa(i):
    """
    Print one tab-separated line for taxon i: the id, its name, and one
    column per rank listed in want ("-" where no ancestor has that rank).

    The name column is names[i].name unless i is a key of blastname, in
    which case blastname[i].name is used instead.

    Ranks are collected by following .parent links from i. The walk stops
    as soon as the current node is '1' or its parent is '1', so a node whose
    parent is '1' is never recorded.

    NOTE(review): names, blastname and taxa are looked up as module globals
    but are not defined in the part of the file visible here -- confirm
    where they are supposed to come from.

    :param i: the taxonomy id to print
    :return: None
    """
    label = names[i].name
    if i in blastname:
        label = blastname[i].name

    ranked = {}
    current = i
    while True:
        entry = taxa[current]
        if entry.parent == '1' or current == '1':
            break
        if entry.rank in want:
            ranked[entry.rank] = names[current].name
        current = entry.parent

    columns = [ranked.get(rank, "-") for rank in want]
    print("{}\t{}".format(i, label) + "".join("\t{}".format(c) for c in columns))
"""
Count, for each integer threshold, how many pairwise identities fall below it.
"""

import sys
from bisect import bisect_left


def read_identities(path):
    """
    Read the third tab-separated column of every non-blank line as a float.

    :param path: tab-separated file with the value in the third column
    :return: the values as a list sorted in ascending order
    """
    values = []
    with open(path, 'r') as f:
        for l in f:
            p = l.strip().split("\t")
            if p == ['']:
                # blank line
                continue
            values.append(float(p[2]))
    # one sort instead of a linear scan and insert for every value
    values.sort()
    return values


def main(path="RecA_uniprot_pairwise_ids.tsv", low=50, high=70):
    """
    Print the sorted values and then, for every integer i in [low, high),
    a line "i<TAB>n" where n is the number of values smaller than i.

    The counts come from a binary search of the sorted list, so any value is
    accepted (the fixed 100-slot lookup table used previously raised an
    IndexError for values of 99 and above) and values below `low` are
    included in the running count.

    :param path: file to read (default: the file name that used to be hard coded)
    :param low: first threshold to report
    :param high: one past the last threshold to report
    :return: None
    """
    sl = read_identities(path)

    # trace kept from the earlier version: the threshold above each value
    for val in sl:
        sys.stderr.write("Adding {} for {}\n".format(int(val) + 1, val))

    print(sl)

    for i in range(low, high):
        # bisect_left gives the number of entries strictly below i
        print("{}\t{}".format(i, bisect_left(sl, i)))


if __name__ == '__main__':
    main()
See LICENSE file. 14 | -------------------------------------------------------------------------------- /percent_pairwise_identity/needleman_wunsch-0.3.5/seq1.fna: -------------------------------------------------------------------------------- 1 | >one 2 | actg 3 | -------------------------------------------------------------------------------- /percent_pairwise_identity/needleman_wunsch-0.3.5/seq2.fna: -------------------------------------------------------------------------------- 1 | >two 2 | act 3 | -------------------------------------------------------------------------------- /percent_pairwise_identity/strain_taxonomy.txt.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:f646c9ee6d0ecd307c6c23a3130ce156f2d220c983a8a81733d797d7479e8bd1 3 | size 131090 4 | -------------------------------------------------------------------------------- /perl/SGE-0.02/Changes: -------------------------------------------------------------------------------- 1 | v 0.01 2 | 3 | Initial release into the wild 4 | 5 | 6 | v 0.02 7 | 8 | October 2005. Received these excellent comments from Hinri Kerstens: 9 | 10 | Thank you for writing the ScheduleSGE perl module. When using this 11 | module I found two 'mismatches' between your system and mine. 12 | - My SGE version (5.3) returns "your job" instead of "Your job" after 13 | submission of a job. Maybe you can make line 233 of Run.pm tolerant for 14 | that. 15 | - The CPAN documentation claims that a jobID can be grabbed by "my 16 | $pid=$sge->job_id;", but job_id doesn't exist in the modules. It should 17 | be "my $pid=$sge->execute;" isn't it? 18 | 19 | After these modifications the module runs happily, so keep on the good 20 | work. 
21 | 22 | regards 23 | 24 | Hinri 25 | 26 | These two bugs have been fixed, and the method job_id added 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /perl/SGE-0.02/Control/CVS/Entries: -------------------------------------------------------------------------------- 1 | /Control.pm/1.1.1.1/Fri Sep 14 22:49:31 2007// 2 | /MANIFEST/1.1.1.1/Fri Sep 14 22:49:31 2007// 3 | /Makefile.PL/1.1.1.1/Fri Sep 14 22:49:31 2007// 4 | D 5 | -------------------------------------------------------------------------------- /perl/SGE-0.02/Control/CVS/Repository: -------------------------------------------------------------------------------- 1 | bioinformatics/Modules/SGE-0.02/Control 2 | -------------------------------------------------------------------------------- /perl/SGE-0.02/Control/CVS/Root: -------------------------------------------------------------------------------- 1 | :ext:linsalrob@edwards-sdsu.cvs.sourceforge.net:/cvsroot/edwards-sdsu 2 | -------------------------------------------------------------------------------- /perl/SGE-0.02/Control/Control.pm: -------------------------------------------------------------------------------- 1 | # Schedule::SGE::Control 2 | 3 | # POD docs 4 | 5 | =head1 Schedule::SGE::Control 6 | 7 | Control jobs on the SGE queues. You should not use this method directly, rather you should use the SGE method that inherits from this, then all the methods herein are available to you. 
# qdel: run "qdel -u <user>" and print whatever the command writes to STDOUT.
#
# Arguments: the object, and an optional user name. When no (or a false)
# user name is given, the output of `whoami` is used.
#
# NOTE(review): the POD for this method says it deletes *failed* jobs, but
# the only option passed to qdel is -u <user> -- confirm which jobs that
# actually removes.
# NOTE(review): $user is interpolated into a shell command line, so it is
# presumably expected to come from a trusted caller -- verify.
sub qdel {
 my ($self, $user)=@_;
 # fall back to the account running the script
 unless ($user) {$user =`whoami`; chomp($user)}
 print `qdel -u $user`;
}
2 | -------------------------------------------------------------------------------- /perl/SGE-0.02/MANIFEST: -------------------------------------------------------------------------------- 1 | t/SGE.t 2 | Control/Control.pm 3 | Copying 4 | Run/Run.pm 5 | Status/Status.pm 6 | Changes 7 | README 8 | examples/README 9 | examples/submit2cluster.pl 10 | examples/test.pl 11 | -------------------------------------------------------------------------------- /perl/SGE-0.02/Makefile.PL: -------------------------------------------------------------------------------- 1 | use 5.008001; 2 | use ExtUtils::MakeMaker; 3 | # See lib/ExtUtils/MakeMaker.pm for details of how to influence 4 | # the contents of the Makefile that is written. 5 | WriteMakefile( 6 | NAME => 'Schedule::SGE', 7 | VERSION_FROM => 'SGE.pm', # finds $VERSION 8 | PREREQ_PM => {}, # e.g., Module::Name => 1.1 9 | ($] >= 5.005 ? ## Add these new keywords supported since 5.005 10 | (ABSTRACT => 'Schedule::SGE is a suite of modules for interacting with the Sun Grid Engine. The base module Schedule::SGE handles locating the executables and making sure everything works fine. 
The three modules Schedule::SGE::Run, Schedule::SGE::Control, and Schedule::SGE::Status are for different interactions with the queues', 11 | AUTHOR => 'rob ') : ()), 12 | ); 13 | -------------------------------------------------------------------------------- /perl/SGE-0.02/Run/CVS/Entries: -------------------------------------------------------------------------------- 1 | /MANIFEST/1.1.1.1/Fri Sep 14 22:49:31 2007// 2 | /Makefile.PL/1.1.1.1/Fri Sep 14 22:49:31 2007// 3 | /Run.pm/1.1.1.1/Fri Sep 14 22:49:31 2007// 4 | D 5 | -------------------------------------------------------------------------------- /perl/SGE-0.02/Run/CVS/Repository: -------------------------------------------------------------------------------- 1 | bioinformatics/Modules/SGE-0.02/Run 2 | -------------------------------------------------------------------------------- /perl/SGE-0.02/Run/CVS/Root: -------------------------------------------------------------------------------- 1 | :ext:linsalrob@edwards-sdsu.cvs.sourceforge.net:/cvsroot/edwards-sdsu 2 | -------------------------------------------------------------------------------- /perl/SGE-0.02/Run/MANIFEST: -------------------------------------------------------------------------------- 1 | Makefile.PL 2 | MANIFEST 3 | Run.pm 4 | -------------------------------------------------------------------------------- /perl/SGE-0.02/Run/Makefile.PL: -------------------------------------------------------------------------------- 1 | use 5.008001; 2 | use ExtUtils::MakeMaker; 3 | # See lib/ExtUtils/MakeMaker.pm for details of how to influence 4 | # the contents of the Makefile that is written. 5 | WriteMakefile( 6 | NAME => 'Schedule::SGE::Run', 7 | VERSION_FROM => 'Run.pm', # finds $VERSION 8 | PREREQ_PM => {}, # e.g., Module::Name => 1.1 9 | ($] >= 5.005 ? 
## Add these new keywords supported since 5.005 10 | (AUTHOR => 'rob ') : ()), 11 | ); 12 | -------------------------------------------------------------------------------- /perl/SGE-0.02/Status/CVS/Entries: -------------------------------------------------------------------------------- 1 | /MANIFEST/1.1.1.1/Fri Sep 14 22:49:31 2007// 2 | /Makefile.PL/1.1.1.1/Fri Sep 14 22:49:31 2007// 3 | /Status.pm/1.1.1.1/Fri Sep 14 22:49:31 2007// 4 | D 5 | -------------------------------------------------------------------------------- /perl/SGE-0.02/Status/CVS/Repository: -------------------------------------------------------------------------------- 1 | bioinformatics/Modules/SGE-0.02/Status 2 | -------------------------------------------------------------------------------- /perl/SGE-0.02/Status/CVS/Root: -------------------------------------------------------------------------------- 1 | :ext:linsalrob@edwards-sdsu.cvs.sourceforge.net:/cvsroot/edwards-sdsu 2 | -------------------------------------------------------------------------------- /perl/SGE-0.02/Status/MANIFEST: -------------------------------------------------------------------------------- 1 | Makefile.PL 2 | MANIFEST 3 | Status.pm 4 | -------------------------------------------------------------------------------- /perl/SGE-0.02/Status/Makefile.PL: -------------------------------------------------------------------------------- 1 | use 5.008001; 2 | use ExtUtils::MakeMaker; 3 | # See lib/ExtUtils/MakeMaker.pm for details of how to influence 4 | # the contents of the Makefile that is written. 5 | WriteMakefile( 6 | NAME => 'Schedule::SGE::Status', 7 | VERSION_FROM => 'Status.pm', # finds $VERSION 8 | PREREQ_PM => {}, # e.g., Module::Name => 1.1 9 | ($] >= 5.005 ? 
## Add these new keywords supported since 5.005 10 | (AUTHOR => 'rob ') : ()), 11 | ); 12 | -------------------------------------------------------------------------------- /perl/SGE-0.02/examples/CVS/Entries: -------------------------------------------------------------------------------- 1 | /README/1.1.1.1/Fri Sep 14 22:49:31 2007// 2 | /submit2cluster.pl/1.1.1.1/Fri Sep 14 22:49:31 2007// 3 | /test.pl/1.1.1.1/Fri Sep 14 22:49:31 2007// 4 | D 5 | -------------------------------------------------------------------------------- /perl/SGE-0.02/examples/CVS/Repository: -------------------------------------------------------------------------------- 1 | bioinformatics/Modules/SGE-0.02/examples 2 | -------------------------------------------------------------------------------- /perl/SGE-0.02/examples/CVS/Root: -------------------------------------------------------------------------------- 1 | :ext:linsalrob@edwards-sdsu.cvs.sourceforge.net:/cvsroot/edwards-sdsu 2 | -------------------------------------------------------------------------------- /perl/SGE-0.02/examples/README: -------------------------------------------------------------------------------- 1 | The script test.pl should max out one node by calculating factorials of large numbers and summing them. 
2 | 3 | You should be able to submit this to the cluster using the script submit2cluster.pl like this: 4 | 5 | submit2cluster.pl test.pl 6 | 7 | 8 | -------------------------------------------------------------------------------- /perl/SGE-0.02/examples/test.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | 3 | use strict; 4 | my @results; 5 | my @fact = (1); 6 | for (my $c=1; $c<=10; $c++) { 7 | for (my $i=1; $i<=6000; $i++) { 8 | my $a=1; 9 | for (my $y=1; $y<=$i; $y++) { 10 | $a+=factorial($y); 11 | } 12 | $results[$i]=$a; 13 | } 14 | } 15 | 16 | sub factorial { 17 | my $n = shift; 18 | return $fact[$n] if defined $fact[$n]; 19 | $fact[$n] = $n * factorial($n - 1); 20 | } 21 | 22 | 23 | -------------------------------------------------------------------------------- /perl/SGE-0.02/t/CVS/Entries: -------------------------------------------------------------------------------- 1 | /SGE.t/1.1.1.1/Fri Sep 14 22:49:31 2007// 2 | D 3 | -------------------------------------------------------------------------------- /perl/SGE-0.02/t/CVS/Repository: -------------------------------------------------------------------------------- 1 | bioinformatics/Modules/SGE-0.02/t 2 | -------------------------------------------------------------------------------- /perl/SGE-0.02/t/CVS/Root: -------------------------------------------------------------------------------- 1 | :ext:linsalrob@edwards-sdsu.cvs.sourceforge.net:/cvsroot/edwards-sdsu 2 | -------------------------------------------------------------------------------- /perl/Teragrid-0.02/Changes: -------------------------------------------------------------------------------- 1 | v 0.01 2 | 3 | Initial release into the wild, using LWP for web services 4 | 5 | v 0.02 6 | 7 | Added better job control, and some examples. 
8 | 9 | -------------------------------------------------------------------------------- /perl/Teragrid-0.02/Control/CVS/Entries: -------------------------------------------------------------------------------- 1 | /Control.pm/1.1.1.1/Fri Sep 14 22:49:31 2007// 2 | /MANIFEST/1.1.1.1/Fri Sep 14 22:49:31 2007// 3 | /Makefile.PL/1.1.1.1/Fri Sep 14 22:49:31 2007// 4 | D 5 | -------------------------------------------------------------------------------- /perl/Teragrid-0.02/Control/CVS/Repository: -------------------------------------------------------------------------------- 1 | bioinformatics/Modules/Teragrid-0.02/Control 2 | -------------------------------------------------------------------------------- /perl/Teragrid-0.02/Control/CVS/Root: -------------------------------------------------------------------------------- 1 | :ext:linsalrob@edwards-sdsu.cvs.sourceforge.net:/cvsroot/edwards-sdsu 2 | -------------------------------------------------------------------------------- /perl/Teragrid-0.02/Control/MANIFEST: -------------------------------------------------------------------------------- 1 | Control.pm 2 | Makefile.PL 3 | MANIFEST 4 | -------------------------------------------------------------------------------- /perl/Teragrid-0.02/Control/Makefile.PL: -------------------------------------------------------------------------------- 1 | use 5.008001; 2 | use ExtUtils::MakeMaker; 3 | # See lib/ExtUtils/MakeMaker.pm for details of how to influence 4 | # the contents of the Makefile that is written. 5 | WriteMakefile( 6 | NAME => 'TeraGrid::LSGW::Control', 7 | VERSION_FROM => 'Control.pm', # finds $VERSION 8 | PREREQ_PM => {}, # e.g., Module::Name => 1.1 9 | ($] >= 5.005 ? 
## Add these new keywords supported since 5.005 10 | (AUTHOR => 'Rob Edwards ') : ()), 11 | ); 12 | -------------------------------------------------------------------------------- /perl/Teragrid-0.02/Copying: -------------------------------------------------------------------------------- 1 | Copyright (c) 2006 Rob Edwards. All rights reserved. This program is free software; you can redistribute it and/or modify it under the same terms as the SEED Toolkit license. 2 | -------------------------------------------------------------------------------- /perl/Teragrid-0.02/Jobs/CVS/Entries: -------------------------------------------------------------------------------- 1 | /Jobs.pm/1.1.1.1/Fri Sep 14 22:49:31 2007// 2 | /MANIFEST/1.1.1.1/Fri Sep 14 22:49:31 2007// 3 | /Makefile.PL/1.1.1.1/Fri Sep 14 22:49:31 2007// 4 | D 5 | -------------------------------------------------------------------------------- /perl/Teragrid-0.02/Jobs/CVS/Repository: -------------------------------------------------------------------------------- 1 | bioinformatics/Modules/Teragrid-0.02/Jobs 2 | -------------------------------------------------------------------------------- /perl/Teragrid-0.02/Jobs/CVS/Root: -------------------------------------------------------------------------------- 1 | :ext:linsalrob@edwards-sdsu.cvs.sourceforge.net:/cvsroot/edwards-sdsu 2 | -------------------------------------------------------------------------------- /perl/Teragrid-0.02/Jobs/MANIFEST: -------------------------------------------------------------------------------- 1 | Jobs.pm 2 | Makefile.PL 3 | MANIFEST 4 | -------------------------------------------------------------------------------- /perl/Teragrid-0.02/Jobs/Makefile.PL: -------------------------------------------------------------------------------- 1 | use 5.008001; 2 | use ExtUtils::MakeMaker; 3 | # See lib/ExtUtils/MakeMaker.pm for details of how to influence 4 | # the contents of the Makefile that is written. 
5 | WriteMakefile( 6 | NAME => 'TeraGrid::LSGW::Jobs', 7 | VERSION_FROM => 'Jobs.pm', # finds $VERSION 8 | PREREQ_PM => {}, # e.g., Module::Name => 1.1 9 | ($] >= 5.005 ? ## Add these new keywords supported since 5.005 10 | (AUTHOR => 'Rob Edwards ') : ()), 11 | ); 12 | -------------------------------------------------------------------------------- /perl/Teragrid-0.02/MANIFEST: -------------------------------------------------------------------------------- 1 | Control/Control.pm 2 | Jobs/Jobs.pm 3 | examples/jobs.pl 4 | examples/blast.pl 5 | Copying 6 | Changes 7 | README 8 | -------------------------------------------------------------------------------- /perl/Teragrid-0.02/Makefile.PL: -------------------------------------------------------------------------------- 1 | use 5.008001; 2 | use ExtUtils::MakeMaker; 3 | # See lib/ExtUtils/MakeMaker.pm for details of how to influence 4 | # the contents of the Makefile that is written. 5 | WriteMakefile( 6 | NAME => 'TeraGrid::LSGW', 7 | VERSION_FROM => 'LSGW.pm', # finds $VERSION 8 | PREREQ_PM => {}, # e.g., Module::Name => 1.1 9 | ($] >= 5.005 ? ## Add these new keywords supported since 5.005 10 | (ABSTRACT => 'TeraGrid::LSGW, Interact with the TeraGrid Life Sciences Gateway', 11 | AUTHOR => 'Rob Edwards ') : ()), 12 | ); 13 | -------------------------------------------------------------------------------- /perl/Teragrid-0.02/README: -------------------------------------------------------------------------------- 1 | README 2 | 3 | TeraGrid::LSGW 4 | 5 | The Life Sciences Gateway to the TeraGrid is being developed to allow biologists access to High Performance Computing. This series of modules is being developed by Rob Edwards and Ivan Judson to assist in submitting jobs to the LSGW. 6 | 7 | To use these modules you'll need an account on the LSGW machine with access to BLAST. You will have to contact Ivan for that. 
8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /perl/Teragrid-0.02/examples/CVS/Entries: -------------------------------------------------------------------------------- 1 | /all_jobs.pl/1.1.1.1/Fri Sep 14 22:49:31 2007// 2 | /blast.pl/1.1.1.1/Fri Sep 14 22:49:31 2007// 3 | /job.pl/1.1.1.1/Fri Sep 14 22:49:31 2007// 4 | /job_data.pl/1.1.1.1/Fri Sep 14 22:49:31 2007// 5 | D 6 | -------------------------------------------------------------------------------- /perl/Teragrid-0.02/examples/CVS/Repository: -------------------------------------------------------------------------------- 1 | bioinformatics/Modules/Teragrid-0.02/examples 2 | -------------------------------------------------------------------------------- /perl/Teragrid-0.02/examples/CVS/Root: -------------------------------------------------------------------------------- 1 | :ext:linsalrob@edwards-sdsu.cvs.sourceforge.net:/cvsroot/edwards-sdsu 2 | -------------------------------------------------------------------------------- /perl/Teragrid-0.02/examples/all_jobs.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | 3 | # just print out a list of all jobs 4 | 5 | use strict; 6 | use lib '/homes/redwards/perl/share/perl/5.8.4/'; 7 | use TeraGrid::LSGW; 8 | 9 | 10 | my $tg=new TeraGrid::LSGW(-verbose=>2); 11 | my $aj=$tg->jobs(); 12 | print STDERR "There are ", scalar(keys %$aj), " jobs\n"; 13 | my $jl=$tg->job_list; 14 | print join("\n", @$jl), "\n"; 15 | -------------------------------------------------------------------------------- /perl/Teragrid-0.02/examples/job.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | 3 | # Test the jobs interface 4 | 5 | use strict; 6 | use lib '/homes/redwards/perl/share/perl/5.8.4/'; 7 | use TeraGrid::LSGW; 8 | 9 | 10 | my $job=shift || die "$0 \n"; 11 | 12 | my $tg=new 
__author__ = 'redwards'

from .edit_distance import edit_distance

from .gapped_alignment import gap_alignment
from .dna_alignment import dna_gapped_alignment, dna_score_alignment
from .gapped_alignment2 import gapped_alignment, score_alignment
# Explicit relative import, like the lines above: an implicit relative
# import is not resolved inside a package on Python 3.
# NOTE(review): this assumes local_alignment.py sits in this package next
# to the other modules -- confirm.
from .local_alignment import local_alignment
def is_phage_hypo(f):
    """
    Annotate every line of a tab-separated file with two 1/0 flags.

    The first column of each line is passed to is_phage_func and then to
    is_unknown_func. The line is printed with two extra columns appended:
    1 or 0 for the first answer, followed by 1 or 0 for the second.

    :param f: path of the tab-separated file to read
    :return: None
    """
    with open(f, 'r') as fin:
        for line in fin:
            fields = line.strip().split("\t")
            function = fields[0]
            phage_flag = 1 if is_phage_func(function) else 0
            unknown_flag = 1 if is_unknown_func(function) else 0
            print("\t".join(str(x) for x in fields + [phage_flag, unknown_flag]))
use strict;
use Getopt::Std;
use Data::Dumper;
use Rob;

# Print one "class<TAB>sample" pair for every class that is seen in a sample.
#
# The first version (count_contigs.pl) was wrong because it counted hits to
# multiple viruses more than once.
# Now we need to know which samples have e.g. Gokushaviridae
#

# get the contig information first since I need to work on the phables information
#

# mv_sequences.txt: column 0 is used as the sequence ID, column 1 as its class
open(IN, "mv_sequences.txt") || die "$! mv_sequences.txt";
my %class;
while (<IN>) {
    chomp;
    my @a=split /\t/;
    $class{$a[0]}=$a[1];
}
close IN;

# $count->{class}->{column 0 of the table} = 1 for each row whose column 3 is true.
# Using a hash of hashes means a class/sample pair is recorded once, however
# many rows support it.
my $count;
open(IN, "gunzip -c contig_count_table.tsv.gz|") || die "$! contig_count_table.tsv.gz";
while (<IN>) {
    chomp;
    my @a=split /\t/;
    next unless ($a[3]);
    if ($class{$a[1]}) {
        $count->{$class{$a[1]}}->{$a[0]} = 1;
    }
}
close IN;

foreach my $c (keys %$count) {
    foreach my $s (keys %{$count->{$c}}) {
        print "$c\t$s\n";
    }
}
=pod

contig_199070 is only present in these samples: '35536', '35613', '35634', '35658', '38046'

Can we find any more contigs that are only present in those samples, and not present elsewhere?

=cut

use strict;

open(IN, "gunzip -c contig_count_table.tsv.gz|") || die "$! contig_count_table.tsv.gz";
my $h = 1;   # true until the first (header) line has been skipped

# the samples we are interested in
my %w = (
    35536 => 1,
    35613 => 1,
    35634 => 1,
    35658 => 1,
    38046 => 1
);

my %want;    # contig => number of rows from wanted samples
my %other;   # contig => number of rows from any other sample

while (<IN>) {
    if ($h) {$h=0; next}
    # chomp first: if column 3 is the last column, "0\n" would be a true value
    chomp;
    my @a=split /\t/;
    next unless ($a[3]);
    if ($w{$a[0]}) {
        $want{$a[1]}++;
    } else {
        $other{$a[1]}++;
    }
}
close IN;

# a contig must be seen in every wanted sample (5 for the list above)
my $nwanted = scalar(keys %w);

print "Contig\tWAnted\tOther samples\n";
foreach my $c (keys %want) {
    if ($want{$c} == $nwanted) {
        # contigs never seen elsewhere have no entry in %other: report 0, not an empty field
        my $others = exists $other{$c} ? $other{$c} : 0;
        print "$c\t$want{$c}\t$others\n";
    }
}
use strict;
use Getopt::Std;
use Data::Dumper;
use Rob;

# Insert a "Samples" column (taken from most_abundant.txt) as the second
# column of virus_contig_annotations.tsv and print the result to stdout.

# most_abundant.txt: column 0 is the contig ID, column 1 the value to insert
my %count;
open(IN, "most_abundant.txt") || die "$! most_abundant.txt";
while (<IN>) {
    chomp;
    my @a=split /\t/;
    $count{$a[0]}=$a[1];
}
close IN;

open(IN, "virus_contig_annotations.tsv") || die "$! virus_contig_annotations.tsv";
while (<IN>) {
    # no chomp here: the last field keeps its newline so the joined line prints as-is
    my @a=split /\t/;
    if ($a[0] eq "contigID") {splice @a, 1, 0, "Samples"}      # header row
    elsif ($count{$a[0]}) {splice @a, 1, 0, $count{$a[0]}}
    else {splice @a, 1, 0, 0}                                  # contig not in most_abundant.txt
    print join("\t", @a);
}
close IN;

use strict;
use Getopt::Std;
use Data::Dumper;
use Rob;

# What are the most abundant contigs in terms of numbers of samples
# they are in
#
# Output: one "contig<TAB>number of samples" line per contig.


# $count->{column 1}->{column 0} = 1 for each row whose column 3 is true;
# the other scripts in this directory use column 0 as the sample and
# column 1 as the contig.
my $count;
open(IN, "gunzip -c contig_count_table.tsv.gz|") || die "$! contig_count_table.tsv.gz";
# The other scripts that read this table (find_pb_segment2.pl, limit_contigs.pl, ...)
# treat its first line as a header, so skip it here too rather than counting
# the column names as a contig.
my $h = 1;
while (<IN>) {
    if ($h) {$h=0; next}
    chomp;
    my @a=split /\t/;
    next unless ($a[3]);
    $count->{$a[1]}->{$a[0]} = 1;
}
close IN;

foreach my $c (keys %$count) {
    print "$c\t", scalar(keys %{$count->{$c}}), "\n";
}
| size 17184 4 | -------------------------------------------------------------------------------- /phage/metagenomes/pb199070.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/phage/metagenomes/pb199070.png -------------------------------------------------------------------------------- /phage/metagenomes/pb199070_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/phage/metagenomes/pb199070_2.png -------------------------------------------------------------------------------- /phage/metagenomes/pb199070_both.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/phage/metagenomes/pb199070_both.png -------------------------------------------------------------------------------- /phage/metagenomes/pb58328.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linsalrob/EdwardsLab/dba1b79519165b71a3e7e6fe0e9ce9d0b4fa4f01/phage/metagenomes/pb58328.png -------------------------------------------------------------------------------- /phage/metagenomes/phables_mv_samples.txt.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:c9a0aa9043ddd6c5dbf1f9736bec82de7f0ef5b2a30df0455fc098b93a04263c 3 | size 6722 4 | -------------------------------------------------------------------------------- /phage/metagenomes/pharokka_top_hits_mash_inphared.nonone.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid 
use strict;
use Getopt::Std;
use Data::Dumper;

# Print the header of contig_count_table.tsv.gz followed by every row whose
# contig (column 1) appears in column 1 of picobirnaviridae_rdrp.tblastn.gz
# and whose column 3 is true.

my %w;   # IDs from column 1 of the tblastn output
open(IN, "gunzip -c picobirnaviridae_rdrp.tblastn.gz |") || die "$! picobirnaviridae_rdrp.tblastn.gz";
while (<IN>) {
    chomp;
    my @a=split /\t/;
    $w{$a[1]}=1;
}
close IN;

open(IN, "gunzip -c contig_count_table.tsv.gz|") || die "$! contig_count_table.tsv.gz";
my $h = 1;   # true until the header line has been printed
while (<IN>) {
    if ($h) {print; $h=0; next}
    # lines are not chomped because matching rows are printed unchanged
    my @a=split /\t/;
    print if ($w{$a[1]} && $a[3]);
}
close IN;
rdrp.tblastn.tsv.gz"; 7 | while () { 8 | chomp; 9 | my @a=split /\t/; 10 | $w{$a[1]}=1; 11 | } 12 | close IN; 13 | 14 | open(IN, "gunzip -c contig_count_table.tsv.gz|") || die "$! contig_count_table.tsv.gz"; 15 | my $h = 1; 16 | while () { 17 | if ($h) {print; $h=0; next} 18 | my @a=split /\t/; 19 | print if ($w{$a[1]} && $a[3]); 20 | } 21 | close IN; 22 | -------------------------------------------------------------------------------- /phage/metagenomes/rdrp_wehave.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:443a327f46ad6857a7770b00ee320b93f334d8c1501c33ca285db1b392c02b0e 3 | size 757 4 | -------------------------------------------------------------------------------- /phage/metagenomes/sampleSeqCounts.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:b631e6bf9d4e71d5215c90f34f728918ac6aeeee7b54d5c2464ef2888eca7dfd 3 | size 6888 4 | -------------------------------------------------------------------------------- /phage/metagenomes/sample_genome_read_counts.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:6b8262dda1891f5df6db39c95a80c77c20039e61ce85b7fb966b8dec3832f82a 3 | size 248549 4 | -------------------------------------------------------------------------------- /phage/metagenomes/virus_contig_annotations.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:a8a99a180550f1efa60633903c3f106e6756a0515b087c50aa734283b30c0949 3 | size 1028088 4 | -------------------------------------------------------------------------------- /phage/metagenomes/virus_contig_annotations_samples.tsv.gz: -------------------------------------------------------------------------------- 
def bit_scores_len(blastf, verbose=False):
    """
    Print the query length and bit score of every self:self blast hit.

    Each result streamed from the blast file whose query and database IDs
    are identical is written to stdout as one tab-separated line of query
    length and bit score. Nothing is returned.

    :param blastf: the blast input file, handed to stream_blast_results
    :param verbose: passed straight through to stream_blast_results
    """

    for hit in stream_blast_results(blastf, verbose):
        if hit.query != hit.db:
            # only self hits are of interest
            continue
        print(f"{hit.query_length}\t{hit.bitscore}")
use strict;
use Getopt::Std;
use Data::Dumper;
use Rob;
my $rob = new Rob;

# Merge the count files given on the command line into a single table on
# stdout: one row per sample, one column per type, "not found" where a
# sample has no count for a type.
#
# Each input line is split on tabs; column 0 is a path that is split on "/"
# into type (first part) and sample (second part), column 1 is the count.

my $reads;        # $reads->{sample}->{type} = count
my %alltypes;
my %allsamples;
foreach my $f (@ARGV) {

    open(IN, $f) || die "$! : $f";
    while (<IN>) {
        chomp;
        s/.001.fast..gz//;# replace fastq or fasta
        s/.fast..gz//;
        s#//#/#g;
        my @a=split /\t/;
        my @b = split /\//, $a[0];
        $reads->{$b[1]}->{$b[0]}=$a[1];
        $allsamples{$b[1]}++;
        $alltypes{$b[0]}++;
    }
    close IN;
}

my @types = sort {$a cmp $b} keys %alltypes;
my @samples = sort {$a cmp $b} keys %allsamples;

print("Sample\t", join("\t", @types), "\n");
foreach my $s (@samples) {
    print $s;
    foreach my $t (@types) {
        my $value = (defined $reads->{$s}->{$t}) ? $reads->{$s}->{$t} : "not found";
        print "\t", $value;
    }
    print "\n";
}
#!/bin/bash
#SBATCH --job-name=CountSS
#SBATCH --time=0-1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
#SBATCH --mem=64G
#SBATCH --partition=short
#SBATCH -o count_ss-%j.out
#SBATCH -e count_ss-%j.err

# Single-task SLURM job that runs count_subsystems.pl with `-d mmseqs`.
# NOTE(review): `-d` is presumably the directory of mmseqs output, relative to
# the submission directory -- confirm against count_subsystems.pl.
# The script path is hard-coded to one user's checkout of EdwardsLab.
perl /home/edwa0468/GitHubs/EdwardsLab/process_EK_metagenomes/count_subsystems.pl -d mmseqs
#!/bin/bash
#SBATCH --job-name=mmseqs_ss
#SBATCH --time=0-10
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=2
#SBATCH --mem=16G
#SBATCH -o mmseqs_ss-%A_%a.out
#SBATCH -e mmseqs_ss-%A_%a.err

# Array job: task N takes line N of R1_reads.txt, strips $FILEEND to get the
# sample base name, and runs easy_taxonomy_to_function.py on that sample's
# mmseqs tophit report, writing a gzipped *_tophit_report_subsystems.gz file.

eval "$(conda shell.bash hook)"
conda activate bioinformatics

if [[ ! -e R1_reads.txt ]]; then
	echo "Please make a file with the R1 reads using this command:" >&2
	# single quotes so the backslashes and the inner "%f\n" are printed verbatim
	echo 'find fastq -name \*R1\* -printf "%f\n" > R1_reads.txt' >&2;
	exit 2;
fi

# without a task ID, head would fail and every path below would be built from an empty name
if [[ -z "${SLURM_ARRAY_TASK_ID:-}" ]]; then
	echo "SLURM_ARRAY_TASK_ID is not set: submit this script as an array job (sbatch --array=...)" >&2
	exit 2;
fi

# copy the database to $BGFS and read it from there
cp /home/edwa0468/UniRef/uniref.sqlite "$BGFS"
R1=$(head -n "$SLURM_ARRAY_TASK_ID" R1_reads.txt | tail -n 1)
FILEEND="_R1.fastq.gz";
#FILEEND="_R1_001.fastq.gz";
BASE=${R1/$FILEEND/}

python ~/GitHubs/EdwardsLab/mmseqs/easy_taxonomy_to_function.py -f "mmseqs/$BASE/${BASE}_tophit_report.gz" -d "$BGFS/uniref.sqlite" | gzip -c > "mmseqs/$BASE/${BASE}_tophit_report_subsystems.gz"
import sys
import argparse
import vamb

# Write one FASTA file per bin: read the cluster assignments, then hand the
# open FASTA reader to vamb to write every bin of at least `minsize`.
parser = argparse.ArgumentParser(
    description="""Command-line bin creator.
Will read the entire content of the FASTA file into memory - beware.""",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    add_help=False,
)

# all four arguments are positional and are registered in command-line order
for name, options in (
    ("fastapath", {"help": "Path to FASTA file"}),
    ("clusterspath", {"help": "Path to clusters.tsv"}),
    ("minsize", {"help": "Minimum size of bin", "type": int, "default": 0}),
    ("outdir", {"help": "Directory to create"}),
):
    parser.add_argument(name, **options)

# called with no arguments at all: show the usage and stop
if not sys.argv[1:]:
    parser.print_help()
    sys.exit()

args = parser.parse_args()

with open(args.clusterspath) as clusters_handle:
    clusters = vamb.vambtools.read_clusters(clusters_handle)

with vamb.vambtools.Reader(args.fastapath) as fasta_handle:
    vamb.vambtools.write_bins(
        args.outdir, clusters, fasta_handle, maxbins=None, minsize=args.minsize
    )
#!/bin/bash
#SBATCH --job-name=mmseqs_ss
#SBATCH --time=0-10
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=2
#SBATCH --mem=16G
#SBATCH -o mmseqs_ss-%A_%a.out
#SBATCH -e mmseqs_ss-%A_%a.err

# Add subsystem annotations to the mmseqs top-hit report of ONE sample.
# Run this as a slurm array job: array task N processes line N of R1_reads.txt.
# The read file ending (FILEEND) comes from DEFINITIONS.sh in the working directory.

eval "$(conda shell.bash hook)"
conda activate bioinformatics

if [[ ! -e R1_reads.txt ]]; then
	echo "Please make a file with the R1 reads using this command:" >&2
	# single quotes so the suggested command is printed verbatim, inner quotes included
	echo 'find fastq -name \*R1\* -printf "%f\n" > R1_reads.txt' >&2
	exit 2
fi


if [[ ! -e DEFINITIONS.sh ]]; then
	echo "Please create a DEFINITIONS.sh file with SOURCE, FILEEND, HOSTREMOVED" >&2
	exit 2
fi

source DEFINITIONS.sh

# an empty FILEEND would leave the whole file name as the sample name
if [[ -z "${FILEEND:-}" ]]; then
	echo "FILEEND is not defined in DEFINITIONS.sh" >&2
	exit 2
fi

# without an array index, head would be handed the file name as its line count
if [[ -z "${SLURM_ARRAY_TASK_ID:-}" ]]; then
	echo "SLURM_ARRAY_TASK_ID is not set. Please submit this script as a slurm array job" >&2
	exit 2
fi

# NOTE(review): BGFS is presumably a fast scratch directory exported by the cluster -- confirm
cp /home/edwa0468/UniRef/uniref.sqlite "$BGFS"

# line number SLURM_ARRAY_TASK_ID of R1_reads.txt is the R1 file for this task
R1=$(head -n "$SLURM_ARRAY_TASK_ID" R1_reads.txt | tail -n 1)
# strip the file ending to get the sample name used for the mmseqs output directory
BASE=${R1/$FILEEND/}

python ~/GitHubs/EdwardsLab/mmseqs/easy_taxonomy_to_function.py -f "mmseqs/$BASE/${BASE}_tophit_report.gz" -d "$BGFS/uniref.sqlite" | gzip -c > "mmseqs/$BASE/${BASE}_tophit_report_subsystems.gz"
# shell script so I can run all the snakemakes!
#
# Each section changes into one analysis directory, runs its snakemake
# workflow(s) with 12 jobs, and then returns to the starting directory.
# Every cd is checked: if a directory is missing we stop, instead of
# running a workflow from whatever directory we happen to be in.

WD=$PWD

cd phispy_metrics || exit 1
echo "Running phispy in phispy_metrics"
snakemake -s phispy_metrics.snakefile -j 12
snakemake -s phispy_no_metrics.snakefile -j 12
python3 summarize.py
cd "$WD" || exit 1

cd phispy_tests || exit 1
echo "Running phispy in phispy_tests"
snakemake -s /home3/redwards/GitHubs/EdwardsLab/prophages/phispy_training_vs_test.snakefile -j 12
cd "$WD" || exit 1


cd phispy_training_set || exit 1
echo "Running phispy in phispy_training_set";
snakemake -s /home3/redwards/GitHubs/EdwardsLab/prophages/phispy_with_training.snakefile -j 12
cd "$WD" || exit 1

cd phispy_phage_genes || exit 1
echo "Running phispy in phispy_phage_genes"
snakemake -s /home3/redwards/GitHubs/EdwardsLab/prophages/phispy_phage_genes.snakefile -j 12
cd "$WD" || exit 1


cd PhiSpy_SN || exit 1
echo "Running phispy in phispy_SN"
snakemake -s /home3/redwards/GitHubs/EdwardsLab/snakemake/phispy.snakefile -j 12
cd "$WD" || exit 1
42 | 43 | -------------------------------------------------------------------------------- /proxymeta/README.md: -------------------------------------------------------------------------------- 1 | # Proxymeta 2 | 3 | These scripts are for analyzing data from [Hi-C](https://www.biorxiv.org/content/biorxiv/early/2017/10/05/198713.full.pdf) provided by this paper _Hi-C deconvolution of a human gut microbiome yields high-quality draft 4 | genomes and reveals plasmid-genome interactions._ 5 | 6 | They have [three samples](https://www.ncbi.nlm.nih.gov/Traces/study/?acc=SAMN07736353) 7 | 8 | SRA ID | Library name | Type | Experiment | Library Selection | MBp | Reads 9 | ---|---|---|---|---|---|--- 10 | SRR6131124 | AD007 | OTHER | SRX3243492 | Restriction Digest | 12,019 | 5,602 11 | SRR6131123 | AD002 | WGS | SRX3243493 | RANDOM | 36,128 | 15,610 12 | SRR6131122 | AD012 | OTHER | SRX3243494 | Restriction Digest | 12,728 | 5,584 13 | 14 | As usual, they probably do neither what you expect nor what you need! 
PhiSpy
bcbio-gff>=0.6.6
biopython
bs4
crc64iso
datrie
h5py
ipython
jupyter
lxml
matplotlib
natsort
networkx
numpy
openpyxl
pandas
plotly
pybtex
pymongo
pyreadr
pysam
pytaxonkit
python-dateutil
pytz
requests
scikit-bio
scikit-learn
scipy
seaborn
# "sklearn" used to be listed here. It is a deprecated PyPI alias that no longer
# installs; the scikit-learn entry above provides the sklearn module.
suffix_trees
tk
xmltodict
class bcolors(object):
    """
    ANSI escape sequences for colouring terminal text.

    Put one of these before the text and ENDC after it to switch the
    formatting off again, e.g. f"{bcolors.RED}oops{bcolors.ENDC}"
    """

    # names that describe the purpose of the message
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # the same codes again, named by colour
    PINK = '\033[95m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    # same code as ENDC
    WHITE = '\033[0m'
class Error(Exception):
    """
    Base class for exceptions in this module.
    """
    pass


class SequencePairError(Error):
    """
    Exception raised for sequences not being paired properly.

    :param message: explanation of the error
    """

    def __init__(self, message):
        self.message = message
        super().__init__(self.message)


class FastqFormatError(Error):
    """
    Exception raised when fastq data is not in the expected format.

    :param message: explanation of the error
    """

    def __init__(self, message):
        self.message = message
        super().__init__(self.message)
def ascii_clean(s):
    """
    Remove non-ascii characters from a string.

    Only the characters in string.printable are kept, so ascii control
    characters other than whitespace are removed too.

    :param s: the string to clean
    :return: a new string made of the characters that were kept
    """
    printable = set(string.printable)
    # join the kept characters: in python 3 filter() on its own returns a
    # lazy iterator, not a string
    return ''.join(c for c in s if c in printable)
def choose_a_file(dialog_title="Choose a file..."):
    """
    Show an "open file" dialog and return the user's choice.

    :param dialog_title: the title of the dialog window
    :return: the value returned by tkinter's askopenfilename
    """

    root = tk.Tk()

    filetypes = (
        ('Text files', '*.TXT'),
        ('All files', '*.*'),
    )

    try:
        filename = tk.filedialog.askopenfilename(
            title=dialog_title,
            filetypes=filetypes,
        )
    finally:
        # always remove the root window we made, even if the dialog fails
        root.destroy()

    return filename


def write_a_file(dialog_title="Choose where to save the file..."):
    """
    Show a "save as" dialog and return the user's choice.

    :param dialog_title: the title of the dialog window
    :return: the value returned by tkinter's asksaveasfilename
    """

    # make our own root window and destroy it afterwards, as choose_a_file does
    root = tk.Tk()

    filetypes = (
        ('TSV files', '*.TSV'),
        ('XLS files', '*.XLS'),
        ('All files', '*.*'),
    )

    try:
        filename = tk.filedialog.asksaveasfilename(
            title=dialog_title,
            filetypes=filetypes, defaultextension=".tsv"
        )
    finally:
        root.destroy()

    return filename
#!/usr/bin/perl -w
#
# Print every job returned by the RAST server for a user, and report to
# STDERR how many seconds have elapsed as each job is printed.
#

use strict;
use RASTserver;
use Term::ReadKey;
use Data::Dumper;
$ENV{SAS_SERVER}="PUBSEED";

print "Please enter your RAST username: ";
my $user = ReadLine(0);
chomp $user;

print "Please enter your RAST password: ";
# ReadMode 2 turns echo off while the password is typed; ReadMode 1 turns it back on
ReadMode 2;
my $password = ReadLine(0);
chomp $password;
ReadMode 1;
print "\n";

my $rast=new RASTserver($user, $password);
unless (defined $rast) {die "Can't connect to the rast server"}

my $time = time; my $job = 0;
my @jobs = $rast->jobs();

foreach my $j (@jobs) {
	print Dumper($j);
	# elapsed time is now minus the start time (the other way round is never positive)
	print STDERR $job++, " : ", (time() - $time), " seconds\n";
}
def occ_to_roles(roles):
    """
    Ask the SAP server for the occurrences of each role.

    :param roles: the roles to look up, passed to the server as '-roles'
    :return: the server's reply, unchanged. The caller in this file uses it
        as a mapping from each role to a collection of strings.
    """

    query = {'-roles' : roles}
    return SAPserver().occ_of_role(query)
expand(os.path.join(outdir, "{sample}.{db}.abricate.tsv"), sample=SEQS, db=databases) 27 | 28 | rule abricate: 29 | input: 30 | os.path.join(indir, "{sample}") 31 | output: 32 | os.path.join(outdir, "{sample}.{db}.abricate.tsv") 33 | params: 34 | db = "{db}" 35 | shell: 36 | "abricate --noheader --nopath --db {params.db} {input} > {output}" 37 | -------------------------------------------------------------------------------- /snakemake/annotate_phages.snakefile: -------------------------------------------------------------------------------- 1 | """ 2 | Snakefile to upload all the genomes to PATRIC 3 | """ 4 | 5 | 6 | # where is the data 7 | FASTADIR = config['fasta'] 8 | OUTPUTDIR = config['output'] 9 | 10 | 11 | FASTA, = glob_wildcards(os.path.join(FASTADIR, '{fasta}.fasta')) 12 | 13 | rule all: 14 | input: 15 | 16 | 17 | -------------------------------------------------------------------------------- /snakemake/cluster_phages.snakefile: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | import os 5 | 6 | gbk = "phage_100_genbank" 7 | faa = "proteins" 8 | fna = "nucleotides" 9 | PHAGES, = glob_wildcards(os.path.join(gbk, '{phage}_phage.gbk')) 10 | 11 | 12 | rule all: 13 | input: 14 | expand(os.path.join(fna, '{phage}.fna'), phage=PHAGES) 15 | 16 | rule gbk2faa: 17 | input: 18 | os.path.join(gbk, '{phage}_phage.gbk') 19 | output: 20 | faa = os.path.join(faa, '{phage}.faa'), 21 | fna = os.path.join(fna, '{phage}.fna') 22 | shell: 23 | """ 24 | python3 ~/GitHubs/EdwardsLab/bin/genbank2sequences.py -g {input} -a {output.faa} -n {output.fna} -c 25 | """ 26 | 27 | -------------------------------------------------------------------------------- /snakemake/envs/bowtie.yaml: -------------------------------------------------------------------------------- 1 | name: bowtie 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - default 6 | dependencies: 7 | # There is an issue with newer tbb libraries. 
This might go away at some point: https://www.biostars.org/p/494922/ 8 | - tbb=2020.2 9 | - bowtie2 10 | - samtools 11 | -------------------------------------------------------------------------------- /snakemake/envs/canu.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | dependencies: 5 | - canu 6 | -------------------------------------------------------------------------------- /snakemake/envs/filtlong.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | dependencies: 5 | - filtlong 6 | -------------------------------------------------------------------------------- /snakemake/envs/flye.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | dependencies: 5 | - flye 6 | -------------------------------------------------------------------------------- /snakemake/envs/focus.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - bioconda 3 | dependencies: 4 | - focus 5 | -------------------------------------------------------------------------------- /snakemake/envs/kraken.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - bioconda 3 | dependencies: 4 | - kraken2 5 | -------------------------------------------------------------------------------- /snakemake/envs/megahit.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - bioconda 3 | dependencies: 4 | - megahit 5 | -------------------------------------------------------------------------------- /snakemake/envs/miniasmminipolish.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | dependencies: 5 | - miniasm 6 | - minipolish 
7 | - any2fasta 8 | -------------------------------------------------------------------------------- /snakemake/envs/minimap.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - bioconda 3 | dependencies: 4 | - minimap2 5 | -------------------------------------------------------------------------------- /snakemake/envs/prinseq.yaml: -------------------------------------------------------------------------------- 1 | name: prinseq-plus-plus 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - default 6 | dependencies: 7 | - prinseq-plus-plus 8 | -------------------------------------------------------------------------------- /snakemake/envs/raven.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | dependencies: 5 | - raven-assembler 6 | -------------------------------------------------------------------------------- /snakemake/envs/seqtk.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - bioconda 3 | dependencies: 4 | - seqtk 5 | -------------------------------------------------------------------------------- /snakemake/envs/superfocus.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - bioconda 3 | dependencies: 4 | - super-focus 5 | -------------------------------------------------------------------------------- /snakemake/envs/trycycler.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | dependencies: 5 | - miniasm 6 | - mash 7 | - minimap2 8 | - muscle 9 | - r-ape 10 | - r-phangorn 11 | -------------------------------------------------------------------------------- /snakemake/phispy.yaml: -------------------------------------------------------------------------------- 1 | directories: 2 | fasta_files: fasta 3 | genbank_files: 
genbank 4 | phispy_files: phispy 5 | gto_files : gto 6 | number_of_genomes : 200 7 | 8 | -------------------------------------------------------------------------------- /snakemake/process_metagenomes.json: -------------------------------------------------------------------------------- 1 | { 2 | "tatabox_executables" : { 3 | "assembler" : "/usr/local/genome/megahit/current/bin/megahit", 4 | "mmseqs" : "/usr/local/genome/mmseqs2/mmseqs/bin/mmseqs" 5 | }, 6 | "anth_executables" : { 7 | "assembler" : "/home3/redwards/opt/megahit/current/bin/megahit", 8 | "mmseqs" : "/home3/redwards/opt/mmseqs/current/bin/mmseqs", 9 | "bowtie2-build" : "/usr/local/bowtie2/bin/bowtie2-build", 10 | "bowtie2" : "/usr/local/bowtie2/bin/bowtie2" 11 | }, 12 | "directories" : { 13 | "Reads" : "fastq", 14 | "round1_assembly_output" : "assembly.1", 15 | "round1_contig_read_mapping" : "reads.contigs.1", 16 | "round2_unassembled_reads" : "unassembled_reads", 17 | "round2_assembly_output" : "reassembled_reads", 18 | "combined_contig_merging" : "final.combined_contigs" 19 | }, 20 | "threads" : 8 21 | } 22 | 23 | -------------------------------------------------------------------------------- /snakemake_tests/test.snakefile: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | A simple test 4 | 5 | """ 6 | 7 | VAR = "hello world" 8 | 9 | rule all: 10 | input: 11 | "ls.txt" 12 | 13 | rule lsd: 14 | input: 15 | "bac_giant_unique_species" 16 | output: 17 | "ls.txt" 18 | shell: 19 | #"/bin/ls {input} > {output}" 20 | "echo {VAR} > {output}" 21 | -------------------------------------------------------------------------------- /sra/SRA.partie.tsv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:f8f55f7174d6960c8d182661c5c1346a053f12d594c8c9b8b8445377872e9607 3 | size 5169243 4 | -------------------------------------------------------------------------------- 
"""
Custom exceptions for taxonomy parsing
"""

class Error(Exception):
    """Base class for other exceptions"""
    pass


class EntryNotInDatabaseError(Error):
    """
    Entry not in the db. Obvs

    :param message: explanation of the error
    """

    def __init__(self, message):
        self.message = message
        super().__init__(self.message)


class NoNameFoundError(Error):
    """
    No name was found for this entry

    :param message: explanation of the error
    """

    def __init__(self, message):
        self.message = message
        super().__init__(self.message)
def get_db_dir():
    """
    Find the location of the taxonomy database from the environment.

    The NCBI_TAXONOMY environment variable is checked first and then
    TAXONKIT_DB. If a variable is set but the path it names does not exist,
    a warning is written to stderr and the next variable is tried.

    :return: the first of those paths that exists, or None if neither does
    """

    for variable in ('NCBI_TAXONOMY', 'TAXONKIT_DB'):
        if variable not in os.environ:
            continue
        location = os.environ[variable]
        if os.path.exists(location):
            return location
        print(f"WARNING: {variable} variable is set but {location} does not exist", file=sys.stderr)

    return None
"file 2?"; 5 | 6 | my %data; my %first; 7 | open(IN, $f1) || die "cant open $f1"; 8 | while () { 9 | $data{$_}=1; 10 | my @a=split /\t/; 11 | $first{$a[0]}=$_; 12 | } 13 | close IN; 14 | 15 | open(IN, $f2) || die "cant open $f2"; 16 | while () { 17 | if (!$data{$_}) { 18 | my @a=split /\t/; 19 | if ($first{$a[0]}) { 20 | print "\n$first{$a[0]}$_\n"; 21 | } 22 | else { 23 | print STDERR "NONE: $_"; 24 | } 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /testrepeatfinder/files.txt: -------------------------------------------------------------------------------- 1 | /home/redwards/.local/lib/python3.7/site-packages/repeatFinder.cpython-37m-x86_64-linux-gnu.so 2 | /home/redwards/.local/lib/python3.7/site-packages/repeatfinder-1.0.0.egg-info 3 | -------------------------------------------------------------------------------- /testrepeatfinder/repeatFinder.h: -------------------------------------------------------------------------------- 1 | 2 | 3 | #ifndef PHISPYREPEATFINDER_H 4 | #define PHISPYREPEATFINDER_H 5 | 6 | static PyObject * python_input(PyObject *self, PyObject *args); 7 | 8 | static PyMethodDef PhiSpyRepeatFinderMethods[] = { 9 | {"repeatFinder", python_input, METH_VARARGS, "Python interface for C++ repeat finder for PhiSpy"}, 10 | {NULL, NULL, 0, NULL} 11 | }; 12 | 13 | static struct PyModuleDef PhiSpyRepeatFinderModule = { 14 | PyModuleDef_HEAD_INIT, 15 | "repeatFinder", 16 | "Python for a C++ repeat finder used by PhiSpy to identify potential prophage ends", 17 | -1, 18 | PhiSpyRepeatFinderMethods 19 | }; 20 | 21 | #endif //PHISPYREPEATFINDER_H -------------------------------------------------------------------------------- /testrepeatfinder/setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup, Extension 2 | 3 | def main(): 4 | setup(name="RobRepeatFinder", 5 | version="1.0.0", 6 | description="Python interface for repeatFinder", 
7 | author="Rob Edwards", 8 | author_email="raedwards@gmail.com", 9 | ext_modules=[Extension("RobRepeatFinder", sources=["repeatFinder.cpp"], language='c++')]) 10 | 11 | if __name__ == "__main__": 12 | main() 13 | -------------------------------------------------------------------------------- /testrepeatfinder/test.fasta: -------------------------------------------------------------------------------- 1 | >sequence 2 | AAAAAAAAAAATGCATGCATGCATCGTCAGCATCGACATGGCTACTTTTTTTTTTT 3 | -------------------------------------------------------------------------------- /testrepeatfinder/test.fasta.repeatfinder: -------------------------------------------------------------------------------- 1 | 1 11 56 46 2 | -------------------------------------------------------------------------------- /testrepeatfinder/test_repeatfinder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test the implementation of the repeatfinder extension 3 | """ 4 | 5 | import os 6 | import sys 7 | import argparse 8 | 9 | from roblib import bcolors 10 | 11 | import RobRepeatFinder 12 | import pprint 13 | 14 | s = "TTTTTTTTTTTTagcaTTTTTTTTTTTT" 15 | print(f"s: {s}") 16 | r = RobRepeatFinder.repeatFinder(s, 0) 17 | pp = pprint.PrettyPrinter(indent=4) 18 | pp.pprint(r) 19 | 20 | -------------------------------------------------------------------------------- /text_matching/vfdb.txt.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:769e678ac6583edd63005c0a8eb5ce5f8611bd39e658d308ddf1ae3cd4766381 3 | size 147539 4 | --------------------------------------------------------------------------------