├── .gitignore
├── LICENSE
├── README.md
├── bit
│   ├── bit-GL-combine-KO-and-tax-tables
│   ├── bit-GL-combine-contig-tax-tables
│   ├── bit-calc
│   ├── bit-calc-gc-per-sequence
│   ├── bit-calc-gc-sliding-window
│   ├── bit-calc-variation-in-msa
│   ├── bit-check-for-fastq-dup-headers
│   ├── bit-colnames
│   ├── bit-combine-bracken-and-add-lineage
│   ├── bit-combine-go-summaries
│   ├── bit-combine-kraken2-taxon-summaries
│   ├── bit-count-bases
│   ├── bit-count-bases-per-seq
│   ├── bit-cov-analyzer
│   ├── bit-data-locations
│   ├── bit-dedupe-fasta-headers
│   ├── bit-dl-ncbi-assemblies
│   ├── bit-extract-seqs-by-coords
│   ├── bit-ez-screen
│   ├── bit-fasta-to-bed
│   ├── bit-fasta-to-genbank
│   ├── bit-figshare-upload
│   ├── bit-filter-KOFamScan-results
│   ├── bit-filter-seqs-by-length
│   ├── bit-filter-table
│   ├── bit-gen-iToL-binary-dataset
│   ├── bit-gen-iToL-colorstrip
│   ├── bit-gen-iToL-map
│   ├── bit-gen-iToL-text-dataset
│   ├── bit-gen-kraken2-tax-plots
│   ├── bit-gen-reads
│   ├── bit-genbank-locus-clean-slate
│   ├── bit-genbank-to-AA-seqs
│   ├── bit-genbank-to-cds-table
│   ├── bit-genbank-to-fasta
│   ├── bit-get-accessions-from-GTDB
│   ├── bit-get-cov-stats
│   ├── bit-get-go-term-info
│   ├── bit-get-lineage-from-taxids
│   ├── bit-get-test-data
│   ├── bit-get-workflow
│   ├── bit-gff-to-anvio
│   ├── bit-kraken2-to-taxon-summaries
│   ├── bit-lineage-to-tsv
│   ├── bit-mutate-seqs
│   ├── bit-normalize-table
│   ├── bit-parse-fasta-by-headers
│   ├── bit-parse-fastq-by-headers
│   ├── bit-prot-acc-to-taxid
│   ├── bit-remove-wraps
│   ├── bit-rename-fasta-headers
│   ├── bit-reorder-fasta
│   ├── bit-slim-down-go-terms
│   ├── bit-split-multifasta
│   ├── bit-summarize-assembly
│   ├── bit-summarize-column
│   ├── bit-summarize-go-annotations
│   ├── bit-update-go-dbs
│   ├── bit-update-ncbi-taxonomy
│   ├── bit-version
│   ├── helper-bit-check-or-setup-GTDB-files.py
│   ├── helper-bit-combine-bracken.py
│   ├── helper-bit-dl-ncbi-assemblies-parallel.sh
│   ├── helper-bit-get-ncbi-assembly-tables
│   ├── helper-bit-get-ncbi-tax-data
│   ├── helper-bit-parse-assembly-summary-file.py
│   ├── helper-bit-setup-GO-dbs
│   └── helper-bit-update-tax-table-for-seqscreen-go-tax-summary.sh
├── images
│   ├── bit-cov-analyzer.pdf
│   ├── bit-cov-analyzer.png
│   ├── bit-metagenomics-overview.afdesign
│   ├── bit-metagenomics-overview.pdf
│   └── bit-metagenomics-overview.png
├── test-data
│   ├── ez-screen-assembly.fasta
│   ├── ez-screen-targets.fasta
│   ├── kraken-example-out.tsv
│   └── kraken-example.report
└── workflows
    ├── genome-summarize-wf
    │   ├── README.md
    │   ├── Snakefile
    │   ├── config.yaml
    │   ├── envs
    │   │   ├── bit.yaml
    │   │   ├── cat.yaml
    │   │   ├── checkm2.yaml
    │   │   ├── eukcc.yaml
    │   │   └── gtdb-tk.yaml
    │   └── scripts
    │       ├── combine-euk-outputs.py
    │       ├── combine-outputs.py
    │       └── slurm-status.py
    ├── metagenomics-wf
    │   ├── CHANGELOG.md
    │   ├── README.md
    │   ├── Snakefile
    │   ├── config.yaml
    │   ├── config
    │   │   └── multiqc.config
    │   ├── envs
    │   │   ├── bit.yaml
    │   │   ├── cat.yaml
    │   │   ├── checkm2.yaml
    │   │   ├── gtdb-tk.yaml
    │   │   ├── keggdecoder.yaml
    │   │   ├── kofamscan.yaml
    │   │   ├── mapping.yaml
    │   │   ├── megahit.yaml
    │   │   ├── metabat.yaml
    │   │   ├── prodigal.yaml
    │   │   └── qc.yaml
    │   └── scripts
    │       ├── combine-benchmarks.sh
    │       ├── download-gtdbtk-refs.sh
    │       ├── format-contig-tax-classifications.sh
    │       ├── format-gene-tax-classifications.sh
    │       ├── generate-assembly-based-overview-table.sh
    │       ├── parse-MAG-annots.py
    │       ├── slurm-status.py
    │       └── swap-MAG-IDs.py
    └── sra-download-wf
        ├── CHANGELOG.md
        ├── README.md
        ├── Snakefile
        ├── config.yaml
        ├── envs
        │   └── sra-dl.yaml
        └── scripts
            ├── combine-benchmarks.sh
            └── combine-sra-accessions.sh
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 |
--------------------------------------------------------------------------------
/bit/bit-calc:
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [ "$#" == 0 ] || [ $1 == "-h" ]; then 4 | printf "\n Uses \`awk\` for quick calculations at the command line. For version\n" 5 | printf " info run \`bit-version\`.\n\n" 6 | printf " Usage:\n\t bit-calc \"(5+5)/2\"\n\n" 7 | exit 8 | fi 9 | 10 | awk "BEGIN { print $1 }" 11 | -------------------------------------------------------------------------------- /bit/bit-calc-gc-per-sequence: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from Bio import SeqIO 4 | import sys 5 | import argparse 6 | 7 | parser = argparse.ArgumentParser(description = "This script takes a nucleotide multifasta and returns \ 8 | a tab-delimited file with 3 columns: header, sequence length, \ 9 | and GC. For version info, run `bit-version`.") 10 | 11 | required = parser.add_argument_group('required arguments') 12 | 13 | required.add_argument("-i", "--input-fasta", metavar = "", help = "fasta file", action = "store", required = True) 14 | parser.add_argument("-o", "--output-file", metavar = "", help = 'Name of output tsv file (default: "GC-out.tsv")', 15 | action = "store", default = "GC-out.tsv") 16 | 17 | if len(sys.argv)==1: 18 | parser.print_help(sys.stderr) 19 | sys.exit(0) 20 | 21 | args = parser.parse_args() 22 | 23 | with open(args.input_fasta, "r") as in_fasta: 24 | 25 | with open(args.output_file, "w") as out_file: 26 | 27 | out_file.write("header" + "\t" + "length" + "\t" + "gc" + "\n") 28 | 29 | for cur_record in SeqIO.parse(in_fasta, "fasta"): 30 | gene_name = cur_record.name 31 | A_count = cur_record.seq.count('A') 32 | C_count = cur_record.seq.count('C') 33 | G_count = cur_record.seq.count('G') 34 | T_count = cur_record.seq.count('T') 35 | length = len(cur_record.seq) 36 | gc_percentage = float(G_count + C_count) / length 37 | gc_percentage = round(gc_percentage,2) 38 | out_file.write(str(gene_name)+"\t"+str(length)+"\t"+str(gc_percentage)+"\n") 39 | -------------------------------------------------------------------------------- /bit/bit-calc-gc-sliding-window: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from Bio import SeqIO 4 | import sys 5 | import argparse 6 | 7 | parser = argparse.ArgumentParser(description = "This script is for nucleotide multifastas and will return \ 8 | a tab-delimited file with 4 columns: header, sequence length, \ 9 | gc of whole sequence, and gc of each window of the specified \ 10 | window size (-w) for each step of the specified step size (-s). 
\ 11 | For version info, run `bit-version`.") 12 | 13 | required = parser.add_argument_group('required arguments') 14 | 15 | required.add_argument("-i", "--input-fasta", metavar = "", help = "fasta file", action = "store", required = True) 16 | required.add_argument("-o", "--output-file", metavar = "", help = "Name of output tsv file", action = "store", required = True) 17 | parser.add_argument("-w", "--window-size", metavar = "", help = "Desired size of sliding window (default: 100)", action = "store", dest = "window", default = 100) 18 | parser.add_argument("-s", "--step-size", metavar = "", help = "Desired size of steps between each window (default: 1)", action = "store", dest = "step", default = 1) 19 | 20 | if len(sys.argv)==1: 21 | parser.print_help(sys.stderr) 22 | sys.exit(0) 23 | 24 | args = parser.parse_args() 25 | 26 | window = int(args.window) 27 | step = int(args.step) 28 | 29 | half_window = int(window / 2) 30 | 31 | with open(args.input_fasta, "r") as in_fasta: 32 | 33 | with open(args.output_file, "w") as out_file: 34 | 35 | out_file.write(f"header\tlength\tgc\tgc_per_window_of_size_{window}_with_step_of_size_{step}\n") 36 | 37 | for cur_record in SeqIO.parse(in_fasta, "fasta"): 38 | gene_name = cur_record.name 39 | cur_record.seq = cur_record.seq.upper() 40 | A_count = cur_record.seq.count('A') 41 | C_count = cur_record.seq.count('C') 42 | G_count = cur_record.seq.count('G') 43 | T_count = cur_record.seq.count('T') 44 | length = len(cur_record.seq) 45 | gc_percentage = float(G_count + C_count) / length 46 | gc_percentage = round(gc_percentage,2) 47 | 48 | values = [] 49 | 50 | for i in range(0, len(cur_record.seq), step): 51 | 52 | s = cur_record.seq[i - half_window : i + half_window] 53 | s = s.upper() 54 | g = s.count('G') 55 | c = s.count('C') 56 | try: 57 | window_gc_perc = float(g + c) / window 58 | except ZeroDivisionError: 59 | window_gc_perc = 0.0 60 | values.append(window_gc_perc) 61 | 62 | out_file.write(f"{gene_name}\t{length}\t{gc_percentage}\t{values}\n") 63 | -------------------------------------------------------------------------------- /bit/bit-calc-variation-in-msa: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from skbio import TabularMSA, DNA, Protein 4 | import pandas as pd 5 | import argparse 6 | import sys 7 | 8 | parser = argparse.ArgumentParser(description='This script takes an alignment in fasta format as input and returns the Shannon uncertainty values for each column \ 9 | using: http://scikit-bio.org/docs/0.5.3/generated/skbio.alignment.TabularMSA.conservation.html. In output "variation" column: 0 is \ 10 | same character in all sequences for that position (highest conservation); 1 is equal probability of any character \ 11 | (greatest variability). "Conservation" column is inverse. As written, any ambiguous bases or residues are converted to gap characters. 
\ 12 | For version info, run `bit-version`.') 13 | 14 | required = parser.add_argument_group('required arguments') 15 | 16 | required.add_argument("-i", "--input_alignment_fasta", metavar = "", help = "Input alignment fasta file", action = "store", dest = "input_alignment_fasta", required = True) 17 | 18 | parser.add_argument("-g", "--gap_treatment", metavar = "", help = 'How to treat gaps, either "nan", "ignore", "error", "include" (default: "ignore")', choices = ["nan", "ignore", "error", "include"], action = "store", dest = "gap_treatment", default = "ignore") 19 | parser.add_argument("-t", "--type", metavar = "", help = 'Either "DNA" or "Protein" (default: "Protein")', choices = ["DNA", "Protein"], action = "store", dest = "type", default = "Protein") 20 | parser.add_argument("-o", "--output_file", metavar = "", help = 'Name of output tab-separated file (default: "variation.tsv")', action = "store", dest = "output_tsv", default = "variation.tsv") 21 | 22 | if len(sys.argv)==1: 23 | parser.print_help(sys.stderr) 24 | sys.exit(0) 25 | 26 | args = parser.parse_args() 27 | 28 | # i'm not certain unequal alignments are all that would throw this error, so i'm leaving this out for now so skbio just spits out their problem if they have one reading in the alignment 29 | # try: 30 | # msa = TabularMSA.read(args.input_alignment_fasta, constructor=DNA) 31 | # except ValueError: 32 | # print('\n\tSorry, it seems not all sequences in the alignment are the same length... :(\n') 33 | # sys.exit(1) 34 | 35 | msa = TabularMSA.read(args.input_alignment_fasta, constructor=eval(args.type), lowercase=True) 36 | 37 | list_of_cleaned_seqs = [] 38 | 39 | # converting degenerate bases to gaps 40 | for seq in msa: 41 | 42 | seq = seq.replace(seq.degenerates(), "-") 43 | list_of_cleaned_seqs.append(seq) 44 | 45 | clean_msa = TabularMSA(list_of_cleaned_seqs) 46 | 47 | conserved = clean_msa.conservation(gap_mode=args.gap_treatment) 48 | indexes = list(range(1,clean_msa.shape[1] + 1)) 49 | 50 | df = pd.DataFrame({"position": indexes, "variation":1 - conserved, "conservation": conserved}) 51 | 52 | df.to_csv(args.output_tsv, sep="\t", index=False) 53 | -------------------------------------------------------------------------------- /bit/bit-check-for-fastq-dup-headers: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from Bio import SeqIO 4 | import sys 5 | import argparse 6 | import gzip 7 | 8 | class colors: 9 | GREEN = '\033[0;32m' 10 | YELLOW = '\033[0;33m' 11 | NC = '\033[0m' 12 | 13 | parser = argparse.ArgumentParser(description = 'This script is just for checking if there are any duplicate headers in a fastq file. 
\ 14 | For version info, run `bit-version`.') 15 | 16 | required = parser.add_argument_group('required arguments') 17 | 18 | required.add_argument("-i", "--input-fastq", metavar = "", help = "Fastq file", action = "store", required = True) 19 | parser.add_argument("--not-gzipped", help = "Add this flag if the input fastq is not gzipped (program expects they are gzipped by default)", action = "store_true") 20 | parser.add_argument("--write-dupes", help = "Add this flag if you want duplicate headers written to a file (will write to 'duplicate-headers.txt')", action = "store_true") 21 | 22 | if len(sys.argv)==1: 23 | parser.print_help(sys.stderr) 24 | sys.exit(0) 25 | 26 | args = parser.parse_args() 27 | 28 | headers_dict = {} 29 | seq_count = 0 30 | 31 | if args.not_gzipped: 32 | 33 | with open(args.input_fastq, "rt") as fastq_in: 34 | 35 | for seq_record in SeqIO.parse(fastq_in, "fastq"): 36 | 37 | seq_count += 1 38 | 39 | if seq_record.id in headers_dict: 40 | headers_dict[seq_record.id] += 1 41 | 42 | else: 43 | headers_dict[seq_record.id] = 1 44 | 45 | else: 46 | 47 | with gzip.open(args.input_fastq, "rt") as fastq_in: 48 | 49 | for seq_record in SeqIO.parse(fastq_in, "fastq"): 50 | 51 | seq_count += 1 52 | 53 | if seq_record.id in headers_dict: 54 | headers_dict[seq_record.id] += 1 55 | 56 | else: 57 | headers_dict[seq_record.id] = 1 58 | 59 | 60 | dup_keys = [k for k,v in headers_dict.items() if v > 1] 61 | 62 | if len(dup_keys) > 0: 63 | 64 | if len(dup_keys) == 1: 65 | print(colors.YELLOW + "\n There was 1 duplicate header among the " + str(seq_count) + " input fastq entries:\n\n " + colors.NC + str(dup_keys[0]) + "\n") 66 | 67 | else: 68 | 69 | print(colors.YELLOW + "\n There were " + str(len(dup_keys)) + " duplicate headers among the " + str(seq_count) + " input fastq entries." 
+ colors.NC) 70 | 71 | if not args.write_dupes: 72 | print(" If you'd like to know which ones, add the `--write-dupes` flag.\n") 73 | 74 | else: 75 | 76 | with open("duplicate-headers.txt", "w") as out: 77 | out.write("\n".join(dup_keys)) 78 | out.write("\n") 79 | 80 | print(" They were written to 'duplicate-headers.txt'.\n") 81 | 82 | else: 83 | 84 | print(colors.GREEN + "\n There were no duplicate headers detected among the " + str(seq_count) + " input fastq entries :)\n" + colors.NC) 85 | -------------------------------------------------------------------------------- /bit/bit-colnames: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | GREEN='\033[0;32m' 4 | RED='\033[0;31m' 5 | NC='\033[0m' 6 | 7 | if [ "$#" == 0 ] || [ $1 == "-h" ]; then 8 | printf "\n This script returns the column names (with number) from a tab-separated file.\n" 9 | printf " For version info, run \`bit-version\`.\n\n" 10 | printf " Usage:\n\t bit-colnames input.tsv\n\n" 11 | exit 12 | fi 13 | 14 | if [ -f $1 ]; then 15 | 16 | head -n1 $1 | tr "\t" "\n" | cat -n 17 | 18 | else 19 | echo -e " ${RED}Input file not found :/${NC}" >&2 20 | exit 1 21 | fi 22 | -------------------------------------------------------------------------------- /bit/bit-combine-bracken-and-add-lineage: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # setting some colors 4 | RED='\033[0;31m' 5 | GREEN='\033[0;32m' 6 | NC='\033[0m' 7 | 8 | 9 | ### HELP INFO 10 | ## called by program name with no arguments or with "-h" as only positional argument ## 11 | if [ "$#" == 0 ] || [ $1 == "-h" ] || [ $1 == "help" ]; then 12 | 13 | printf "\n -------------------------------- HELP INFO --------------------------------- \n\n" 14 | printf " This program combines multiple bracken sample outputs and adds full lineage info.\n" 15 | printf " The local NCBI taxonomy database that \`taxonkit\` uses can be updated at any time\n" 16 | printf " with \`bit-update-ncbi-taxonomy\`. Recommended at least weekly. For version info,\n" 17 | printf " run \`bit-version\`." 18 | 19 | printf "\n Required:\n\n" 20 | printf " – [-i ] An input file holding either a single column with the input\n" 21 | printf " file names, or a tab-delimited 2-column file holding input\n" 22 | printf " filenames in column 1 and the wanted sample names in column 2.\n" 23 | 24 | printf "\n Optional:\n\n" 25 | 26 | printf ' - [-o ] Specify the output file name. Default: "Combined-taxonomy.tsv"\n' 27 | printf ' - [-d ] Specify the taxonkit database location. Default: "~/.taxonkit"\n' 28 | 29 | printf "\n Example usage:\n\n\t bit-combine-bracken-and-add-lineage -i input-files.tsv\n\n" 30 | 31 | exit 32 | fi 33 | 34 | ### PARSING ARGUMENTS 35 | ## setting defaults 36 | output_file="Combined-taxonomy.tsv" 37 | database="~/.taxonkit" 38 | 39 | while getopts :i:o:d: args 40 | do 41 | case "${args}" 42 | in 43 | i) input_file=${OPTARG};; 44 | o) output_file=${OPTARG};; 45 | d) database=${OPTARG};; 46 | \?) printf "\n ${RED}Invalid argument: -${OPTARG}${NC}\n\n Run with no arguments or '-h' only to see help menu.\n\n" >&2 && exit 47 | esac 48 | done 49 | 50 | 51 | ### CHECKING REQUIRED INPUT WAS PROVIDED 52 | if [ ! 
-n "$input_file" ]; then 53 | printf "\n ${RED}You need to provide an input file to '-i' :(${NC}\n" 54 | printf "\nExiting for now.\n\n" 55 | exit 56 | fi 57 | 58 | 59 | ### COMBINING MULTIPLE BRACKEN OUTPUT TABLES ### 60 | 61 | printf "\n\t${GREEN}Combining tables...${NC}\n\n" 62 | 63 | # this `helper-bit-combine-bracken.py` script was modified from the `combine_bracken_outputs.py` script provided by Jennifer Lu (jlu26@jhmi.edu) that comes with bracken 64 | 65 | # checking if there are 2 columns in input file (and therefore we are providing sample names too, otherwise base of filename is used) 66 | if grep -q '\t' ${input_file}; then 67 | 68 | helper-bit-combine-bracken.py -i $(cut -f 1 ${input_file}) -n $(cut -f 2 ${input_file} | tr "\n" "," | sed 's/,$//') -o combined-bracken.tmp 69 | 70 | else 71 | 72 | helper-bit-combine-bracken.py -i $(cut -f 1 ${input_file}) -o combined-bracken.tmp 73 | 74 | fi 75 | 76 | 77 | ### GETTING FULL LINEAGE INFO WITH TAXONKIT ### 78 | printf "\n\t${GREEN}Getting full lineage info...${NC}\n\n" 79 | tail -n +2 combined-bracken.tmp | cut -f 2 | taxonkit lineage --data-dir ${database} | taxonkit reformat --data-dir ${database} -r NA | cut -f 3 | tr ";" "\t" | cut -f 1-7 > lineages.tmp 80 | 81 | # adding a header 82 | cat <(printf "domain\tphylum\tclass\torder\tfamily\tgenus\tspecies\n") lineages.tmp > lineages-tab.tmp 83 | 84 | # combining lineage info and bracken combined-sample table 85 | paste lineages-tab.tmp <(cut -f 2- combined-bracken.tmp) > ${output_file} 86 | 87 | # clearing intermediate files 88 | rm combined-bracken.tmp lineages.tmp lineages-tab.tmp 89 | 90 | printf "\n\t\t${GREEN}DONE!${NC}\n\n" 91 | printf "\tOutput written to: $output_file\n\n" 92 | 93 | 94 | ### FINAL OUTPUT TABLE FORMAT ### 95 | 96 | ## columns in "Combined-bracken-species-taxonomy-for-other-microbes.tsv" 97 | # 1. domain 98 | # 2. phylum 99 | # 3. class 100 | # 4. order 101 | # 5. family 102 | # 6. genus 103 | # 7. species (which is genus and species in NCBI) 104 | # 8. taxonomy_id 105 | # 9. taxonomy_lvl 106 | # 10. ..._num (sample info starts here, this first one is number of reads classified) 107 | # 11. ..._frac (This one is the fraction normalized to 1 of the same sample. 108 | # Importantly, bracken only considers those classified. So this does not include unclassified reads. 109 | # They will always sum to 1, or very near 1, and there is no row currently accounting for unclassified.) 110 | # The rest of the columns are samples just like 10 and 11 above, 2 columns, first is read counts, second is fraction. 111 | 112 | # We can get "Unclassified" from the kraken report, but i didn't do that currently because those numbers don't add up to the total starting reads either (as all that was included in that run were things that were already filtered) 113 | -------------------------------------------------------------------------------- /bit/bit-combine-go-summaries: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import argparse 5 | import sys 6 | import pandas as pd 7 | 8 | parser = argparse.ArgumentParser(description='This script is for combining GO summary tables produced by\ 9 | `bit-summarize-go-annotations`. 
For version info, run `bit-version`.') 10 | 11 | 12 | required = parser.add_argument_group('required arguments') 13 | 14 | required.add_argument("-i", "--input-files", metavar = "", nargs = "+", 15 | type = str, help = "space-delimited list of `bit-summarize-go-annotations` output files", 16 | action = "store", required = True) 17 | parser.add_argument("-n", "--sample-names", metavar = "", 18 | help = 'Sample names provided as a comma-delimited list, be sure it matches the order of the input files (by default will use basename of input files up to last period)', 19 | action = "store", default = '') 20 | parser.add_argument("-o", "--output-file", metavar = "", 21 | help = 'Output combined summaries (default: "combined-GO-summaries.tsv")', 22 | action = "store", default = "combined-GO-summaries.tsv") 23 | 24 | if len(sys.argv)==1: 25 | parser.print_help(sys.stderr) 26 | sys.exit(0) 27 | 28 | args = parser.parse_args() 29 | 30 | # setting up variables 31 | sample_counts = {} 32 | total_counts = {} 33 | # file name is key, and sample name is value 34 | all_samples = {} 35 | 36 | # setting sample names and intializing counts 37 | if len(args.sample_names) == 0: 38 | for file in args.input_files: 39 | curr_sample = os.path.basename(file).rsplit('.', 1)[0] 40 | total_counts[curr_sample] = 0 41 | 42 | if file in all_samples: 43 | print('\n It seems the file "' + file + '" is trying to get in here twice.') 44 | print("\n That's not gonna fly :(\n") 45 | sys.exit(1) 46 | 47 | all_samples[file] = curr_sample 48 | 49 | else: 50 | 51 | # checking if sample names provided the length equals the number of input files 52 | if len(args.sample_names.split(",")) != len(args.input_files): 53 | print("\n It seems the number of provided sample names doesn't match the number of provided input files :(") 54 | print("\n Check usage with `bit-combine-go-summaries -h`.\n") 55 | sys.exit(0) 56 | 57 | # setting iterator 58 | i = 0 59 | 60 | for curr_sample in args.sample_names.split(","): 61 | total_counts[curr_sample] = 0 62 | all_samples[args.input_files[i]] = curr_sample 63 | i += 1 64 | 65 | # keeping a nested dictionary of info for all GO terms that show up in any table 66 | GO_dict = {} 67 | 68 | # building counts/percents table 69 | building_tab = pd.DataFrame(columns=["GO_term"]) 70 | 71 | ## working on each file 72 | for sample_key in all_samples: 73 | 74 | # reading current file into pandas dataframe 75 | curr_tab = pd.read_csv(sample_key, sep="\t") 76 | 77 | 78 | # adding to building GO dictionary of all GO terms in the input tables 79 | for row in curr_tab.itertuples(): 80 | 81 | if row[1] not in GO_dict: 82 | GO_dict[row[1]] = {'namespace': row[2], 'depth': row[3], 'name': row[4]} 83 | 84 | # trimming down current table 85 | curr_sub_tab = curr_tab[["GO_term", "counts", "percent_of_annotated"]] 86 | # and changing names to match sample ID 87 | curr_sub_tab.columns = ['GO_term', str(all_samples[sample_key]) + "_counts", str(all_samples[sample_key]) + "_perc_of_annotated"] 88 | 89 | # merging with master tab on GO_term 90 | building_tab = building_tab.merge(curr_sub_tab, on="GO_term", how="outer") 91 | 92 | ## replacing NAs with 0s 93 | building_tab = building_tab.fillna(0) 94 | 95 | ## making GO info dict into dataframe and merging into final table 96 | go_df = pd.DataFrame.from_dict(GO_dict, orient="index") 97 | # moving index to column and renaming 98 | go_df.reset_index(inplace=True) 99 | go_df.rename(columns = {'index': 'GO_term'}, inplace=True) 100 | # merging 101 | final_tab = 
go_df.merge(building_tab, on="GO_term", how="outer") 102 | # sorting 103 | final_tab.sort_values(by=["namespace", "depth"], inplace=True) 104 | 105 | ## writing out 106 | with open(args.output_file, "w") as out: 107 | out.write(final_tab.to_csv(index=False, sep="\t")) 108 | -------------------------------------------------------------------------------- /bit/bit-combine-kraken2-taxon-summaries: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | This script combines the outputs from the `bit-kraken2-to-taxon-summaries` program. 5 | """ 6 | 7 | import sys 8 | import argparse 9 | import pandas as pd 10 | import os 11 | 12 | parser = argparse.ArgumentParser(description="This script combines the outputs from the `bit-kraken2-to-taxon-summaries` program. \ 13 | For version info, run `bit-version`.") 14 | 15 | required = parser.add_argument_group('required arguments') 16 | 17 | required.add_argument("-i", "--input-files", metavar = "", nargs = "+", type = str, 18 | help = "space-delimited list of `bit-kraken2-to-taxon-summaries` output files, can be provided with shell wildcards", 19 | action = "store", required = True) 20 | parser.add_argument("-n", "--sample-names", metavar = "", 21 | help = 'Sample names provided as a comma-delimited list, be sure it matches the order of the input files (by default will use basename of input files up to last period)', 22 | action = "store", default = '') 23 | parser.add_argument("-o", "--output-file", metavar = "", 24 | help = 'Output combined summaries (default: "combined-kraken2-taxon-summaries.tsv")', 25 | action = "store", default = "combined-kraken2-taxon-summaries.tsv") 26 | 27 | if len(sys.argv)==1: 28 | parser.print_help(sys.stderr) 29 | sys.exit(0) 30 | 31 | args = parser.parse_args() 32 | 33 | # setting up variable 34 | # file name is key, and sample name is value 35 | all_samples = {} 36 | 37 | # setting sample names and intializing counts 38 | if len(args.sample_names) == 0: 39 | for file in args.input_files: 40 | curr_sample = os.path.basename(file).rsplit('.', 1)[0] 41 | 42 | if file in all_samples: 43 | print('\n It seems the file "' + file + '" is trying to get in here twice.') 44 | print("\n That's not gonna fly :(\n") 45 | sys.exit(1) 46 | 47 | all_samples[file] = curr_sample 48 | 49 | else: 50 | 51 | # checking if sample names provided the length equals the number of input files 52 | if len(args.sample_names.split(",")) != len(args.input_files): 53 | print("\n It seems the number of provided sample names doesn't match the number of provided input files :(") 54 | print("\n Check usage with `bit-combine-kraken2-taxon-summaries -h`.\n") 55 | sys.exit(0) 56 | 57 | # setting iterator 58 | i = 0 59 | 60 | for curr_sample in args.sample_names.split(","): 61 | 62 | all_samples[args.input_files[i]] = curr_sample 63 | i += 1 64 | 65 | # keeping dictionary of all taxids (keys) and full lineages (values) 66 | taxid_dict = {} 67 | 68 | # building final table 69 | building_tab = pd.DataFrame(columns=["taxid"]) 70 | 71 | ## working on each file 72 | for sample_key in all_samples: 73 | 74 | # reading current file into pandas dataframe 75 | curr_tab = pd.read_csv(sample_key, sep="\t") 76 | 77 | # adding to building taxid dictionary 78 | for row in curr_tab.itertuples(): 79 | if row[1] not in taxid_dict: 80 | taxid_dict[row[1]] = {'domain': row[2], 'phylum': row[3], 'class': row[4], 'order': row[5], 'family': row[6], 'genus': row[7], 'species': row[8]} 81 | 82 | # trimming down current 
table 83 | curr_sub_tab = curr_tab[["taxid", "read_counts", "percent_of_reads"]] 84 | 85 | # and changing count and percent column names to match sample ID 86 | curr_sub_tab.columns = ['taxid', str(all_samples[sample_key]) + "_read_counts", str(all_samples[sample_key]) + "_perc_of_reads"] 87 | 88 | # merging with master tab on taxid 89 | building_tab = building_tab.merge(curr_sub_tab, on="taxid", how="outer") 90 | 91 | ## replacing NAs with 0s 92 | building_tab = building_tab.fillna(0) 93 | 94 | ## making taxid dictionary into dataframe and merging into final table 95 | taxid_df = pd.DataFrame.from_dict(taxid_dict, orient="index") 96 | 97 | # moving index to column and renaming 98 | taxid_df.reset_index(inplace=True) 99 | taxid_df.rename(columns = {'index': 'taxid'}, inplace=True) 100 | 101 | # merging 102 | final_tab = taxid_df.merge(building_tab, on="taxid", how="outer") 103 | 104 | # sorting 105 | final_tab.sort_values(by=["taxid"], inplace=True) 106 | 107 | # changing NAs to "NA" 108 | final_tab = final_tab.fillna("NA") 109 | 110 | ## writing out 111 | with open(args.output_file, "w") as out: 112 | out.write(final_tab.to_csv(index=False, sep="\t")) 113 | 114 | -------------------------------------------------------------------------------- /bit/bit-count-bases: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | RED='\033[0;31m' 4 | NC='\033[0m' 5 | 6 | if [ "$#" == 0 ] || [ $1 == "-h" ]; then 7 | printf "\n This script returns the total number of bases (or amino acids) in a fasta file.\n" 8 | printf " For version info, run \`bit-version\`.\n\n" 9 | printf " Usage:\n\t bit-count-bases input.fasta\n\n" 10 | exit 11 | fi 12 | 13 | if [ -f $1 ]; then 14 | echo $(grep -v ">" $1 | wc | awk '{print $3-$1}') 15 | 16 | else 17 | echo -e " ${RED}Input file not found :/${NC}" >&2 18 | exit 1 19 | fi 20 | -------------------------------------------------------------------------------- /bit/bit-count-bases-per-seq: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from Bio import SeqIO 4 | import argparse 5 | import sys 6 | import os 7 | from statistics import mean, median 8 | 9 | parser = argparse.ArgumentParser(description = "This script takes a multifasta as input and returns a tab-delimited file with two columns, \ 10 | header and number of bases or amino acids for each sequence. It also \ 11 | prints out some general stats. 
For version info, run `bit-version`.") 12 | 13 | required = parser.add_argument_group('required arguments') 14 | 15 | required.add_argument("-i", "--input-fasta", metavar = "", help = "Original fasta file", action = "store", required = True) 16 | parser.add_argument("-o", "--output-file", metavar = "", help = 'Name of output tab-delimited file (default: "Num-bps.tsv")', action = "store", \ 17 | default = "Num-bps.tsv") 18 | 19 | if len(sys.argv) == 1: 20 | parser.print_help(sys.stderr) 21 | sys.exit(0) 22 | 23 | args = parser.parse_args() 24 | 25 | 26 | # starting list to hold seq lengths so we can print out some summary stats 27 | lengths_list = [] 28 | 29 | # counting number of seqs 30 | n = 0 31 | 32 | with open(args.input_fasta, "r") as in_fasta: 33 | 34 | with open(args.output_file, "w") as out_file: 35 | 36 | for seq_record in SeqIO.parse(in_fasta, "fasta"): 37 | 38 | out_file.write(seq_record.id + "\t" + str(len(seq_record.seq)) + "\n") 39 | 40 | lengths_list.append(len(seq_record.seq)) 41 | 42 | n += 1 43 | 44 | print("\n Number of seqs: " + str(n)) 45 | print(" Min. length: " + str(min(lengths_list))) 46 | print(" Max length: " + str(max(lengths_list))) 47 | print(" Mean length: " + str(round(mean(lengths_list), 2))) 48 | print(" Median length: " + str(round(median(lengths_list), 2)) + "\n") 49 | print(" All seq lengths written to: '" + args.output_file + "'\n") 50 | -------------------------------------------------------------------------------- /bit/bit-dedupe-fasta-headers: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from Bio import SeqIO 4 | import sys 5 | import argparse 6 | import os 7 | 8 | parser = argparse.ArgumentParser(description='This script will append a number to headers if that exact ID has already appeared in the fasta file. 
For version info, run `bit-version`.') 9 | 10 | required = parser.add_argument_group('required arguments') 11 | 12 | required.add_argument("-i", "--input-fasta", metavar = "", help = "Starting fasta file", 13 | action = "store", required = True) 14 | parser.add_argument("-o", "--output-fasta", metavar = "", help = 'Output fasta file (default: "Renamed.fasta").', default = "Renamed.fasta") 15 | 16 | if len(sys.argv)==1: 17 | parser.print_help(sys.stderr) 18 | sys.exit(0) 19 | 20 | args = parser.parse_args() 21 | 22 | in_fasta = open(args.input_fasta, "r") 23 | out_fasta = open(args.output_fasta, "w") 24 | 25 | ids = {} 26 | 27 | for seq_record in SeqIO.parse(in_fasta, "fasta"): 28 | 29 | if seq_record.id not in ids: 30 | ids[seq_record.id] = 1 31 | out_fasta.write(">" + seq_record.id + "\n") 32 | out_fasta.write(str(seq_record.seq) + "\n") 33 | 34 | else: 35 | count = ids[seq_record.id] + 1 36 | ids[seq_record.id] = count 37 | out_fasta.write(">" + seq_record.id + "_" + str(count) + "\n") 38 | out_fasta.write(str(seq_record.seq) + "\n") 39 | 40 | in_fasta.close() 41 | out_fasta.close() 42 | -------------------------------------------------------------------------------- /bit/bit-extract-seqs-by-coords: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from pybedtools import BedTool 4 | import sys 5 | import argparse 6 | 7 | parser = argparse.ArgumentParser(description = 'This script takes a multifasta file and a tab-delimited file specifying which \ 8 | contigs and coordinates are wanted and returns a multifasta of the chopped out \ 9 | sequences. For version info, run `bit-version`.') 10 | 11 | required = parser.add_argument_group('required arguments') 12 | 13 | required.add_argument("-i", "--input-fasta", metavar = "", help = "Starting fasta file", action = "store", required = True) 14 | required.add_argument("-b", "--bed-file", metavar = "", help = "Bed file of desired contigs and coordinates (3 columns - contig, start, end - no header, 0-based counting)", required = True) 15 | parser.add_argument("-o", "--output-fasta", metavar = "", help = 'Name of output fasta file (default: "extracted-seqs.fasta")', action = "store", default = "extracted-seqs.fasta") 16 | 17 | if len(sys.argv)==1: 18 | parser.print_help(sys.stderr) 19 | sys.exit(0) 20 | 21 | args = parser.parse_args() 22 | 23 | coordinates_file = BedTool(args.bed_file) 24 | fasta = BedTool(args.input_fasta) 25 | seq = coordinates_file.sequence(fi = fasta) 26 | 27 | with open(args.output_fasta, "w") as out_fasta: 28 | out_fasta.write(open(seq.seqfn).read()) 29 | -------------------------------------------------------------------------------- /bit/bit-fasta-to-bed: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from Bio import SeqIO 4 | import sys 5 | import argparse 6 | import os 7 | 8 | parser = argparse.ArgumentParser(description="This script takes a nucleotide multifasta and returns a tab-delimited bed file (see: https://bedtools.readthedocs.io/en/latest/content/general-usage.html). 
For version info, run `bit-version`.") 9 | 10 | required = parser.add_argument_group('required arguments') 11 | 12 | required.add_argument("-i", "--input-fasta", metavar = "", help = "input fasta file", action = "store", required = True) 13 | parser.add_argument("-o", "--output-bed-file", metavar = "", help = 'Name of output bed file (default: "Output.bed")', 14 | action = "store", default = "Output.bed") 15 | 16 | if len(sys.argv)==1: 17 | parser.print_help(sys.stderr) 18 | sys.exit(0) 19 | 20 | args = parser.parse_args() 21 | 22 | # in_fasta = open(args.input_fasta, "r") 23 | # out_file = open(args.output_file, "w") 24 | 25 | with open(args.output_bed_file, "w") as out: 26 | with open(args.input_fasta, "r") as in_fasta: 27 | 28 | for record in SeqIO.parse(in_fasta, "fasta"): 29 | name = record.name 30 | length = len(record.seq) - 1 31 | out.write(str(name) + "\t" "0" + "\t" + str(length) + "\n") 32 | -------------------------------------------------------------------------------- /bit/bit-fasta-to-genbank: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import sys 5 | from Bio import SeqIO 6 | from Bio.Alphabet import generic_dna 7 | import os 8 | 9 | parser = argparse.ArgumentParser(description = "This script takes a fasta file and converts it into genbank format. For version info, run `bit-version`.") 10 | 11 | required = parser.add_argument_group('required arguments') 12 | 13 | required.add_argument('-i', '--input-fasta', metavar = "", action = 'store', help='input fasta file', required = True) 14 | parser.add_argument("-o", "--output-genbank-file", metavar = "", action = "store", dest = "output_gb", default = "new.gb", 15 | help = 'Output genbank file (default: "new.gb")') 16 | 17 | if len(sys.argv)==1: 18 | parser.print_help(sys.stderr) 19 | sys.exit(0) 20 | 21 | args = parser.parse_args() 22 | 23 | input_fasta = open(args.input_fasta, "r") 24 | 25 | output_gb = open(args.output_gb, "w") 26 | 27 | sequences = list(SeqIO.parse(input_fasta, "fasta")) 28 | 29 | for seq in sequences: 30 | seq.seq.alphabet = generic_dna 31 | 32 | SeqIO.write(sequences, output_gb, "genbank") 33 | 34 | input_fasta.close() 35 | output_gb.close() 36 | -------------------------------------------------------------------------------- /bit/bit-filter-KOFamScan-results: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | This script filters the "detail-tsv"-formatted output of KOFamScan to retain only those above the 5 | KO-specific score threshold, and retain only the best hit for each gene. 6 | Outputs a 3-column tab-delimited table with: gene_ID, KO_ID, and KO_annotation 7 | 8 | KOFamScan e.g. usage prior to input here: 9 | exec_annotation -p profiles/ -k ko_list --cpu 15 -f detail-tsv -o 5492-KO-tab.tmp 5492-genes.faa --tmp-dir 5492-tmp-KO --report-unannotated 10 | 11 | Then would be: 12 | bit-filter-KOFamScan-results -i 5492-KO-tab.tmp -o 5492-annotations.tsv 13 | """ 14 | 15 | import sys 16 | import argparse 17 | import pandas as pd 18 | 19 | parser = argparse.ArgumentParser(description = "This script filters the 'detail-tsv'-formatted output file from KOFamScan to retain only those above the KO-specific score threshold, and retains only the hit with the lowest e-value for each gene if there are multiple. It outputs a 3-column tab-delimited file with: gene_ID, KO_ID, and KO_annotation. 
For version info, run `bit-version`") 20 | 21 | required = parser.add_argument_group('required arguments') 22 | 23 | required.add_argument("-i", "--input-file", help = "Input annotation table", metavar = "", action = "store", required = True) 24 | parser.add_argument("-o", "--output-file", metavar = "", help = 'Output table filename (default: "output.tsv")', action = "store", default = "output.tsv") 25 | 26 | if len(sys.argv)==1: 27 | parser.print_help(sys.stderr) 28 | sys.exit(0) 29 | 30 | args = parser.parse_args() 31 | 32 | # initializing dictionaries 33 | annot_dict = {} 34 | e_value_dict = {} 35 | 36 | # looping through input file 37 | # input table looks like this: 38 | 39 | """ 40 | # gene name KO thrshld score E-value "KO definition" 41 | # --------- ------ ------- ------ --------- ------------- 42 | k119_6520_1 K01999 126.63 44.1 6.7e-11 "branched-chain amino acid transport system substrate-binding protein" 43 | k119_6520_1 K11954 433.33 39.0 1.8e-09 "neutral amino acid transport system substrate-binding protein" 44 | k119_6520_1 K04615 290.13 24.9 1.7e-05 "gamma-aminobutyric acid type B receptor" 45 | k119_6520_1 K11959 388.40 18.0 0.0035 "urea transport system substrate-binding protein" 46 | k119_6520_1 K05387 467.63 11.1 0.25 "glutamate receptor, ionotropic, plant" 47 | k119_19560_1 K02014 164.80 144.6 2.1e-41 "iron complex outermembrane recepter protein" 48 | k119_19560_1 K15721 575.63 122.5 7.3e-35 "pesticin/yersiniabactin receptor" 49 | k119_19560_1 K16090 578.13 90.5 3.3e-25 "catecholate siderophore receptor" 50 | """ 51 | 52 | with open(args.input_file, "r") as annots: 53 | 54 | for line in annots: 55 | 56 | if line.startswith("#"): 57 | continue 58 | 59 | line = line.lstrip("*").strip().split("\t") 60 | 61 | # adding gene ID if not present, to ensure all end up in final table 62 | if line[0] not in annot_dict: 63 | annot_dict[line[0]] = {"KO_ID":"NA", "KO_function":"NA"} 64 | 65 | # nothing there if no annotations for current gene, skipping 66 | if len(line) == 1: 67 | continue 68 | 69 | else: 70 | 71 | # only considering if its score is above the threshold 72 | # some, though very few, like K15869, don't have a threshold score due to having too few representatives, so if no threshold, just taking 73 | if line[2] == "" or float(line[3]) > float(line[2]): 74 | 75 | # adding to e_value_dict if not represented already, adding annotation to annot_dict, and moving on 76 | if line[0] not in e_value_dict: 77 | 78 | annot_dict[line[0]] = {"KO_ID":line[1], "KO_function":line[5].strip('"')} 79 | e_value_dict[line[0]] = line[4] 80 | continue 81 | 82 | else: 83 | 84 | # replacing current annotation only if e-value is lower than current 85 | if float(line[4]) < float(e_value_dict[line[0]]): 86 | 87 | annot_dict[line[0]] = {"KO_ID":line[1], "KO_function":line[5].strip('"')} 88 | e_value_dict[line[0]] = line[4] 89 | 90 | annot_tab = pd.DataFrame.from_dict(annot_dict, orient="index") 91 | annot_tab.reset_index(inplace=True) 92 | annot_tab.rename(columns = {'index':'gene_ID'}, inplace=True) 93 | 94 | with open(args.output_file, "w") as out: 95 | out.write(annot_tab.to_csv(index=False, sep="\t")) 96 | -------------------------------------------------------------------------------- /bit/bit-filter-seqs-by-length: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from Bio import SeqIO 4 | import argparse 5 | import sys 6 | import os 7 | 8 | parser = argparse.ArgumentParser(description="This script takes a multifasta as input 
and filters out sequences based on length. For version info, run `bit-version`.") 9 | 10 | required = parser.add_argument_group('required arguments') 11 | 12 | required.add_argument("-i", "--input-fasta", help = "Original fasta file", metavar = "", action = "store", required = True) 13 | required.add_argument("-m", "--min-length", metavar = "", help = "minimum length retained", action = "store", dest = "min_len", required = True) 14 | parser.add_argument("-M", "--max-length", metavar = "", help = "maximum length retained", action = "store", dest = "max_len", default = "9223372036854775807") 15 | parser.add_argument("-o", "--output-file", metavar = "", help = 'Name of output fasta file (default: "filtered.fasta")', action = "store", default = "filtered.fasta") 16 | 17 | if len(sys.argv)==1: 18 | parser.print_help(sys.stderr) 19 | sys.exit(0) 20 | 21 | args = parser.parse_args() 22 | 23 | in_fasta = open(args.input_fasta, "r") 24 | out_file = open(args.output_file, "w") 25 | min_len = args.min_len 26 | max_len = args.max_len 27 | 28 | total=0 29 | kept=0 30 | 31 | for seq_record in SeqIO.parse(in_fasta, "fasta"): 32 | 33 | total+=1 34 | 35 | if len(seq_record.seq) >= int(min_len) and len(seq_record.seq) <= int(max_len): 36 | 37 | kept+=1 38 | out_file.write(">" + str(seq_record.description) + "\n" + str(seq_record.seq) + "\n") 39 | 40 | 41 | perc = round(float(kept) / float(total) * 100, 2) 42 | print("\n\tRetained " + str(kept) + " sequences of the initial " + str(total) + " (" + str(perc) + "%).\n") 43 | 44 | in_fasta.close() 45 | out_file.close() 46 | -------------------------------------------------------------------------------- /bit/bit-filter-table: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import gzip 5 | 6 | parser = argparse.ArgumentParser(description = 'Ad hoc script for filtering a table based on values in a specified column. 
For version info, run `bit-version`.') 7 | 8 | required = parser.add_argument_group('required arguments') 9 | 10 | required.add_argument("-i", "--input-table", metavar = "", help = 'Input table', action = "store", dest = "in_tab", required = True) 11 | required.add_argument("-w", "--wanted-values", metavar = "", help = 'Wanted values', action = "store", dest = "wanted", required = True) 12 | 13 | parser.add_argument("-o", "--output-file", metavar = "", help='Output table filename (default: "Output.tsv")', action = "store", dest = "out_tab", default = "Output.tsv") 14 | parser.add_argument("-d", "--delimiter", metavar = "", help = 'Delimiter (default: "\\t")', action = "store", default = "\t") 15 | parser.add_argument("-c", "--column", metavar = "", help = 'Index of column to filter on (default: 1)', action = "store", default = 1, type = int) 16 | parser.add_argument("--no-header", help='Add if there is no header', action = "store_true") 17 | parser.add_argument("--gz", help = 'Add if the input is gzipped (output will not be)', action = "store_true") 18 | 19 | args = parser.parse_args() 20 | 21 | targets = set(line.strip() for line in open(args.wanted)) 22 | 23 | output = open(args.out_tab, "w") 24 | 25 | if not args.gz: 26 | input = open(args.in_tab, "r") 27 | else: 28 | input = gzip.open(args.in_tab, "rt") 29 | 30 | target_column = args.column - 1 31 | 32 | 33 | with open(args.out_tab, "w") as output: 34 | 35 | if not args.no_header: 36 | # only doing this firstline variable because i can't figure out a better way to just print the first line when the header is included (and still be just iterating over the file contents) 37 | firstline = True 38 | 39 | for line in input: 40 | 41 | if firstline: 42 | output.write(line) 43 | firstline = False 44 | continue 45 | 46 | split_line = line.strip().split(args.delimiter) 47 | 48 | if split_line[target_column] in targets: 49 | output.write(line) 50 | 51 | else: 52 | 53 | for line in input: 54 | 55 | split_line = line.strip().split(args.delimiter) 56 | 57 | if split_line[target_column] in targets: 58 | output.write(line) 59 | 60 | input.close() 61 | -------------------------------------------------------------------------------- /bit/bit-gen-iToL-binary-dataset: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import argparse 5 | import os 6 | 7 | parser = argparse.ArgumentParser(description='This script is for creating a standard iToL binary dataset. For version info, run `bit-version`.') 8 | 9 | required = parser.add_argument_group('required arguments') 10 | 11 | required.add_argument("-g", "--target-genomes", metavar = "", help = 'Single-column file with the genomes to color (need to match the IDs in the tree file)', action = "store", required = True) 12 | parser.add_argument("-d", "--dataset-label", metavar = "", help = 'Label of the dataset (default: "data")', action = "store", default = "data") 13 | parser.add_argument("-s", "--shape-to-add", metavar = "", help = 'Shape to add, must be one of: "square", "circle", "star", "rtriangle", "ltriangle", or "check" (default: "square")', action = "store", dest = "shape", default = "square") 14 | parser.add_argument("-c", "--color", metavar = "", help='Color to use of either: "blue", "green", "red", "purple", or "black" (default: "blue", of course, \'cause it\'s the best)', action = "store", default = "blue") 15 | parser.add_argument("-H", "--height-factor", metavar = "", help = 'Increase or decrease symbol size. 
Values below 1 will decrease the standard size, above 1 will increase it (default: "1")', action = "store", dest = "height", default = "1") 16 | 17 | parser.add_argument("-o", "--output-file", metavar = "", help = 'Output file for iToL (default: "iToL-binary-dataset.txt")', action = "store", default = "iToL-binary-dataset.txt") 18 | 19 | if len(sys.argv)==1: 20 | parser.print_help(sys.stderr) 21 | sys.exit(0) 22 | 23 | args = parser.parse_args() 24 | 25 | if args.color == "blue": 26 | col = "#434da7" 27 | elif args.color == "green": 28 | col = "#48a743" 29 | elif args.color == "red": 30 | col = "#c01820" 31 | elif args.color == "purple": 32 | col = "#512f9c" 33 | elif args.color == "black": 34 | col = "#000000" 35 | else: 36 | print("\n\tSorry, we're not prepared to handle \"" + str(args.color) + "\" as the color... :(\n") 37 | parser.print_help(sys.stderr) 38 | sys.exit(1) 39 | 40 | if args.shape not in ["square", "circle", "star", "rtriangle", "ltriangle", "check"]: 41 | print("\n\tSorry, we're not prepared to handle \"" + str(args.shape) + "\" as the argument for what shape to use... :(\n") 42 | parser.print_help(sys.stderr) 43 | sys.exit(1) 44 | 45 | try: 46 | height = float(args.height) 47 | except ValueError: 48 | print("\n\tSorry, " + str(args.height) + " doesn't appear to be a number... :(\n") 49 | parser.print_help(sys.stderr) 50 | sys.exit(1) 51 | 52 | target_list = [] 53 | 54 | with open(args.target_genomes, "r") as target_genomes: 55 | for genome in target_genomes: 56 | target_list.append(genome.strip()) 57 | 58 | out_file = open(args.output_file, "w") 59 | 60 | out_file.write("DATASET_BINARY\nSEPARATOR TAB\n\n") 61 | 62 | # setting DATASET_LABEL 63 | out_file.write("DATASET_LABEL" + "\t" + str(args.dataset_label) + "\n\n") 64 | 65 | # setting dataset main color 66 | out_file.write("COLOR\t" + str(col) + "\n\n") 67 | 68 | # setting FIELD_LABELS 69 | out_file.write("FIELD_LABELS\tf1\n\n") 70 | 71 | # setting FIELD_SHAPES 72 | 73 | if args.shape == "square": 74 | shape = "1" 75 | elif args.shape == "circle": 76 | shape = "2" 77 | elif args.shape == "star": 78 | shape = "3" 79 | elif args.shape == "rtriangle": 80 | shape = "4" 81 | elif args.shape == "ltriangle": 82 | shape = "5" 83 | else: 84 | shape = "6" 85 | 86 | out_file.write("FIELD_SHAPES\t" + str(shape) + "\n\n") 87 | 88 | # writing out FIELD_COLORS 89 | out_file.write("FIELD_COLORS\t" + str(col) + "\n\n") 90 | 91 | # writing out HEIGHT_FACTOR 92 | out_file.write("HEIGHT_FACTOR\t" + str(height) + "\n\n") 93 | 94 | # writing lines for each labels 95 | out_file.write("DATA\n") 96 | 97 | for target in target_list: 98 | out_file.write(str(target) + "\t" + str(shape) + "\n") 99 | 100 | out_file.close() 101 | -------------------------------------------------------------------------------- /bit/bit-gen-iToL-colorstrip: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import argparse 4 | import os 5 | 6 | parser = argparse.ArgumentParser(description = 'This script is for creating a standard iToL colorstrip dataset file when given the IDs of the genomes we want to color (formatting comes from iToL help page here: https://itol.embl.de/help/dataset_color_strip_template.txt). 
For version info, run `bit-version`.') 7 | 8 | required = parser.add_argument_group('required arguments') 9 | 10 | required.add_argument("-g", "--target-genomes", metavar = "", help = 'Single-column file with the genomes to color (need to match the IDs in the tree file, with no ">")', action = "store", required = True) 11 | parser.add_argument("-l", "--label", metavar = "", help = 'Label used in the legend table (default: "label1")', action = "store", default = "label1") 12 | parser.add_argument("-c", "--color", metavar = "", help = 'Color to use, pre-baked options include "blue", "green", "red", "purple", or "black", or can provide the hexcode (default: "blue", of course, \'cause it\'s the best)', action = "store", default = "blue") 13 | parser.add_argument("-w", "--width", metavar = "", help = 'width of the colorstrip (default: 25)', action = "store", default = "25") 14 | parser.add_argument("--color-branches-too", help = "Add this flag if wanting to color branches also", action = "store_true") 15 | parser.add_argument("-o", "--output-file", metavar = "", help = 'Output file for iToL (default: "iToL-colorstrip.txt")', action = "store", default = "iToL-colorstrip.txt") 16 | 17 | 18 | if len(sys.argv)==1: 19 | parser.print_help(sys.stderr) 20 | sys.exit(0) 21 | 22 | args = parser.parse_args() 23 | 24 | if args.color == "blue": 25 | col = "#434da7" 26 | elif args.color == "green": 27 | col = "#48a743" 28 | elif args.color == "red": 29 | col = "#c01820" 30 | elif args.color == "purple": 31 | col = "#512f9c" 32 | elif args.color == "black": 33 | col = "#000000" 34 | else: 35 | if not args.color.startswith("#"): 36 | print("\n\tSorry, we're not prepared to handle \"" + str(args.color) + "\" as the color... :(\n") 37 | parser.print_help(sys.stderr) 38 | sys.exit(1) 39 | else: 40 | col = args.color 41 | 42 | target_list = [] 43 | 44 | with open(args.target_genomes, "r") as target_genomes: 45 | for genome in target_genomes: 46 | target_list.append(genome.strip()) 47 | 48 | out_file = open(args.output_file, "w") 49 | 50 | out_file.write("DATASET_COLORSTRIP" + "\n" + "SEPARATOR TAB" + "\n\n" + "DATASET_LABEL" + "\t" + str(args.label) + "\n" + "COLOR" + "\t" + str(col) + "\n\n") 51 | 52 | if args.color_branches_too: 53 | out_file.write("COLOR_BRANCHES\t1\n\n") 54 | else: 55 | out_file.write("COLOR_BRANCHES\t0\n\n") 56 | 57 | out_file.write("STRIP_WIDTH" + "\t" + str(args.width) + "\n\n") 58 | 59 | out_file.write("BORDER_WIDTH" + "\t" + "1" + "\n") 60 | out_file.write("BORDER_COLOR" + "\t" + "#999999" + "\n\n") 61 | 62 | out_file.write("DATA\n\n") 63 | 64 | # writing out primary data lines 65 | for target in target_list: 66 | out_file.write(str(target) + "\t" + str(col) + "\t" + str(args.label) + "\n") 67 | 68 | out_file.close() 69 | -------------------------------------------------------------------------------- /bit/bit-gen-iToL-map: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import argparse 5 | import os 6 | 7 | parser = argparse.ArgumentParser(description = 'This script is for creating a standard iToL "label" and/or "branch" color file when given the IDs of the genomes you want to color. 
For version info, run `bit-version`.') 8 | 9 | required = parser.add_argument_group('required arguments') 10 | 11 | required.add_argument("-g", "--target-genomes", metavar = "", help = 'Single-column file with the genomes to color (need to match the IDs in the tree file, with no ">")', action = "store", required = True) 12 | parser.add_argument("-w", "--what-to-color", metavar = "", help = 'What to color, must be: "branches", "labels", or "both" (default: "both")', action = "store", dest = "to_color", default = "both") 13 | parser.add_argument("-c", "--color", metavar = "", help = 'Color to use, pre-baked options include "blue", "green", "red", "purple", or "black", or can provide the hexcode (default: "blue", of course, \'cause it\'s the best)', action = "store", dest = "color", default = "blue") 14 | parser.add_argument("-l", "--line-weight", metavar = "", help = 'Line weight if coloring branches (default: "2")', action = "store", default = "2") 15 | parser.add_argument("-o", "--output-file", help = 'Output file for iToL (default: "iToL-colors.txt")', action = "store", default = "iToL-colors.txt") 16 | 17 | if len(sys.argv)==1: 18 | parser.print_help(sys.stderr) 19 | sys.exit(0) 20 | 21 | args = parser.parse_args() 22 | 23 | if args.color == "blue": 24 | col = "#434da7" 25 | elif args.color == "green": 26 | col = "#48a743" 27 | elif args.color == "red": 28 | col = "#c01820" 29 | elif args.color == "purple": 30 | col = "#512f9c" 31 | elif args.color == "black": 32 | col = "#000000" 33 | else: 34 | if not args.color.startswith("#"): 35 | print("\n\tSorry, we're not prepared to handle \"" + str(args.color) + "\" as the color... :(\n") 36 | parser.print_help(sys.stderr) 37 | sys.exit(1) 38 | else: 39 | col = args.color 40 | 41 | if args.to_color not in ["both", "branches", "labels"]: 42 | print("\n\tSorry, we're not prepared to handle \"" + str(args.to_color) + "\" as the argument for what to color... :(\n") 43 | parser.print_help(sys.stderr) 44 | sys.exit(1) 45 | 46 | try: 47 | line_weight = float(args.line_weight) 48 | except ValueError: 49 | print("\n\tSorry, " + str(args.line_weight) + " doesn't appear to be a number... :(\n") 50 | parser.print_help(sys.stderr) 51 | sys.exit(1) 52 | 53 | target_list = [] 54 | 55 | with open(args.target_genomes, "r") as target_genomes: 56 | for genome in target_genomes: 57 | target_list.append(genome.strip()) 58 | 59 | out_file = open(args.output_file, "w") 60 | 61 | out_file.write("TREE_COLORS\nSEPARATOR TAB\nDATA\n\n") 62 | 63 | # writing lines for coloring labels if needed 64 | if args.to_color in ["both", "labels"]: 65 | 66 | for target in target_list: 67 | out_file.write(str(target) + "\tlabel\t" + str(col) + "\tbold\n") 68 | 69 | # writing lines for coloring branches if needed 70 | if args.to_color in ["both", "branches"]: 71 | 72 | for target in target_list: 73 | out_file.write(str(target) + "\tbranch\t" + str(col) + "\tnormal\t" + str(line_weight) + "\n") 74 | 75 | out_file.close() 76 | -------------------------------------------------------------------------------- /bit/bit-gen-iToL-text-dataset: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import argparse 5 | import os 6 | 7 | parser = argparse.ArgumentParser(description = 'This script is for creating a standard iToL text dataset. 
For version info, run `bit-version`.') 8 | 9 | required = parser.add_argument_group('required arguments') 10 | 11 | required.add_argument("-g", "--target-genomes", metavar = "", help = 'Single-column file with the genomes to color (need to match the IDs in the tree file)', action = "store", required = True) 12 | required.add_argument("-l", "--text-to-add", metavar = "", help = 'Text to add to the target genomes', action = "store", dest = "text", required = True) 13 | parser.add_argument("-c", "--color", metavar = "", help = 'Color to use of either: "blue", "green", "red", "purple", or "black" (default: "blue", of course, \'cause it\'s the best)', action = "store", default = "blue") 14 | 15 | parser.add_argument("-o", "--output-file", metavar = "", help = 'Output file for iToL (default: "iToL-text-dataset.txt")', action = "store", default = "iToL-text-dataset.txt") 16 | 17 | if len(sys.argv)==1: 18 | parser.print_help(sys.stderr) 19 | sys.exit(0) 20 | 21 | args = parser.parse_args() 22 | 23 | if args.color == "blue": 24 | col = "#434da7" 25 | elif args.color == "green": 26 | col = "#48a743" 27 | elif args.color == "red": 28 | col = "#c01820" 29 | elif args.color == "purple": 30 | col = "#512f9c" 31 | elif args.color == "black": 32 | col = "#000000" 33 | else: 34 | print("\n\tSorry, we're not prepared to handle \"" + str(args.color) + "\" as the color... :(\n") 35 | parser.print_help(sys.stderr) 36 | sys.exit(1) 37 | 38 | target_list = [] 39 | 40 | with open(args.target_genomes, "r") as target_genomes: 41 | for genome in target_genomes: 42 | target_list.append(genome.strip()) 43 | 44 | out_file = open(args.output_file, "w") 45 | 46 | out_file.write("DATASET_TEXT\nSEPARATOR TAB\n\n") 47 | 48 | # setting DATASET_LABEL 49 | out_file.write("DATASET_LABEL\tdata\n\n") 50 | 51 | # setting dataset main color 52 | out_file.write("COLOR\t" + str(col) + "\n\n") 53 | 54 | # writing lines for each labels 55 | out_file.write("DATA\n") 56 | 57 | for target in target_list: 58 | out_file.write(str(target) + "\t" + str(args.text) + "\t" + "-1" + "\t" + str(col) + "\t" + "normal" + "\t" + "1" + "\t" + "0" + "\n") 59 | 60 | out_file.close() 61 | -------------------------------------------------------------------------------- /bit/bit-genbank-locus-clean-slate: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import warnings 4 | warnings.filterwarnings("ignore") 5 | 6 | from Bio import SeqIO 7 | import argparse 8 | import sys 9 | import os 10 | import subprocess 11 | 12 | parser = argparse.ArgumentParser(description = "Clean slate for LOCUS names in genbank files that are problematic (can be the case, for example, if annotated by NCBI but not officially released yet). This is only helpful if the original LOCUS names don't matter to us, of course. For version info, run `bit-version`.") 13 | 14 | required = parser.add_argument_group('required arguments') 15 | 16 | required.add_argument("-i", "--input-gb", metavar = "", help = 'Input Genbank file (e.g. 
"*.gbk", "*.gb", "*.gbff")', action = "store", required = True) 17 | parser.add_argument("-w", "--wanted-name", metavar = "", help = 'New locus name prefix (default: "Unknown")', action = "store", default = "Unknown") 18 | parser.add_argument("-o", "--output-gb", metavar = "", help = 'Output genbank file (default: "clean.gb")', action = "store", default = "clean.gb") 19 | 20 | if len(sys.argv)==1: 21 | parser.print_help(sys.stderr) 22 | sys.exit(0) 23 | 24 | args = parser.parse_args() 25 | 26 | tmp_file = args.input_gb + ".tmp" 27 | new_name = args.wanted_name 28 | 29 | tmp = open(tmp_file, "w") 30 | 31 | subprocess.call(['sed', 's/^LOCUS.*$/LOCUS noname 0 bp DNA linear BCT 00-MIK-0000/', args.input_gb], stdout=tmp) 32 | tmp.close() 33 | 34 | output_gb = open(args.output_gb, "w") 35 | 36 | recs = [rec for rec in SeqIO.parse(args.input_gb + ".tmp", "genbank")] 37 | 38 | num = 0 39 | 40 | for rec in recs: 41 | num += 1 42 | rec.name = new_name + "_" + str(num) 43 | 44 | output_gb.write(rec.format("genbank")) 45 | 46 | output_gb.close() 47 | os.remove(tmp_file) 48 | -------------------------------------------------------------------------------- /bit/bit-genbank-to-AA-seqs: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from Bio import SeqIO 4 | import argparse 5 | import re 6 | import sys 7 | import os 8 | 9 | parser = argparse.ArgumentParser(description = "This script takes a genbank file and returns amino acid sequences for all coding sequences. For version info, run `bit-version`.") 10 | 11 | required = parser.add_argument_group('required arguments') 12 | 13 | required.add_argument("-i", "--input-gb", metavar = "", help = 'input Genbank file (e.g. "*.gbk", "*.gb", "*.gbff")', action = "store", required = True) 14 | parser.add_argument("-o", "--output-fasta", metavar = "", help = 'Output fasta file (default: "output.faa")', action = "store", default = "output.faa") 15 | 16 | if len(sys.argv)==1: 17 | parser.print_help(sys.stderr) 18 | sys.exit(0) 19 | 20 | args = parser.parse_args() 21 | 22 | input_gb = open(args.input_gb, "r") 23 | 24 | output_fasta = open(args.output_fasta, "w") 25 | 26 | recs = [rec for rec in SeqIO.parse(input_gb, "genbank")] 27 | 28 | note_terms_to_exclude = ["frameshifted", "internal stop", "incomplete"] # dumping gene if noted as these in the "note" section of the call to keep only complete genes 29 | location_terms_to_exclude = ["join", "<", ">"] # dumping gene if "location" section contains any of these: "join" means the gene call spans multiple contigs; "<" or ">" means the gene call runs off a contig 30 | 31 | for rec in recs: 32 | 33 | genes = [gene for gene in rec.features if gene.type =="CDS"] # focusing on features annotated as "CDS" 34 | 35 | for gene in genes: 36 | 37 | location = str(gene.location) 38 | 39 | # dumping gene if "location" section contains any of these terms set above: "join" means the gene call spans multiple contigs; "<" or ">" means the gene call runs off a contig 40 | if any(exclusion_term in location for exclusion_term in location_terms_to_exclude): 41 | continue 42 | 43 | if "note" in gene.qualifiers: 44 | note = str(gene.qualifiers["note"][0]) 45 | else: 46 | note = "" 47 | 48 | # dumping gene if noted as any of these in the "note" section set above 49 | if any(exclusion_term in note for exclusion_term in note_terms_to_exclude): 50 | continue 51 | 52 | # dumping if overlapping translation frame 53 | if "transl_except" in gene.qualifiers: 54 | continue 55 | 56 | # 
dumping if noted a pseudo gene 57 | if "pseudo" in gene.qualifiers: 58 | continue 59 | 60 | if "locus_tag" in gene.qualifiers: 61 | locus_tag = str(gene.qualifiers["locus_tag"][0]) 62 | else: 63 | locus_tag = "No_locus_tag" 64 | 65 | if "protein_id" in gene.qualifiers: 66 | protein_id = str(gene.qualifiers["protein_id"][0]) 67 | else: 68 | protein_id = "No_protein_id" 69 | 70 | if "product" in gene.qualifiers: 71 | product = str(gene.qualifiers["product"][0]) 72 | else: 73 | product = "No_product" 74 | 75 | output_fasta.write(f">{product}_{locus_tag}_{protein_id}\n{gene.qualifiers['translation'][0]}\n") 76 | 77 | input_gb.close() 78 | output_fasta.close() 79 | -------------------------------------------------------------------------------- /bit/bit-genbank-to-cds-table: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import argparse 5 | import pandas as pd 6 | from Bio import SeqIO 7 | 8 | parser = argparse.ArgumentParser(description = "This script takes a genbank-formatted file and extracts basic info \ 9 | for every 'CDS' feature including: 'gene', 'protein_id', 'locus_tag', and 'product'. \ 10 | It then writes those out to a tab-delimited file. For version info, run `bit-version`.", 11 | epilog="Ex. usage: bit-genbank-to-cds-table -i input.gb -o output.tsv") 12 | 13 | required = parser.add_argument_group('REQUIRED PARAMETERS') 14 | optional = parser.add_argument_group('OPTIONAL PARAMETERS') 15 | 16 | required.add_argument("-i", "--input-gb", help = "input genbank file", 17 | metavar = "", required = True) 18 | 19 | optional.add_argument("-o", "--output-tsv", help = 'output tsv (default: "output.tsv")', action = "store", 20 | metavar = "", default = "output.tsv") 21 | 22 | if len(sys.argv)==1: 23 | parser.print_help(sys.stderr) 24 | sys.exit(0) 25 | 26 | args = parser.parse_args() 27 | 28 | ################################################################################ 29 | 30 | def main(): 31 | 32 | cds_dataframe = parse_genbank_cds_to_dataframe(args.input_gb) 33 | 34 | save_dataframe_to_tsv(cds_dataframe, args.output_tsv) 35 | 36 | print(f"\n CDS table written to '{args.output_tsv}'!\n") 37 | 38 | ################################################################################ 39 | 40 | 41 | def parse_genbank_cds_to_dataframe(file_path): 42 | cds_entries = [] 43 | 44 | with open(file_path, "r") as handle: 45 | 46 | for record in SeqIO.parse(handle, "genbank"): 47 | 48 | for feature in record.features: 49 | if feature.type == "CDS": 50 | 51 | gene = feature.qualifiers.get("gene", ["NA"])[0] 52 | locus_tag = feature.qualifiers.get("locus_tag", ["NA"])[0] 53 | product = feature.qualifiers.get("product", ["NA"])[0] 54 | protein_id = feature.qualifiers.get("protein_id", ["NA"])[0] 55 | 56 | cds_entries.append({ 57 | "gene": gene, 58 | "protein_id": protein_id, 59 | "locus_tag": locus_tag, 60 | "product": product, 61 | }) 62 | 63 | cds_df = pd.DataFrame(cds_entries) 64 | 65 | return cds_df 66 | 67 | 68 | def save_dataframe_to_tsv(df, output_file): 69 | 70 | df.to_csv(output_file, sep = '\t', index = False) 71 | 72 | ################################################################################ 73 | 74 | if __name__ == "__main__": 75 | main() -------------------------------------------------------------------------------- /bit/bit-genbank-to-fasta: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from Bio import SeqIO 4 | import argparse 5 | 
import sys 6 | import os 7 | 8 | parser = argparse.ArgumentParser(description="Parse nucleotide sequences from GenBank file into fasta file. For version info, run `bit-version`.") 9 | 10 | required = parser.add_argument_group('required arguments') 11 | 12 | required.add_argument("-i", "--input-gb", metavar = "", help = 'input Genbank file (e.g. "*.gbk", "*.gb", "*.gbff")', action = "store", required = True) 13 | parser.add_argument("-o", "--output-fasta", metavar = "", help = 'Output fasta file (default: "genbank.fa")', action = "store", default = "genbank.fa") 14 | 15 | if len(sys.argv)==1: 16 | parser.print_help(sys.stderr) 17 | sys.exit(0) 18 | 19 | args = parser.parse_args() 20 | 21 | input_gb = open(args.input_gb, "r") 22 | 23 | output = open(args.output_fasta, "w") 24 | 25 | recs = [rec for rec in SeqIO.parse(input_gb, "genbank")] 26 | 27 | for rec in recs: 28 | output.write(">" + rec.name + "\n" + str(rec.seq) + "\n") 29 | 30 | input_gb.close() 31 | output.close() 32 | -------------------------------------------------------------------------------- /bit/bit-get-cov-stats: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import sys 5 | from pathlib import Path 6 | import gzip 7 | from dataclasses import dataclass, field 8 | from Bio import SeqIO #type: ignore 9 | 10 | parser = argparse.ArgumentParser( 11 | description="This script generates whole-reference detection and coverage info\ 12 | for specified references given the reference fasta(s) and a mosdepth-produced\ 13 | *per-base.bed.gz file. For version info, run `bit-version`.", 14 | epilog="Ex. usage: bit-get-cov-stats -r reference.fasta -b asm-per-base.bed.gz", 15 | ) 16 | required = parser.add_argument_group("REQUIRED PARAMETERS") 17 | optional = parser.add_argument_group("OPTIONAL PARAMETERS") 18 | 19 | required.add_argument( 20 | "-r", 21 | "--reference-fastas", 22 | metavar="", 23 | help='Path to reference fasta file(s)', 24 | required=True, 25 | nargs="+", 26 | ) 27 | required.add_argument( 28 | "-b", 29 | "--bed-file", 30 | metavar="", 31 | help="Path to mosdepth-produced *per-base.bed.gz file", 32 | required=True, 33 | ) 34 | optional.add_argument( 35 | "-o", 36 | "--outpath", 37 | metavar="", 38 | help='Name of the output file (default: "coverage-stats.tsv")', 39 | default="coverage-stats.tsv", 40 | ) 41 | 42 | 43 | def main(reference_fastas, bed_file, outpath): 44 | 45 | preflight_checks(reference_fastas, bed_file) 46 | 47 | refs = parse_refs(reference_fastas) 48 | 49 | refs = parse_bed_file(refs, bed_file) 50 | 51 | generate_output(refs, outpath) 52 | 53 | 54 | def preflight_checks(reference_fastas, bed_file): 55 | paths_list = reference_fastas + [bed_file] 56 | check_files_are_found(paths_list) 57 | 58 | 59 | def check_files_are_found(paths_list): 60 | for path in paths_list: 61 | if not Path(path).is_file(): 62 | print(f"\n We were not able to find the input file: {path}") 63 | notify_premature_exit() 64 | 65 | 66 | def notify_premature_exit(): 67 | print("\n Exiting for now :(\n") 68 | sys.exit(1) 69 | 70 | 71 | def parse_refs(reference_fastas): 72 | refs = [] 73 | for fasta in reference_fastas: 74 | ref = RefData(fasta) 75 | ref.load_fasta() 76 | refs.append(ref) 77 | 78 | return refs 79 | 80 | 81 | @dataclass 82 | class RefData: 83 | path: str 84 | headers: set = field(default_factory=set) 85 | total_length: int = 0 86 | total_coverage_count: int = 0 87 | total_bases_detected_at_all: int = 0 88 | total_bases_detected_at_10x: int 
= 0 89 | 90 | def load_fasta(self): 91 | with open(self.path, "r") as f: 92 | for record in SeqIO.parse(f, "fasta"): 93 | self.headers.add(record.id) 94 | self.total_length += len(record.seq) 95 | 96 | def update_from_bed_line(self, header: str, start: int, end: int, num_reads: int): 97 | if header in self.headers: 98 | cur_range_covered = end - start 99 | self.total_coverage_count += num_reads * cur_range_covered 100 | if num_reads > 0: 101 | self.total_bases_detected_at_all += cur_range_covered 102 | if num_reads >= 10: 103 | self.total_bases_detected_at_10x += cur_range_covered 104 | 105 | def compute_metrics(self): 106 | detection = round(self.total_bases_detected_at_all / self.total_length, 4) 107 | detection_at_10x = round(self.total_bases_detected_at_10x / self.total_length, 4) 108 | average_coverage = round(self.total_coverage_count / self.total_length, 4) 109 | return detection, detection_at_10x, average_coverage 110 | 111 | 112 | def parse_bed_file(refs, bed_file): 113 | 114 | with gzip.open(bed_file, "rt") as f: 115 | for line in f: 116 | header, start, end, num_reads = line.strip().split("\t") 117 | for ref in refs: 118 | ref.update_from_bed_line(header, int(start), int(end), int(num_reads)) 119 | 120 | return(refs) 121 | 122 | 123 | def generate_output(refs, outpath): 124 | with open(outpath, "w") as f: 125 | f.write("Ref\tDetection\tDetection_at_10x\tAverage_coverage\n") 126 | for ref in refs: 127 | detection, detection_at_10x, average_coverage = ref.compute_metrics() 128 | f.write(f"{ref.path}\t{detection}\t{detection_at_10x}\t{average_coverage}\n") 129 | 130 | 131 | if __name__ == "__main__": 132 | if len(sys.argv) == 1: # pragma: no cover 133 | parser.print_help(sys.stderr) 134 | sys.exit(0) 135 | args = parser.parse_args() 136 | 137 | main( 138 | args.reference_fastas, 139 | args.bed_file, 140 | args.outpath, 141 | ) 142 | -------------------------------------------------------------------------------- /bit/bit-get-go-term-info: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ## lots of this is from this great tutorial: GO Tutorial in Python - Solutions.ipynb, which comes from here: http://gohandbook.org/doku.php ; https://nbviewer.jupyter.org/urls/dessimozlab.github.io/go-handbook/GO%20Tutorial%20in%20Python%20-%20Solutions.ipynb 4 | 5 | from goatools import obo_parser 6 | import os 7 | import argparse 8 | import pandas as pd 9 | import sys 10 | import subprocess 11 | 12 | parser = argparse.ArgumentParser(description = "Get quick information on individual GO terms. For version info, run `bit-version`.") 13 | parser.add_argument('GO-term', metavar = "", help = 'GO term you want to investigate, e.g. "GO:0010501"') 14 | 15 | parser.add_argument("-g", "--GO-obo-file", metavar = "", help = 'GO obo file to use (e.g. from: geneontology.org/docs/download-ontology/). By default will \ 16 | use "go-basic.obo". "goslim_metagenomics.obo" is also a pre-packaged option (enter `-g goslim_metagenomics` to specify it). 
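For example, a call using that pre-packaged slim obo (with the same GO term used as an example above) might look like:
    bit-get-go-term-info GO:0010501 -g goslim_metagenomics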
Or \ 17 | a different obo-formatted file can be specified here.', 18 | action = "store", dest = "obo", default = "go_basic") 19 | 20 | parser.add_argument("--parents-only", help = "Add this flag to report parents only, and no children.", action = "store_true") 21 | 22 | 23 | if len(sys.argv)==1: 24 | parser.print_help(sys.stderr) 25 | sys.exit(0) 26 | 27 | args = parser.parse_args() 28 | 29 | pd.set_option('display.max_colwidth', None) 30 | 31 | ### checking and setting up obo file location 32 | go_data_dir = os.environ["GO_DB_DIR"] 33 | 34 | ## downloading default GO databases if they are not present already 35 | checking_db_dir = subprocess.run(["helper-bit-setup-GO-dbs"]) 36 | 37 | if args.obo == "goslim_metagenomics": 38 | go_obo = go_data_dir + "goslim_metagenomics.obo" 39 | 40 | elif args.obo == "go_basic": 41 | go_obo = go_data_dir + "go-basic.obo" 42 | 43 | else: 44 | go_obo = args.obo 45 | 46 | ## loading GO database 47 | print("\n\tGO obo file being used:") 48 | go = obo_parser.GODag(go_obo) 49 | print("") 50 | 51 | input_go_id = args.GO_term 52 | 53 | # adding "GO:" if not in input 54 | if not input_go_id.startswith("GO:"): 55 | input_go_id = "GO:" + input_go_id 56 | 57 | # trying to pull GO id from database, if not, quitting and reporting 58 | try: 59 | input_go_term = go[input_go_id] 60 | except: 61 | print(str(input_go_id) + " does not seem to be in the GO database :(\n") 62 | sys.exit() 63 | 64 | def get_general_info(go_id): 65 | go_term = go[go_id] 66 | name = go_term.name 67 | namespace = go_term.namespace 68 | depth = go_term.depth 69 | 70 | go_term_info = [go_id, namespace, depth, name] 71 | return go_term_info 72 | 73 | curr_go_info = get_general_info(input_go_id) 74 | 75 | header = ["GO id", "namespace", "depth", "name"] 76 | 77 | # getting current term info 78 | input_df = pd.DataFrame([curr_go_info], columns = header) 79 | 80 | print("Input GO term info:") 81 | print(input_df.to_string(index=False)) 82 | 83 | # getting parent terms and their info 84 | parents = input_go_term.get_all_parents() 85 | 86 | if parents: 87 | parent_df = pd.DataFrame([]) 88 | 89 | for term in parents: 90 | curr_parent_info = get_general_info(term) 91 | parent_df = parent_df.append([curr_parent_info]) 92 | 93 | print("\nParent terms info:") 94 | print(parent_df.to_string(index=False, header = header)) 95 | 96 | else: 97 | print("\nThere are no parent terms for " + str(input_go_id) + ".") 98 | 99 | # getting child terms and their info unless --parents-only flag was specified 100 | if not args.parents_only: 101 | children = input_go_term.get_all_children() 102 | 103 | if children: 104 | child_df = pd.DataFrame([]) 105 | 106 | for term in children: 107 | curr_child_info = get_general_info(term) 108 | child_df = child_df.append([curr_child_info]) 109 | 110 | print("\nChild terms info:") 111 | print(child_df.to_string(index=False, header = header)) 112 | 113 | else: 114 | print("\nThere are no child terms for " + str(input_go_id) + ".") 115 | 116 | print("") 117 | -------------------------------------------------------------------------------- /bit/bit-get-lineage-from-taxids: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | # setting colors to use 5 | GREEN='\033[0;32m' 6 | RED='\033[0;31m' 7 | NC='\033[0m' 8 | 9 | 10 | if [ "$#" == 0 ] || [ $1 == "-h" ] || [ $1 == "help" ] || [ $1 == "--help" ]; then 11 | printf "\n This script uses taxonkit in a standard fashion to get lineage info from NCBI taxids.\n" 12 | printf " It 
expects a single column file of taxids with no header, return table in the same order.\n" 13 | printf " Thanks go to taxonkit, don't forget to cite that if using: https://bioinf.shenwei.me/taxonkit/.\n" 14 | printf " Add the '-s' flag to include strain info if available. For version info, run \`bit-version\`.\n\n" 15 | printf " Usage:\n\t bit-get-lineage-from-taxids -i taxids.txt -o lineages.tsv\n\n" 16 | exit 17 | fi 18 | 19 | # setting defaults 20 | output_file="lineages.tsv" 21 | include_strain=false 22 | ## parsing arguments 23 | while getopts :i:o:s args 24 | do 25 | case "${args}" 26 | in 27 | i) taxids_file=${OPTARG};; 28 | o) output_file=${OPTARG};; 29 | s) include_strain=true;; 30 | \?) printf "\n ${RED}Invalid argument: -${OPTARG}${NC}\n\n Run 'bit-get-lineage-from-taxids' with no arguments or '-h' only to see help menu.\n\n" >&2 && exit 31 | esac 32 | done 33 | 34 | ## checking variables are good 35 | if [ -z $taxids_file ]; then 36 | printf "\n Please specify an input taxid file to the '-i' argument.\n" 37 | printf "\nExiting for now.\n\n" 38 | exit 39 | fi 40 | 41 | 42 | if [ ! -f $taxids_file ]; then 43 | printf "\n The specified input file, $taxids_file, doesn't seem to be where we think it is :( \n" 44 | printf "\nExiting for now.\n\n" 45 | exit 46 | fi 47 | 48 | 49 | ### checking that ncbi tax data is present already, and downloading if it isn't 50 | helper-bit-get-ncbi-tax-data 51 | 52 | if [ "${include_strain}" = true ]; then 53 | taxonkit_reformat_pattern='{domain|superkingdom}\t{phylum}\t{class}\t{order}\t{family}\t{genus}\t{species}\t{strain|subspecies|no rank}' 54 | header="taxid\tdomain\tphylum\tclass\torder\tfamily\tgenus\tspecies\tstrain\n" 55 | else 56 | taxonkit_reformat_pattern='{domain|superkingdom}\t{phylum}\t{class}\t{order}\t{family}\t{genus}\t{species}' 57 | header="taxid\tdomain\tphylum\tclass\torder\tfamily\tgenus\tspecies\n" 58 | fi 59 | 60 | cat $taxids_file | taxonkit lineage | taxonkit reformat2 -r NA -f "${taxonkit_reformat_pattern}" | cut -f 1,3- | tr ";" "\t" > lineages.tmp 61 | 62 | cat <(printf "${header}") lineages.tmp > $output_file 63 | 64 | rm lineages.tmp 65 | -------------------------------------------------------------------------------- /bit/bit-get-test-data: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | This is a program for downloading test data files. 5 | """ 6 | 7 | import sys 8 | import os 9 | import argparse 10 | import textwrap 11 | 12 | parser = argparse.ArgumentParser(description = "This is a helper program for downloading test data for use with bit workflows and programs. For bit verison info run `bit-version`. ", 13 | epilog = "Ex. 
usage: bit-get-test-data metagenomics\n") 14 | 15 | required = parser.add_argument_group('required arguments') 16 | 17 | required.add_argument('datatype', choices = ['metagenomics'], 18 | help = "The first positional argument should be what type of test data you'd like to download") 19 | 20 | if len(sys.argv)==1: 21 | parser.print_help(sys.stderr) 22 | sys.exit(0) 23 | 24 | args = parser.parse_args() 25 | 26 | 27 | ################################################################################ 28 | 29 | def main(): 30 | 31 | dl_test_data() 32 | 33 | ################################################################################ 34 | 35 | ### variables and functions ### 36 | 37 | tty_colors = { 38 | 'green' : '\033[0;32m%s\033[0m', 39 | 'yellow' : '\033[0;33m%s\033[0m', 40 | 'red' : '\033[0;31m%s\033[0m' 41 | } 42 | 43 | def color_text(text, color='green'): 44 | if sys.stdout.isatty(): 45 | return tty_colors[color] % text 46 | else: 47 | return text 48 | 49 | 50 | def wprint(text): 51 | print(textwrap.fill(text, width=80, initial_indent=" ", 52 | subsequent_indent=" ", break_on_hyphens=False)) 53 | 54 | 55 | def report_message(message, color = "yellow"): 56 | print("") 57 | wprint(color_text(message, color)) 58 | 59 | 60 | def dl_test_data(): 61 | 62 | """ main function for downloading test data """ 63 | 64 | if args.datatype == "metagenomics": 65 | 66 | report_message("Downloading and unpacking 2 paired-end Illumina metagenomics test samples (4 files, ~800 MB total; they are kinda large for test data so MAGs can be recovered):") 67 | print("") 68 | 69 | # getting the metagenomics test data 70 | os.system("curl -L -o test-metagenomics-reads.zip https://figshare.com/ndownloader/files/46096083") 71 | 72 | # extracting 73 | os.system("unzip -qo test-metagenomics-reads.zip") 74 | 75 | # removing archive 76 | os.system("rm test-metagenomics-reads.zip") 77 | 78 | report_message("Pulled metagenomics (Illumina) reads for two test samples from here:", "green") 79 | print(" https://figshare.com/account/projects/203736/articles/25750935\n") 80 | 81 | else: 82 | 83 | report_message("The data type you requested is not currently available.", "red") 84 | 85 | print("\n Please check the currently available data types with 'bit-get-test-data --help'\n") 86 | 87 | 88 | ################################################################################ 89 | 90 | if __name__ == "__main__": 91 | main() 92 | -------------------------------------------------------------------------------- /bit/bit-kraken2-to-taxon-summaries: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | This script parses the regular read-based output of a kraken2 run, producing a table of full standard taxonomic lineages of all 5 | classifications (filling in NAs for lower ranks), with counts of how many reads went to that specific taxon. 
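An illustrative invocation (file names are hypothetical) might look like:
    bit-kraken2-to-taxon-summaries -i kraken2-read-classifications.tsv -o taxon-summary.tsv --names-included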
Depends on another `bit` program 6 | and taxonkit, if `bit` was installed with conda, all should be swell :) 7 | 8 | Input table expected to look like this: 9 | 10 | U A00159:145:H75T2DMXX:1:1101:7735:13792 unclassified (taxid 0) 16 0:0 11 | U A00159:145:H75T2DMXX:1:1101:11216:13557 unclassified (taxid 0) 30 0:0 12 | U A00159:145:H75T2DMXX:1:1101:22688:14074 unclassified (taxid 0) 26 0:0 13 | U A00159:145:H75T2DMXX:1:1101:1325:14559 unclassified (taxid 0) 31 0:0 14 | U A00159:145:H75T2DMXX:1:1101:23719:15013 unclassified (taxid 0) 30 0:0 15 | C A00159:145:H75T2DMXX:1:1102:11388:8312 Ochrobactrum (taxid 528) 194 0:12 1224:16 28211:7 528:15 16 | U A00159:145:H75T2DMXX:1:1102:15465:8390 unclassified (taxid 0) 27 0:0 17 | U A00159:145:H75T2DMXX:1:1102:6343:7560 unclassified (taxid 0) 271 0:237 18 | U A00159:145:H75T2DMXX:1:1102:30101:11600 unclassified (taxid 0) 26 0:0 19 | U A00159:145:H75T2DMXX:1:1101:19678:2221 unclassified (taxid 0) 279 0:245 20 | 21 | Unclassified are reported on one row with "Unclassified" specified at each rank. If names are included, like the example above, you would need to add the '--names-included' flag when running the program. 22 | """ 23 | 24 | import sys 25 | import argparse 26 | import pandas as pd 27 | import subprocess 28 | import os 29 | import re 30 | 31 | parser = argparse.ArgumentParser(description = "This script parses the regular read-based output of a kraken2 run, producing a table of full standard taxonomic lineages of all \ 32 | classifications (filling in NAs for lower ranks), with counts of how many reads went to that specific taxon. For version info, run `bit-version`.") 33 | 34 | required = parser.add_argument_group('required arguments') 35 | 36 | required.add_argument("-i", "--input-tsv", metavar = "", help = "Input table produced by kraken2 run", action = "store", dest = "input_file", required = True) 37 | parser.add_argument("-o", "--output-tsv", metavar = "", help = 'Output table name (default: "output.tsv")', action = "store", dest = "output_file", default = "output.tsv") 38 | parser.add_argument("--names-included", help = 'Add this flag if kraken2 was run with the `--use-names` flag', action = "store_true") 39 | 40 | if len(sys.argv)==1: 41 | parser.print_help(sys.stderr) 42 | sys.exit(0) 43 | 44 | args = parser.parse_args() 45 | 46 | # initializing stuff 47 | unclassified_count = 0 48 | taxid_counts_dict = {} 49 | 50 | # iterating through read classifications 51 | with open(args.input_file, "r") as classifications: 52 | for line in classifications: 53 | 54 | # adding to unclassified count and moving on if unclassified 55 | if line.startswith("U"): 56 | unclassified_count += 1 57 | continue 58 | 59 | # gettig taxid classification of current read 60 | if args.names_included: 61 | classification = line.strip().split("\t")[2] 62 | taxid = re.split('\(taxid ', classification)[1].rstrip(")") 63 | else: 64 | taxid = line.strip().split("\t")[2] 65 | 66 | # adding count to taxid in taxid dictionary if present 67 | if taxid in taxid_counts_dict: 68 | taxid_counts_dict[taxid] += 1 69 | 70 | # adding to taxid dictionary if current taxid not yet present 71 | else: 72 | taxid_counts_dict[taxid] = 1 73 | 74 | # getting standard lineage for each taxid 75 | # writing out taxids to temp file 76 | with open("bit-convert-kraken2.tmp", "w") as tmp_taxid_file: 77 | for key in taxid_counts_dict: 78 | tmp_taxid_file.write(str(key) + "\n") 79 | 80 | # getting taxid lineages 81 | running_bit_get_lineages_from_taxids = subprocess.run(["bit-get-lineage-from-taxids", 
"-i", "bit-convert-kraken2.tmp", "-o", "bit-convert-kraken2-lineages.tmp"], stdout=subprocess.DEVNULL) 82 | 83 | # reading in results as table 84 | lineage_tab = pd.read_csv("bit-convert-kraken2-lineages.tmp", sep="\t") 85 | 86 | # converting taxid dict to dataframe 87 | taxid_counts_df = pd.DataFrame.from_dict(taxid_counts_dict, orient="index").reset_index() 88 | # moving index to column and setting column names 89 | taxid_counts_df.rename(columns={"index":"taxid", 0:"read_counts"}, inplace=True) 90 | 91 | # setting to integer type so can be merged with lineage tab 92 | taxid_counts_df = taxid_counts_df.astype({"taxid":'int64'}) 93 | 94 | # merging 95 | combined_tab = lineage_tab.merge(taxid_counts_df).fillna("NA") 96 | 97 | # adding in unclassified row 98 | # pandas append deprecated and dropped as of pandas 2.0, using concat below 99 | # combined_tab = combined_tab.append({"taxid":0, "domain":"Unclassified", "phylum":"Unclassified", "class":"Unclassified", "order":"Unclassified", "family":"Unclassified", "genus":"Unclassified", "species":"Unclassified", "read_counts":unclassified_count}, ignore_index=True) 100 | unclassified_dict = {"taxid":0, "domain":"Unclassified", "phylum":"Unclassified", "class":"Unclassified", "order":"Unclassified", "family":"Unclassified", "genus":"Unclassified", "species":"Unclassified", "read_counts":unclassified_count} 101 | unclassified_tab = pd.DataFrame(unclassified_dict, index = [0]).reset_index() 102 | # dropping index column 103 | unclassified_tab = unclassified_tab.drop("index", axis = "columns") 104 | 105 | combined_tab = pd.concat([combined_tab, unclassified_tab], ignore_index = True) 106 | 107 | # adding in percent column 108 | combined_tab['percent_of_reads'] = combined_tab.read_counts / combined_tab.read_counts.sum() * 100 109 | 110 | # sorting 111 | combined_tab.sort_values(by=["taxid"], inplace=True) 112 | 113 | # writing out 114 | combined_tab.to_csv(args.output_file, sep="\t", header=True, index=False) 115 | 116 | # removing intermediate files 117 | os.remove("bit-convert-kraken2.tmp") 118 | os.remove("bit-convert-kraken2-lineages.tmp") 119 | -------------------------------------------------------------------------------- /bit/bit-lineage-to-tsv: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import re 5 | 6 | parser = argparse.ArgumentParser(description = 'This script converts lineages in this format (e.g., "root;d__Bacteria;p__Proteobacteria") into consistent tsv format \ 7 | (e.g., "Bacteria\tProteobacteria\tNA\tNA\tNA\tNA\tNA"). It expects as input a 2-column tab-delimited file with column \ 8 | 1 holding an identifier and column 2 holding the lineage. 
For version info, run `bit-version`.') 9 | 10 | required = parser.add_argument_group('required arguments') 11 | 12 | required.add_argument("-i", "--input-tsv", metavar = "", help = 'input table, first column needs to be an identifier, second column the lineage', action = "store", required = True) 13 | 14 | parser.add_argument("-o", "--output-tsv", metavar = "", help = 'output file (default: "formatted-tax.tsv")', action = "store", default = "formatted-tax.tsv") 15 | 16 | parser.add_argument("--make-taxid", help = "Provide this flag to make a unique taxid (string of all rank fields) for each lineage \ 17 | (will be added as second column of output)", action = "store_true") 18 | 19 | args = parser.parse_args() 20 | 21 | 22 | # helper function 23 | def get_rank(lineage, prefix): 24 | 25 | if lineage.startswith(prefix): 26 | 27 | curr_rank = lineage.split(";")[0].replace(prefix, "", 1) 28 | 29 | lineage = re.sub(f"^{prefix}{curr_rank};", "", lineage) 30 | 31 | else: 32 | 33 | curr_rank = "NA" 34 | 35 | return(lineage, curr_rank) 36 | 37 | 38 | # converting 39 | with open(args.input_tsv) as in_tab: 40 | 41 | with open(args.output_tsv, "w") as out_tab: 42 | 43 | # adding header 44 | if args.make_taxid: 45 | 46 | out_tab.write("seq_ID\ttaxid\tdomain\tphylum\tclass\torder\tfamily\tgenus\tspecies\n") 47 | 48 | else: 49 | 50 | out_tab.write("seq_ID\tdomain\tphylum\tclass\torder\tfamily\tgenus\tspecies\n") 51 | 52 | for line in open(args.input_tsv): 53 | 54 | line = line.strip().split("\t") 55 | ID = line[0] 56 | 57 | # this is if there is no second column (no lineage) 58 | if len(line) == 1: 59 | 60 | out_line = f"{ID}\tNA\tNA\tNA\tNA\tNA\tNA\tNA" 61 | 62 | else: 63 | 64 | lineage = line[1] 65 | 66 | # removing "root" if that's at the start 67 | if lineage.startswith("root;"): 68 | lineage = re.sub("^root;", "", lineage) 69 | 70 | # getting all ranks present, setting to NA if not 71 | lineage, t_domain = get_rank(lineage, "d__") 72 | lineage, t_phylum = get_rank(lineage, "p__") 73 | lineage, t_class = get_rank(lineage, "c__") 74 | lineage, t_order = get_rank(lineage, "o__") 75 | lineage, t_family = get_rank(lineage, "f__") 76 | lineage, t_genus = get_rank(lineage, "g__") 77 | lineage, t_species = get_rank(lineage, "s__") 78 | 79 | if args.make_taxid: 80 | 81 | taxid_string = "_".join([t_domain, t_phylum, t_class, t_order, t_family, t_genus, t_species]).replace(" ", "_") 82 | out_line = "\t".join([ID, taxid_string, t_domain, t_phylum, t_class, t_order, t_family, t_genus, t_species]) 83 | 84 | else: 85 | 86 | out_line = "\t".join([ID, t_domain, t_phylum, t_class, t_order, t_family, t_genus, t_species]) 87 | 88 | out_tab.write(out_line + "\n") 89 | -------------------------------------------------------------------------------- /bit/bit-normalize-table: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Expects rows to be units (e.g. genes/KOs/etc.), and columns to be samples. 5 | 6 | This script normalizes a table for sampling depth by either coverage per million (CPM) or based on the median-ratio 7 | method as performed in DESeq2. But unlike DESeq2, we don't care here if there are floats in there. 8 | 9 | I initially wrote this for normalizing metagenomic coverage data, like gene-level coverage, or summed KO coverages. 10 | These are normalized for gene-length already because they are "coverages", but they are not yet normalized 11 | for sampling depth – which is where this script comes in. 
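For a small worked example of the CPM option described further below: a sample column holding coverages of 2, 3, and 5 sums to 10, so the CPM-normalized values would be 200,000, 300,000, and 500,000, and that column would then sum to 1 million.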
12 | 13 | I also found myself wanting this because I wanted to do differential abundance testing of coverages 14 | of KO terms. DESeq2 doesn't require normalizing for gene-length because it is the same unit being analyzed 15 | across all samples – the same gene, so the same size. However, after grouping genes into their KO annotations, 16 | (which we may need to compare across samples that don't all share the same underlying assembly or genes), 17 | they no longer all represent the same units across all samples. It is because of this I decided to stick with 18 | gene-level coverages (which are normalized for gene-length), and then sum those values based on KO annotations. 19 | 20 | The CPM (coverage per million) normalization is just like a percent, except scaled to 1 million instead of 100. 21 | So each row's entry (e.g. gene/KO/etc.) is the proportion out of 1 million for that column (sample), 22 | and each column will sum to 1 million. 23 | 24 | The median-ratio normalization method (MR) was initially described in this paper 25 | (http://dx.doi.org/10.1186/gb-2010-11-10-r106; eq. 5), and this site is super-informative in general 26 | about the DESeq2 process overall, and helped me understand the normalization process better to implement it: 27 | https://hbctraining.github.io/DGE_workshop/lessons/02_DGE_count_normalization.html. Columns will not sum to 28 | the same amount when the median-ratio method is applied. 29 | """ 30 | 31 | import os 32 | import sys 33 | import argparse 34 | import pandas as pd 35 | import numpy as np 36 | from scipy.stats.mstats import gmean 37 | 38 | parser = argparse.ArgumentParser(description = "This script normalizes a coverage table for sampling depth based on either \ 39 | coverage per million (CPM) or the median-ratio method (MR) as performed \ 40 | in DESeq2. See note at top of script for more info. It expects a \ 41 | tab-delimited table with samples as columns and units (e.g. genes/KOs/etc.) \ 42 | as rows. For version info, run `bit-version`.") 43 | 44 | required = parser.add_argument_group('required arguments') 45 | 46 | required.add_argument("-i", "--input-table", metavar = "", help = "Input tab-delimited table", action = "store", required = True) 47 | 48 | parser.add_argument("-n", "--normalization-method", help = 'Desired normalization method of either \ 49 | "CPM" as in coverage per million, or "MR" as in median-ratio as performed in DESeq2. \ 50 | See note at top of program for more info. 
(default: "CPM")', choices = ["CPM", "MR"], \ 51 | action = "store", default = "CPM") 52 | 53 | parser.add_argument("-o", "--output-table", metavar = "", help = 'Output filename (default: "Normalized.tsv")', action = "store", default = "Normalized.tsv") 54 | 55 | if len(sys.argv)==1: 56 | parser.print_help(sys.stderr) 57 | sys.exit(0) 58 | 59 | args = parser.parse_args() 60 | 61 | ################################################################################ 62 | 63 | tab = pd.read_csv(args.input_table, sep = "\t", index_col = 0, low_memory = False) 64 | 65 | 66 | ## removing columns if they have all zeroes in them prior to normalization to avoid problems (will put them back) 67 | column_sums = tab.sum() 68 | 69 | # getting all column names in order so can rearrange afterwards 70 | ordered_columns = tab.columns.tolist() 71 | 72 | # getting column names of those with all zeroes 73 | zero_column_names = column_sums[column_sums == 0].index.tolist() 74 | 75 | tab.drop(zero_column_names, axis = 1, inplace = True) 76 | 77 | 78 | if args.normalization_method == "CPM": 79 | 80 | norm_tab = tab / tab.sum() * 1000000 81 | 82 | else: 83 | 84 | ## calculating size factors 85 | # getting geometric means for each row 86 | with np.errstate(divide = 'ignore'): 87 | geomeans = gmean(tab, axis = 1) 88 | 89 | # getting ratios of gene values to geometric means 90 | ratios_tab = (tab.T / geomeans).T 91 | 92 | sizeFactors = ratios_tab[geomeans > 0].median().to_list() 93 | 94 | # dividing by size factors 95 | norm_tab = tab / sizeFactors 96 | 97 | 98 | ## adding back on columns with all zeroes 99 | if len(zero_column_names) > 0: 100 | for col in zero_column_names: 101 | norm_tab[col] = 0.0 102 | 103 | # reordering to match input 104 | norm_tab = norm_tab[ordered_columns] 105 | 106 | # writing out normalized table 107 | norm_tab.to_csv(args.output_table, sep = "\t") 108 | -------------------------------------------------------------------------------- /bit/bit-parse-fasta-by-headers: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from Bio import SeqIO 4 | import sys 5 | import argparse 6 | import os 7 | import gzip 8 | 9 | parser = argparse.ArgumentParser(description = 'This script is for parsing a fasta file by pulling out sequences with the desired headers. If you want all sequences EXCEPT the ones with the headers you are providing, add the flag "--inverse". 
For version info, run `bit-version`.') 10 | 11 | required = parser.add_argument_group('required arguments') 12 | 13 | required.add_argument("-i", "--input-fasta", metavar = "", help = "Original fasta file", action = "store", required = True) 14 | required.add_argument("-w", "--sequence-headers", metavar = "", help = "Single-column file with target sequence headers", action = "store", dest = "headers", required = True) 15 | parser.add_argument("-o", "--output-fasta", metavar = "", help = 'Output fasta file (default: "wanted.fasta")', action = "store", default = "wanted.fasta") 16 | parser.add_argument("--inverse", help = "Add this flag to pull out all sequences with headers NOT in the provided header file.", action = "store_true") 17 | parser.add_argument("--gz", help = "Add this flag if the input is gzipped (does not gzip output)", action = "store_true") 18 | 19 | if len(sys.argv)==1: 20 | parser.print_help(sys.stderr) 21 | sys.exit(0) 22 | 23 | args = parser.parse_args() 24 | 25 | if not args.gz: 26 | 27 | fasta_in = open(args.input_fasta, "r") 28 | 29 | else: 30 | 31 | fasta_in = gzip.open(args.input_fasta, "rt") 32 | 33 | 34 | headers_of_int = open(args.headers, "r") 35 | 36 | headers_of_int_set = set(line.strip() for line in headers_of_int) 37 | headers_of_int.close() 38 | 39 | fasta_out = open(args.output_fasta, "w") 40 | 41 | if not args.inverse: 42 | 43 | for seq_record in SeqIO.parse(fasta_in, "fasta"): 44 | if seq_record.id in headers_of_int_set: 45 | fasta_out.write(">" + seq_record.id + "\n") 46 | fasta_out.write(str(seq_record.seq) + "\n") 47 | 48 | else: 49 | 50 | for seq_record in SeqIO.parse(fasta_in, "fasta"): 51 | if seq_record.id not in headers_of_int_set: 52 | fasta_out.write(">" + seq_record.id + "\n") 53 | fasta_out.write(str(seq_record.seq) + "\n") 54 | 55 | fasta_in.close() 56 | fasta_out.close() 57 | -------------------------------------------------------------------------------- /bit/bit-parse-fastq-by-headers: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import os 5 | import argparse 6 | import textwrap 7 | from Bio.SeqIO.QualityIO import FastqGeneralIterator 8 | import gzip 9 | import subprocess 10 | 11 | 12 | parser = argparse.ArgumentParser(description = 'This script is for parsing a single fastq file by pulling out sequences with the desired headers (paired-end not supported yet).\ 13 | For version info, run `bit-version`.') 14 | 15 | required = parser.add_argument_group('required arguments') 16 | 17 | required.add_argument("-i", "--input-fastq", metavar = "", help = "Starting fastq file", action = "store", required = True) 18 | required.add_argument("-w", "--sequence-headers", metavar = "", help = "Single-column file with target sequence headers (if the headers in the fastq file have whitespace in them, it is okay to provide just the part up to the whitespace in this input file)", action = "store", required = True) 19 | parser.add_argument("-o", "--output-fastq", metavar = "", help='Output fastq file name (default: "wanted.fq", ".gz" will be added if compressed)', action = "store", default = "wanted.fq") 20 | parser.add_argument("--inverse", help = "Add this flag to pull out all sequences with headers NOT in the provided header file.", action = "store_true") 21 | parser.add_argument("--gz", help = "Add this flag if the input is gzipped (output will be too)", action = "store_true") 22 | 23 | if len(sys.argv)==1: 24 | parser.print_help(sys.stderr) 25 | sys.exit(0) 26 | 27 | 
args = parser.parse_args() 28 | 29 | ################################################################################ 30 | 31 | def main(): 32 | 33 | check_all_inputs_exist([args.input_fastq, args.sequence_headers]) 34 | 35 | check_if_output_already_exists(args.output_fastq) 36 | 37 | # reading headers into a set 38 | headers_of_int_set = set(line.strip() for line in open(args.sequence_headers, "r")) 39 | 40 | parse_fastq(headers_of_int_set, args.input_fastq, args.output_fastq, args.inverse, args.gz) 41 | 42 | if args.gz: 43 | 44 | subprocess.run(["pigz", args.output_fastq]) 45 | 46 | ################################################################################ 47 | 48 | 49 | # setting some colors 50 | tty_colors = { 51 | 'green' : '\033[0;32m%s\033[0m', 52 | 'yellow' : '\033[0;33m%s\033[0m', 53 | 'red' : '\033[0;31m%s\033[0m' 54 | } 55 | 56 | 57 | ### functions ### 58 | def color_text(text, color='green'): 59 | if sys.stdout.isatty(): 60 | return tty_colors[color] % text 61 | else: 62 | return text 63 | 64 | 65 | def wprint(text): 66 | """ print wrapper """ 67 | 68 | print(textwrap.fill(text, width=80, initial_indent=" ", 69 | subsequent_indent=" ", break_on_hyphens=False)) 70 | 71 | 72 | def check_all_inputs_exist(input_list): 73 | 74 | for file in input_list: 75 | if not os.path.exists(file): 76 | print("") 77 | wprint(color_text("It seems the specified input file '" + str(file) + "' can't be found.", "yellow")) 78 | print("\nExiting for now.\n") 79 | sys.exit(1) 80 | 81 | 82 | def check_if_output_already_exists(planned_output): 83 | 84 | # making sure outputs don't already exist, exiting if they do 85 | 86 | if os.path.exists(planned_output): 87 | 88 | print("") 89 | wprint(color_text("It seems the expected output (or intermediate) file '" + str(planned_output) + "' already exists.", "yellow")) 90 | print("") 91 | wprint("We don't want to overwrite something accidentally, so rename or remove that first if wanting to proceed.") 92 | print("\nExiting for now.\n") 93 | sys.exit(1) 94 | 95 | 96 | def parse_fastq(headers_of_int_set, input_fastq, output_fastq, inverse, gz): 97 | 98 | 99 | if gz: 100 | 101 | fastq_in = gzip.open(input_fastq, "rt") 102 | 103 | else: 104 | 105 | fastq_in = open(input_fastq, "rt") 106 | 107 | 108 | if not args.inverse: 109 | 110 | with open(output_fastq, "w") as output_file: 111 | 112 | for header, seq, qual in FastqGeneralIterator(fastq_in): 113 | 114 | if header in headers_of_int_set or header.split(" ")[0] in headers_of_int_set: 115 | 116 | output_file.write("@%s\n%s\n+\n%s\n" % (header, seq, qual)) 117 | 118 | else: 119 | 120 | with open(output_fastq, "w") as output_file: 121 | 122 | for header, seq, qual in FastqGeneralIterator(fastq_in): 123 | 124 | if header not in headers_of_int_set and header.split(" ")[0] not in headers_of_int_set: 125 | 126 | output_file.write("@%s\n%s\n+\n%s\n" % (header, seq, qual)) 127 | 128 | fastq_in.close() 129 | 130 | 131 | if __name__ == "__main__": 132 | main() 133 | -------------------------------------------------------------------------------- /bit/bit-prot-acc-to-taxid: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import argparse 5 | import os 6 | 7 | class colors: 8 | GREEN = '\033[0;32m' 9 | NC = '\033[0m' 10 | 11 | parser = argparse.ArgumentParser(description = 'This script takes NCBI protein accessions and returns a two-column \ 12 | tab-delimited file with protein accessions and taxids. 
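As an illustrative example (the wanted-accessions file name is hypothetical), a call might look like:
    bit-prot-acc-to-taxid -r prot.accession2taxid -w wanted-prot-accessions.txt -o wanted-prot-accs-and-taxids.tsv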
It requires the \ 13 | "prot.accession2taxid" database (unzipped) that can be downloaded from here: \ 14 | ftp://ftp.ncbi.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.gz\ 15 | For version info, run `bit-version`.') 16 | 17 | required = parser.add_argument_group('required arguments') 18 | 19 | required.add_argument("-r", "--ref-map", metavar = "", help = "reference prot_acc_to_taxid_map database", action = "store", dest = "input_ref", required = True) 20 | required.add_argument("-w", "--wanted_prot_accessions", metavar = "", help = "Single-column file with protein accessions", action = "store", dest = "prot_accs", required = True) 21 | parser.add_argument("-o", "--output-file", metavar = "", help = 'Output file of prot_acc and taxID (default: "wanted-prot-accs-and-taxids.tsv")', action = "store", dest = "file_out", default = "wanted-prot-accs-and-taxids.tsv") 22 | 23 | if len(sys.argv)==1: 24 | parser.print_help(sys.stderr) 25 | sys.exit(0) 26 | 27 | args = parser.parse_args() 28 | 29 | wanted_accs = open(args.prot_accs, "r") 30 | 31 | wanted_accs_set = set(line.strip() for line in wanted_accs) 32 | 33 | output_file = open(args.file_out, "w") 34 | 35 | num_found = 0 36 | 37 | accs_found = [] 38 | 39 | output_file.write("prot_accession\ttaxid\n") 40 | 41 | num = 0 42 | 43 | print("\nNow beginning trek through reference mapping file.\n") 44 | 45 | with open(args.input_ref) as refs: 46 | 47 | for line in refs: 48 | line = line.split("\t") 49 | num += 1 50 | 51 | if num % 1000000 == 0: 52 | mega_num = num / 1000000 53 | sys.stdout.write("\r On line " + colors.GREEN + str(mega_num) + colors.NC + " million of prot_acc_to_taxid_map...") 54 | sys.stdout.flush() 55 | 56 | if line[1] in wanted_accs_set: 57 | output_file.write(line[1] + "\t" + line[2] + "\n") 58 | num_found += 1 59 | accs_found.append(line[1]) 60 | 61 | print("\n") 62 | 63 | wanted_accs_list = list(wanted_accs_set) 64 | 65 | print(' Adding in "NA"s for those protein accessions not found...\n') 66 | 67 | for acc in wanted_accs_list: 68 | if acc not in accs_found: 69 | output_file.write(acc + "\tNA\n") 70 | 71 | print(colors.GREEN + " Done!" + colors.NC + "\n") 72 | print(" You were looking for " + str(len(wanted_accs_list)) + " protein accessions.") 73 | print(" " + str(num_found) + ' were found. The rest were given taxids of "NA\".') 74 | 75 | output_file.close() 76 | wanted_accs.close() 77 | -------------------------------------------------------------------------------- /bit/bit-remove-wraps: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | RED='\033[0;31m' 4 | GREEN='\033[0;32m' 5 | NC='\033[0m' 6 | 7 | if [ "$#" == 0 ] || [ $1 == "-h" ]; then 8 | printf "\n This script removes line wraps from a fasta file. For version\n" 9 | printf " info, run \`bit-version\`.\n\n" 10 | printf " Usage:\n\t bit-remove-wraps input.fasta > new.fasta\n\n" 11 | exit 12 | fi 13 | 14 | if [ -f $1 ]; then 15 | awk '!/^>/ { printf "%s", $0; n="\n" } /^>/ { print n $0; n = "" } END { printf "%s", n }' "$1" 16 | echo -e " ${GREEN}Annoying line wraps removed! 
Cheers!${NC}" 1>&2 17 | 18 | else 19 | echo -e " ${RED}Input file not found :/${NC}" 1>&2 20 | exit 1 21 | fi 22 | -------------------------------------------------------------------------------- /bit/bit-rename-fasta-headers: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from Bio import SeqIO 4 | import sys 5 | import argparse 6 | 7 | parser = argparse.ArgumentParser( 8 | description=( 9 | "This script facilitates renaming headers in a fasta file. " 10 | "By default, each sequence is renamed to _ for n=1,2,… " 11 | "If you provide --prefix and/or --suffix, those strings will be " 12 | "prepended/appended to each original header. " 13 | "For version info, run `bit-version`." 14 | ) 15 | ) 16 | 17 | required = parser.add_argument_group('required arguments') 18 | 19 | required.add_argument("-i", "--input-fasta", metavar = "", help = "starting fasta file", action = "store", required = True) 20 | parser.add_argument("-w", "--wanted-text", metavar = "", help = 'base name to give seqs when renaming to "_" (default: "Seq")', action = "store", default = "Seq") 21 | parser.add_argument("-o", "--output-fasta", metavar = "", help = 'output fasta file (default: "renamed.fasta").', default = "renamed.fasta") 22 | 23 | parser.add_argument("--prefix", metavar = "", help = "prepend this text to the original header (include separator if wanted)", default = "") 24 | parser.add_argument("--suffix", metavar = "", help = "append this text to the original header (include separator if wanted)", default = "") 25 | 26 | 27 | if len(sys.argv)==1: 28 | parser.print_help(sys.stderr) 29 | sys.exit(0) 30 | 31 | args = parser.parse_args() 32 | 33 | do_numbering = (args.prefix == "" and args.suffix == "") 34 | 35 | try: 36 | input_fasta = open(args.input_fasta, "r") 37 | except FileNotFoundError: 38 | print(f"Error: Input file '{args.input_fasta}' not found.") 39 | sys.exit(1) 40 | 41 | out_fasta = open(args.output_fasta, "w") 42 | 43 | if do_numbering: 44 | counter = 0 45 | for seq_record in SeqIO.parse(input_fasta, "fasta"): 46 | counter += 1 47 | out_fasta.write(f">{args.wanted_text}_{counter}\n") 48 | out_fasta.write(str(seq_record.seq) + "\n") 49 | 50 | else: 51 | 52 | for seq_record in SeqIO.parse(input_fasta, "fasta"): 53 | # using description so it preserves the full header 54 | original_header = seq_record.description 55 | 56 | new_header = f"{args.prefix}{original_header}{args.suffix}" 57 | out_fasta.write(f">{new_header}\n{seq_record.seq}\n") 58 | 59 | input_fasta.close() 60 | out_fasta.close() 61 | -------------------------------------------------------------------------------- /bit/bit-reorder-fasta: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from Bio import SeqIO 4 | import sys 5 | import argparse 6 | import os 7 | 8 | parser = argparse.ArgumentParser('This script takes a multifasta file and reorders the sequences according to the headers provided. 
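An illustrative call (hypothetical file names) might look like:
    bit-reorder-fasta -i contigs.fasta -w wanted-header-order.txt -o reordered.fasta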
For version info, run `bit-version`.') 9 | 10 | required = parser.add_argument_group('required arguments') 11 | 12 | required.add_argument("-i", "--input-fasta", metavar = "", help = "Original fasta file", action = "store", required = True) 13 | required.add_argument("-w", "--wanted-sequence-order", metavar = "", help = "Single-column file with headers in desired order", action = "store", dest = "ordered_headers", required = True) 14 | parser.add_argument("-o", "--output-fasta", help = 'Reordered output fasta (default: "reordered.fasta")', default = "reordered.fasta") 15 | 16 | 17 | if len(sys.argv)==1: 18 | parser.print_help(sys.stderr) 19 | sys.exit(0) 20 | 21 | args = parser.parse_args() 22 | 23 | ordered_seqs = open(args.ordered_headers, "r") 24 | 25 | ordered_list = list(line.strip() for line in ordered_seqs) 26 | 27 | fasta_dict = SeqIO.index(args.input_fasta, "fasta") 28 | 29 | fasta_out = open(args.output_fasta, "wb") 30 | 31 | for header in ordered_list: 32 | fasta_out.write(fasta_dict.get_raw(header)) 33 | 34 | ordered_seqs.close() 35 | fasta_out.close() 36 | -------------------------------------------------------------------------------- /bit/bit-slim-down-go-terms: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ## learned most goatools/python things from this great tutorial: GO Tutorial in Python - Solutions.ipynb, which comes from here: http://gohandbook.org/doku.php ; https://nbviewer.jupyter.org/urls/dessimozlab.github.io/go-handbook/GO%20Tutorial%20in%20Python%20-%20Solutions.ipynb 4 | 5 | import os 6 | import argparse 7 | import pandas as pd 8 | import sys 9 | import subprocess 10 | 11 | parser = argparse.ArgumentParser(description = 'This script wraps the goatools `map_to_slim.py` program (github.com/tanghaibao/Goatools#map-go-terms-to-goslim-terms). \ 12 | See there for more details, and if you use it in your work, be sure to properly cite them :) \ 13 | https://www.nature.com/articles/s41598-018-28948-z. It is included here to streamline integration with \ 14 | with the GO databases stored with `bit` and programs like `bit-summarize-go-annotations`. Stored databases \ 15 | can be updated with `bit-update-go-dbs`. For version info, run `bit-version`.') 16 | 17 | required = parser.add_argument_group('required arguments') 18 | 19 | required.add_argument("-a", "--association-file", metavar = "", 20 | help = "Input annotations file. 2-column, tab-delimited, where the first column holds gene IDs, and the second column holds GO terms (can be multiple delimited with a semi-colon).", 21 | action = "store", dest = "input_ass_file", required = True) 22 | parser.add_argument("-g", "--initial-GO-obo-file", metavar = "", 23 | help='Initial GO obo file holding relationships of all terms used to perform the annotation (e.g. from: geneontology.org/docs/download-ontology/). By default \ 24 | this program will use "go-basic.obo" that is stored with `bit`. Or a different obo-formatted file can be specified here.', 25 | action = "store", dest = "initial_obo", default = "go_basic") 26 | parser.add_argument("-s", "--slimmed-GO-obo-file", metavar = "", 27 | help = 'Slimmed GO obo file holding relationships to collapse GO terms (e.g. from: geneontology.org/docs/download-ontology/#subsets;). By default will \ 28 | use "goslim_metagenomics.obo" that is stored with `bit`. 
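As an illustrative example (the annotations file name is hypothetical), a call might look like:
    bit-slim-down-go-terms -a gene-ID-to-GO-terms.tsv -m direct -o GO-slimmed.tsv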
Or a different obo-formatted file can be specified here.', 29 | action = "store", dest = "slimmed_obo", default = "goslim_metagenomics") 30 | parser.add_argument("-m", "--mode", help = 'Set if the slimmer should return only direct ancestors, or all ancestors. Default setting is to return all.', 31 | choices=["all", "direct"], action = "store", dest = "mode", default = "all") 32 | 33 | parser.add_argument("-o", "--output-file", metavar = "", help = 'Name for output slimmed annotation file. (default: "GO-slimmed.tsv").', action = "store", dest = "output_tab", default = "GO-slimmed.tsv") 34 | 35 | 36 | if len(sys.argv)==1: 37 | parser.print_help(sys.stderr) 38 | sys.exit(0) 39 | 40 | args = parser.parse_args() 41 | 42 | ### checking and setting up obo file locations 43 | go_data_dir = os.environ["GO_DB_DIR"] 44 | 45 | ## downloading default GO databases if they are not present already 46 | checking_db_dir = subprocess.run(["helper-bit-setup-GO-dbs"]) 47 | 48 | if args.initial_obo == "go_basic": 49 | initial_obo = go_data_dir + "go-basic.obo" 50 | 51 | else: 52 | initial_obo = args.initial_obo 53 | 54 | if args.slimmed_obo == "goslim_metagenomics": 55 | slim_obo = go_data_dir + "goslim_metagenomics.obo" 56 | 57 | else: 58 | slim_obo = args.slimmed_obo 59 | 60 | ### building and running call to map_to_slim.py 61 | with open(args.output_tab, "w") as output: 62 | map_to_slim = subprocess.run(["map_to_slim.py", "--association_file", args.input_ass_file, "--slim_out", args.mode, initial_obo, slim_obo], stdout=output) 63 | map_to_slim 64 | 65 | -------------------------------------------------------------------------------- /bit/bit-split-multifasta: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from Bio import SeqIO 4 | from pathlib import Path 5 | import sys 6 | import argparse 7 | import os 8 | 9 | parser = argparse.ArgumentParser(description = 'This script will split a multifasta into individual fasta files, each file named with the header of the sequence within it, written to a new subdirectory. (It expects standard characters in the headers only, e.g. no spaces or special characters). 
For version info, run `bit-version`.') 10 | 11 | required = parser.add_argument_group('required arguments') 12 | 13 | required.add_argument("-i", "--input-fasta", metavar = "", help = "Starting fasta file", action = "store", required = True) 14 | parser.add_argument("-d", "--subdirectory-name", help = 'Name of new subdirectory holding split sequences (default: "sub")', action = "store", dest = "subdirectory", default = "sub") 15 | 16 | if len(sys.argv)==1: 17 | parser.print_help(sys.stderr) 18 | sys.exit(0) 19 | 20 | args = parser.parse_args() 21 | 22 | out_dir = Path(str(args.subdirectory)) 23 | 24 | Path.mkdir(out_dir, parents=True, exist_ok=True) 25 | 26 | in_fasta = open(args.input_fasta, "r") 27 | 28 | for seq_record in SeqIO.parse(in_fasta, "fasta"): 29 | curr_header = str(seq_record.id) 30 | 31 | curr_out = open(str(args.subdirectory) + "/" + curr_header + ".fa", "w") 32 | curr_out.write(">" + str(seq_record.id) + "\n" + str(seq_record.seq) + "\n") 33 | curr_out.close() 34 | 35 | in_fasta.close() 36 | -------------------------------------------------------------------------------- /bit/bit-summarize-assembly: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import pandas as pd 4 | import pyfastx 5 | import sys 6 | import argparse 7 | import os 8 | 9 | 10 | parser = argparse.ArgumentParser(description = 'This script outputs general summary stats for an assembly provided \ 11 | in fasta format. If given an output file, writes out a tsv, otherwise \ 12 | prints to the screen. "Ambiguous characters" reports total counts of \ 13 | of any letter that is not "A", "T", "C", or "G". For version info, run \ 14 | `bit-version`.') 15 | 16 | required = parser.add_argument_group('required arguments') 17 | 18 | required.add_argument("input_assembly", metavar = "", type = str, nargs = "+", help = "Input assembly file(s).") 19 | 20 | parser.add_argument("-o", "--output-tsv", metavar = "", help = 'Name of output tsv file (if none provided, prints to screen)', action = "store", default = False) 21 | parser.add_argument("-t", "--transpose-output-tsv", help = 'Set this flag if we want to have the output table have genomes as rows rather than columns.', action = "store_true") 22 | 23 | 24 | if len(sys.argv)==1: 25 | parser.print_help(sys.stderr) 26 | sys.exit(0) 27 | 28 | args = parser.parse_args() 29 | 30 | ## setting up master dataframe 31 | df_colnames = [] 32 | for assembly in args.input_assembly: 33 | 34 | assembly_base = os.path.basename(assembly) 35 | 36 | df_colnames.append(assembly_base.rsplit(".", 1)[0]) 37 | 38 | # checking for a situation where inputs may have the same basename, due to being from different directories 39 | # if so, setting a flag and reporting them as the full input paths instead of just basenames 40 | use_paths_instead_of_basenames = False 41 | 42 | for assembly in df_colnames: 43 | 44 | num_occurences = 0 45 | 46 | for assembly_2 in df_colnames: 47 | 48 | if assembly_2 == assembly: 49 | 50 | num_occurences += 1 51 | 52 | if num_occurences > 1: 53 | 54 | use_paths_instead_of_basenames = True 55 | 56 | if use_paths_instead_of_basenames: 57 | 58 | df_colnames = [] 59 | 60 | for assembly in args.input_assembly: 61 | 62 | df_colnames.append(assembly.rsplit(".", 1)[0]) 63 | 64 | 65 | ## creating output table foundation 66 | df_index = ["Assembly", "Total contigs", "Total length", "Ambiguous characters", 67 | "GC content", "Maximum contig length", "Minimum contig length", "N50", 68 | "N75", "N90", "L50", "L75", 
"L90", "Num. contigs >= 100", 69 | "Num. contigs >= 500", "Num. contigs >= 1000", "Num. contigs >= 5000", 70 | "Num. contigs >= 10000", "Num. contigs >= 50000", "Num. contigs >= 100000"] 71 | 72 | df = pd.DataFrame(columns = df_colnames, index = df_index) 73 | 74 | for assembly in args.input_assembly: 75 | 76 | if use_paths_instead_of_basenames: 77 | 78 | assembly_name = assembly.rsplit(".", 1)[0] 79 | 80 | else: 81 | assembly_base = os.path.basename(assembly) 82 | 83 | assembly_name = assembly_base.rsplit(".", 1)[0] 84 | 85 | try: 86 | df.at["Assembly", str(assembly_name)] = assembly_name 87 | except AttributeError: 88 | print(" An attribute exception was thrown by pandas. Maybe the inputs don't have unique names?") 89 | print(" As written, this cuts off the extension based on last period to generate names.") 90 | sys.exit(1) 91 | 92 | # putting in a catch if file is empty (which can happen if an assembly produced no contigs) 93 | # this will leave it in the table, but with NAs (written out as "NA") 94 | if os.stat(assembly).st_size == 0: 95 | continue 96 | 97 | fasta = pyfastx.Fasta(assembly) 98 | 99 | df.at["Total contigs", str(assembly_name)] = len(fasta) 100 | df.at["Total length", str(assembly_name)] = fasta.size 101 | 102 | num_ambiguous_chars = 0 103 | for key in fasta.composition: 104 | if key not in ["A","T","G","C"]: 105 | num_ambiguous_chars += fasta.composition[key] 106 | 107 | df.at["Ambiguous characters", str(assembly_name)] = num_ambiguous_chars 108 | df.at["GC content", str(assembly_name)] = round(fasta.gc_content, 2) 109 | df.at["Maximum contig length", str(assembly_name)] = len(fasta.longest) 110 | df.at["Minimum contig length", str(assembly_name)] = len(fasta.shortest) 111 | 112 | info_at_50 = fasta.nl(50) 113 | info_at_75 = fasta.nl(75) 114 | info_at_90 = fasta.nl(90) 115 | df.at["N50", str(assembly_name)] = info_at_50[0] 116 | df.at["N75", str(assembly_name)] = info_at_75[0] 117 | df.at["N90", str(assembly_name)] = info_at_90[0] 118 | df.at["L50", str(assembly_name)] = info_at_50[1] 119 | df.at["L75", str(assembly_name)] = info_at_75[1] 120 | df.at["L90", str(assembly_name)] = info_at_90[1] 121 | 122 | df.at["Num. contigs >= 100", str(assembly_name)] = fasta.count(100) 123 | df.at["Num. contigs >= 500", str(assembly_name)] = fasta.count(500) 124 | df.at["Num. contigs >= 1000", str(assembly_name)] = fasta.count(1000) 125 | df.at["Num. contigs >= 5000", str(assembly_name)] = fasta.count(5000) 126 | df.at["Num. contigs >= 10000", str(assembly_name)] = fasta.count(10000) 127 | df.at["Num. contigs >= 50000", str(assembly_name)] = fasta.count(50000) 128 | df.at["Num. 
contigs >= 100000", str(assembly_name)] = fasta.count(100000) 129 | 130 | # removing intermediate index file 131 | os.remove(assembly + ".fxi") 132 | 133 | 134 | if args.output_tsv: 135 | # transposing if wanted: 136 | if args.transpose_output_tsv: 137 | df = df.T 138 | df.to_csv(args.output_tsv, sep="\t", index=False, na_rep = "NA") 139 | 140 | else: 141 | df.to_csv(args.output_tsv, sep="\t", header=False, na_rep = "NA") 142 | 143 | else: 144 | print("") 145 | print(df.to_string(header=False)) 146 | print("") 147 | -------------------------------------------------------------------------------- /bit/bit-summarize-column: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import pandas as pd 4 | import numpy as np 5 | import sys 6 | import argparse 7 | import os 8 | 9 | 10 | parser = argparse.ArgumentParser(description = 'This script outputs general summary stats for a numeric column. It can take stdin or \ 11 | a file as input. It will run on the first (or only column) if not specified. Otherwise \ 12 | you can indicate which column to summarize by column position or name. For version info, \ 13 | run `bit-version`.') 14 | 15 | required = parser.add_argument_group('required arguments') 16 | 17 | required.add_argument("-i", "--input-file", metavar = "", type = argparse.FileType('r'), default = '-', help = "Input file or stdin if none provided") 18 | 19 | parser.add_argument("-c", "--column", metavar = "", help = "Specify the target column to summarize. Can be a number specifying the column index (1-based, like unix cut/awk), \ 20 | or can be a column name if also including the `--header` flag. (default: 1)", action = "store", default = 1) 21 | 22 | parser.add_argument("-d", "--delimiter", metavar = "", help = "Specify the delimiter (default = '\\t')", action = "store", default = "\t") 23 | 24 | parser.add_argument("--header", help = "Add this flag if your input has a header with column names", action = "store_true") 25 | 26 | args = parser.parse_args() 27 | 28 | ## help menu access ## 29 | # this handles if no standard in was provided and no -i input file was provided 30 | if sys.stdin.isatty(): 31 | 32 | if args.input_file.name == "": 33 | parser.print_help(sys.stderr) 34 | sys.exit(0) 35 | 36 | ## reading in input 37 | if args.header: 38 | input_header = 0 39 | else: 40 | input_header = None 41 | 42 | input_df = pd.read_csv(args.input_file, sep = args.delimiter, header = input_header) 43 | 44 | ## getting target column 45 | try: 46 | args.column = int(args.column) 47 | except: 48 | pass 49 | 50 | if isinstance(args.column, int): 51 | column = args.column - 1 52 | target_array = input_df.iloc[: , column].to_numpy() 53 | 54 | # removing first entry if it is a string, and therefore likely a header 55 | if isinstance(target_array.flat[0], str): 56 | 57 | target_array = np.delete(target_array, 0) 58 | 59 | # and converting all to integers 60 | target_array = target_array.astype(int) 61 | 62 | elif isinstance(args.column, str): 63 | 64 | # checking header was set to true if the user specified a column by name 65 | if not args.header: 66 | print("\n If trying to specify which column by header name, you need to explicity add the `--header` flag also.\n") 67 | parser.print_help(sys.stderr) 68 | sys.exit(1) 69 | 70 | target_array = input_df[args.column].to_numpy() 71 | 72 | # getting wanted values 73 | input_n = target_array.size 74 | input_min = np.round(np.min(target_array), decimals = 2) 75 | input_max = 
np.round(np.max(target_array), decimals = 2) 76 | input_mean = np.round(np.mean(target_array), decimals = 2) 77 | input_median = np.round(np.median(target_array), decimals = 2) 78 | input_stdev = np.round(np.std(target_array), decimals = 2) 79 | percentile_1 = np.round(np.percentile(target_array, 1), decimals = 2) 80 | percentile_5 = np.round(np.percentile(target_array, 5), decimals = 2) 81 | percentile_10 = np.round(np.percentile(target_array, 10), decimals = 2) 82 | percentile_25 = np.round(np.percentile(target_array, 25), decimals = 2) 83 | percentile_50 = np.round(np.percentile(target_array, 50), decimals = 2) 84 | percentile_75 = np.round(np.percentile(target_array, 75), decimals = 2) 85 | percentile_90 = np.round(np.percentile(target_array, 90), decimals = 2) 86 | percentile_95 = np.round(np.percentile(target_array, 95), decimals = 2) 87 | percentile_99 = np.round(np.percentile(target_array, 99), decimals = 2) 88 | 89 | 90 | # reporting 91 | print(f"\n Column '{args.column}' summary\n") 92 | 93 | print(f" {'N:':<15} {input_n}") 94 | print(f" {'Min:':<15} {input_min}") 95 | print(f" {'Max:':<15} {input_max}") 96 | print(f" {'Mean:':<15} {input_mean}") 97 | print(f" {'Median:':<15} {input_median}") 98 | print(f" {'StDev:':<15} {input_stdev}\n") 99 | print(f" Percentiles:\n") 100 | print(f" {'1st:':<11} {percentile_1}") 101 | print(f" {'5th:':<11} {percentile_5}") 102 | print(f" {'10th:':<11} {percentile_10}") 103 | print(f" {'25th:':<11} {percentile_25}") 104 | print(f" {'50th:':<11} {percentile_50}") 105 | print(f" {'75th:':<11} {percentile_75}") 106 | print(f" {'90th:':<11} {percentile_90}") 107 | print(f" {'95th:':<11} {percentile_95}") 108 | print(f" {'99th:':<11} {percentile_99}") 109 | print("") 110 | -------------------------------------------------------------------------------- /bit/bit-update-go-dbs: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | if [ ! -z $1 ]; then 6 | printf "\n This script updates the GO obo files \"go-basic.obo\" \n" 7 | printf " and \"goslim_metagenomics.obo\" from the GO reference site\n" 8 | printf " (http://geneontology.org/docs/download-ontology/) to support\n" 9 | printf " programs like \`bit-summarize-go-annotations\`. 
For version info,\n" 10 | printf " run \`bit-version\`.\n\n" 11 | printf " Usage:\n\t bit-update-go-dbs\n\n" 12 | exit 13 | fi 14 | 15 | # setting colors to use 16 | GREEN='\033[0;32m' 17 | NC='\033[0m' 18 | 19 | printf "\n" 20 | 21 | curl -L --retry 10 -o ${GO_DB_DIR}/go-basic.obo http://purl.obolibrary.org/obo/go/go-basic.obo 22 | curl -L --retry 10 -o ${GO_DB_DIR}/goslim_metagenomics.obo http://current.geneontology.org/ontology/subsets/goslim_metagenomics.obo 23 | 24 | printf "\n\t\t${GREEN}The GO basic and metagenomics slim obo files have been updated!${NC}\n\n" 25 | -------------------------------------------------------------------------------- /bit/bit-update-ncbi-taxonomy: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | if [ $1 ]; then 6 | printf "\n This script updates the NCBI taxonomy database taxonkit uses.\n" 7 | printf " For version info, run \`bit-version\`.\n\n" 8 | printf " Usage:\n\t bit-update-ncbi-taxonomy\n\n" 9 | exit 10 | fi 11 | 12 | # setting colors to use 13 | GREEN='\033[0;32m' 14 | RED='\033[0;31m' 15 | NC='\033[0m' 16 | 17 | printf "\n" 18 | 19 | curl --retry 10 -o ${TAXONKIT_DB}/taxdump.tar.gz ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz 20 | 21 | tar -xzf ${TAXONKIT_DB}/taxdump.tar.gz -C ${TAXONKIT_DB} 22 | 23 | rm ${TAXONKIT_DB}/taxdump.tar.gz 24 | 25 | printf "\n\t\t${GREEN}The NCBI taxonomy database info has been updated!${NC}\n\n" 26 | -------------------------------------------------------------------------------- /bit/bit-version: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | version='1.10.9' 4 | 5 | GREEN='\033[0;32m' 6 | NC='\033[0m' 7 | 8 | printf "\n\t\tBioinformatics Tools ${GREEN}v${version}${NC}\n" 9 | printf "\t\tgithub.com/AstrobioMike/bit\n\n" 10 | 11 | printf " If you happen to find this toolset useful in your work, please be sure to\n" 12 | printf " cite it :)\n\n" 13 | 14 | printf " Lee M. bit: a multipurpose collection of bioinformatics tools. F1000Research 2022, 11:122\n" 15 | printf " https://doi.org/10.12688/f1000research.79530.1\n\n" 16 | 17 | today=$(date +'%A') 18 | 19 | printf " ${GREEN}Happy $today :)${NC}\n\n" 20 | -------------------------------------------------------------------------------- /bit/helper-bit-check-or-setup-GTDB-files.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | This is a helper program of bit, that I initially wrote for GToTree (https://github.com/AstrobioMike/GToTree/wiki). 5 | It is for setting up reference files for the glorious Genome Taxonomy Database (gtdb.ecogenomic.org/). 6 | 7 | For examples, please visit the GToTree wiki here: https://github.com/AstrobioMike/GToTree/wiki/example-usage 8 | """ 9 | 10 | import sys 11 | import os 12 | import urllib.request 13 | import pandas as pd 14 | import textwrap 15 | import argparse 16 | 17 | parser = argparse.ArgumentParser(description = "This is a helper program to facilitate setting up the reference files for the \ 18 | glorious Genome Taxonomy Database (gtdb.ecogenomic.org). 
It's really meant for internal \ 19 | use only by other bit programs.") 20 | 21 | args = parser.parse_args() 22 | 23 | ################################################################################ 24 | 25 | def main(): 26 | 27 | ## checking env variable is set and writable 28 | check_location_var_is_set_and_writable("GTDB_DIR") 29 | 30 | ## setting up ref GTDB files if needed 31 | check_and_or_get_gtdb_files(os.environ["GTDB_DIR"]) 32 | 33 | ################################################################################ 34 | 35 | 36 | # setting some colors 37 | tty_colors = { 38 | 'green' : '\033[0;32m%s\033[0m', 39 | 'yellow' : '\033[0;33m%s\033[0m', 40 | 'red' : '\033[0;31m%s\033[0m' 41 | } 42 | 43 | 44 | ### functions ### 45 | def color_text(text, color='green'): 46 | if sys.stdout.isatty(): 47 | return tty_colors[color] % text 48 | else: 49 | return text 50 | 51 | 52 | def wprint(text): 53 | print(textwrap.fill(text, width=80, initial_indent=" ", 54 | subsequent_indent=" ", break_on_hyphens=False)) 55 | 56 | 57 | def check_location_var_is_set_and_writable(variable): 58 | 59 | # making sure there is an env variable 60 | try: 61 | path = os.environ[variable] 62 | 63 | if path == "": 64 | raise 65 | 66 | except: 67 | print() 68 | wprint(color_text("The environment variable '" + str(variable) + "' does not seem to be set :(", "red")) 69 | print() 70 | wprint("Try to set it with `bit-data-locations set`, then try again.") 71 | print("\nExiting for now.\n") 72 | sys.exit(1) 73 | 74 | # making sure path is writable for the user 75 | path_writable = os.access(path, os.W_OK) 76 | 77 | if not path_writable: 78 | print() 79 | wprint(color_text("The environment variable '" + str(variable) + "' does not seem to be writable :(", "red")) 80 | print() 81 | wprint("Try to set it somewhere else with `bit-data-locations set`, then try again.") 82 | print("\nExiting for now.\n") 83 | sys.exit(1) 84 | 85 | return() 86 | 87 | 88 | def gen_gtdb_tab(location): 89 | """ downloads and parses the GTDB info tables """ 90 | 91 | # getting archaea 92 | arc_tar_gz = urllib.request.urlopen("https://data.gtdb.ecogenomic.org/releases/latest/ar53_metadata.tar.gz") 93 | arc_tab = pd.read_csv(arc_tar_gz, sep="\t", compression="gzip", on_bad_lines = 'skip', header=0, low_memory=False) 94 | arc_tab.rename(columns={arc_tab.columns[0]:"accession"}, inplace=True) 95 | arc_tab.dropna(inplace=True, how="all") 96 | 97 | # getting bacteria 98 | bac_tar_gz = urllib.request.urlopen("https://data.gtdb.ecogenomic.org/releases/latest/bac120_metadata.tar.gz") 99 | bac_tab = pd.read_csv(bac_tar_gz, sep="\t", compression="gzip", on_bad_lines = 'skip', header=0, low_memory=False) 100 | bac_tab.rename(columns={bac_tab.columns[0]:"accession"}, inplace=True) 101 | bac_tab.dropna(inplace=True, how="all") 102 | 103 | # combining 104 | gtdb_tab = pd.concat([arc_tab, bac_tab]) 105 | 106 | # splitting gtdb taxonomy column into 7 and dropping the single column 107 | domain, phylum, rclass, order, family, genus, species = [], [], [], [], [], [], [] 108 | 109 | for index, row in gtdb_tab.iterrows(): 110 | curr_acc = row["accession"] 111 | tax_list = row["gtdb_taxonomy"].split(";") 112 | 113 | if len(tax_list) != 7: 114 | wprint(color_text("GTDB entry " + curr_acc + " doesn't seem to have 7-column lineage info. 
Something is likely wrong :(", "yellow")) 115 | print("") 116 | wprint("If this continues to happen, please file an issue at github.com/AstrobioMike/bit/issues") 117 | print("") 118 | wprint("Aborting for now.") 119 | print("") 120 | sys.exit(0) 121 | 122 | else: 123 | domain.append(tax_list[0][3:]) 124 | phylum.append(tax_list[1][3:]) 125 | rclass.append(tax_list[2][3:]) 126 | order.append(tax_list[3][3:]) 127 | family.append(tax_list[4][3:]) 128 | genus.append(tax_list[5][3:]) 129 | species.append(tax_list[6][3:]) 130 | 131 | gtdb_tab.insert(1, "species", species) 132 | gtdb_tab.insert(1, "genus", genus) 133 | gtdb_tab.insert(1, "family", family) 134 | gtdb_tab.insert(1, "order", order) 135 | gtdb_tab.insert(1, "class", rclass) 136 | gtdb_tab.insert(1, "phylum", phylum) 137 | gtdb_tab.insert(1, "domain", domain) 138 | 139 | # writing out 140 | gtdb_tab.to_csv(location + "GTDB-arc-and-bac-metadata.tsv", index=False, sep="\t") 141 | 142 | gtdb_version_info = urllib.request.urlretrieve("https://data.gtdb.ecogenomic.org/releases/latest/VERSION", location + "GTDB-version-info.txt") 143 | 144 | 145 | def check_and_or_get_gtdb_files(GTDB_DIR): 146 | """ checks for and sets up ref GTDB files if needed """ 147 | 148 | if os.path.exists(GTDB_DIR + "GTDB-arc-and-bac-metadata.tsv") and os.path.exists(GTDB_DIR + "GTDB-version-info.txt"): 149 | 150 | sys.exit(0) 151 | 152 | # generating when table doesn't exist yet 153 | else: 154 | print("") 155 | wprint(color_text("Downloading and parsing archaeal and bacterial metadata tables from GTDB (only needs to be done once, or when a new version is available)...", "yellow")) 156 | print("") 157 | 158 | gen_gtdb_tab(GTDB_DIR) 159 | 160 | 161 | if __name__ == "__main__": 162 | main() 163 | -------------------------------------------------------------------------------- /bit/helper-bit-combine-bracken.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import argparse 5 | import sys 6 | 7 | parser = argparse.ArgumentParser(description = 'This script is for combining bracken output tables. It was modified\ 8 | from the `combine_bracken_outputs.py` script provided by Jennifer\ 9 | Lu (jlu26@jhmi.edu) that comes with bracken for use with the\ 10 | `bit-combine-bracken-and-add-lineage` script. 
For version info,\ 11 | run `bit-version`.') 12 | 13 | 14 | required = parser.add_argument_group('required arguments') 15 | 16 | required.add_argument("-i", "--input-files", metavar = "", nargs = "+", type = str, help = "space-delimited list of bracken output files", action = "store", required = True) 17 | parser.add_argument("-n", "--sample-names", metavar = "", help = 'Sample names provided as a comma-delimited list (by default will use basename of input files)', action = "store", default = '') 18 | parser.add_argument("-o", "--output-file", metavar = "", help='Output file of combined tables (default: "combined-bracken.tsv")', action = "store", default = "combined-bracken.tsv") 19 | 20 | if len(sys.argv)==1: 21 | parser.print_help(sys.stderr) 22 | sys.exit(0) 23 | 24 | args = parser.parse_args() 25 | 26 | # setting up variables 27 | sample_counts = {} 28 | total_counts = {} 29 | all_samples = [] 30 | 31 | # setting sample names and intializing counts 32 | if len(args.sample_names) == 0: 33 | for file in args.input_files: 34 | curr_sample = os.path.basename(file) 35 | total_counts[curr_sample] = 0 36 | all_samples.append(curr_sample) 37 | 38 | else: 39 | for curr_sample in args.sample_names.split(","): 40 | total_counts[curr_sample] = 0 41 | all_samples.append(curr_sample) 42 | 43 | 44 | # working on each file 45 | # initialize level variable 46 | level = '' 47 | # initializiing iterator for grabbing sample names 48 | i = 0 49 | 50 | for file in args.input_files: 51 | 52 | # storing current sample name 53 | curr_name = all_samples[i] 54 | 55 | # incrementing iterator 56 | i += 1 57 | 58 | with open(file) as f: 59 | # skipping header 60 | next(f) 61 | for line in f: 62 | [name, taxid, taxlvl, kreads, areads, estreads, frac] = line.strip().split("\t") 63 | estreads = int(estreads) 64 | 65 | # error checks 66 | if name not in sample_counts: 67 | sample_counts[name] = {} 68 | sample_counts[name][taxid] = {} 69 | elif taxid != list(sample_counts[name].keys())[0]: 70 | sys.exit("Taxonomy IDs not matching for species %s: (%s\t%s)" % (name, taxid, list(sample_counts[name].keys())[0])) 71 | if len(level) == 0: 72 | level = taxlvl 73 | elif level != taxlvl: 74 | sys.exit("Taxonomy level not matching between samples :(") 75 | 76 | # summing counts for current sample 77 | total_counts[curr_name] += estreads 78 | # adding read counts for that taxa for this sample to the dict holding all samples 79 | sample_counts[name][taxid][curr_name] = estreads 80 | 81 | 82 | # opening output file 83 | output_file = open(args.output_file, "w") 84 | 85 | # writing header 86 | output_file.write("name\ttax_id\ttax_level") 87 | for name in all_samples: 88 | output_file.write("\t%s_num\t%s_frac" % (name, name)) 89 | output_file.write("\n") 90 | 91 | # writing out each sample 92 | for name in sample_counts: 93 | taxid = list(sample_counts[name].keys())[0] 94 | output_file.write("%s\t%s\t%s" % (name, taxid, level)) # seems like "level" variable is trusting the last thing was the same for all as it was for the last file's last line, probably true, but then not sure why we check. 
might return to this 95 | 96 | #Calculate and print information per sample 97 | for sample in all_samples: 98 | if sample in sample_counts[name][taxid]: 99 | num = sample_counts[name][taxid][sample] 100 | perc = float(num)/float(total_counts[sample]) 101 | output_file.write("\t%i\t%0.5f" % (num, perc)) 102 | 103 | # if sample doesn't have counts for this taxa, adding zeroes 104 | else: 105 | output_file.write("\t0\t0.00000") 106 | 107 | output_file.write("\n") 108 | 109 | output_file.close() 110 | -------------------------------------------------------------------------------- /bit/helper-bit-dl-ncbi-assemblies-parallel.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ### Helper script used with `bit-dl-ncbi-assemblies` when run in parallel; for version info, run or see `bit-version` ### 4 | 5 | # setting colors to use 6 | GREEN='\033[0;32m' 7 | RED='\033[0;31m' 8 | NC='\033[0m' 9 | 10 | my_ext=$2 11 | ext=$3 12 | format=$4 13 | http_flag=$5 14 | 15 | assembly=$(echo "$1" | cut -f 1) 16 | downloaded_accession=$(echo "$1" | cut -f 2) 17 | 18 | # storing and building links 19 | base_link=$(echo "$1" | cut -f 9) 20 | 21 | 22 | # checking link was actually present (sometimes, very rarely, it is not there) 23 | # if not there, attempting to build ourselves 24 | if [ $base_link == "na" ] || [ -z $base_link ]; then 25 | 26 | if [ $http_flag == "false" ]; then 27 | p1=$(printf "ftp://ftp.ncbi.nlm.nih.gov/genomes/all") 28 | else 29 | p1=$(printf "https://ftp.ncbi.nlm.nih.gov/genomes/all") 30 | fi 31 | 32 | # checking if GCF or GCA 33 | if [[ $assembly == "GCF"* ]]; then 34 | p2="GCF" 35 | else 36 | p2="GCA" 37 | fi 38 | 39 | p3=$(echo $assembly | cut -f 2 -d "_" | cut -c 1-3) 40 | p4=$(echo $assembly | cut -f 2 -d "_" | cut -c 4-6) 41 | p5=$(echo $assembly | cut -f 2 -d "_" | cut -c 7-9) 42 | 43 | ass_name=$(echo "$1" | cut -f 3) 44 | end_path=$(paste -d "_" <(echo "$assembly") <(echo "$ass_name")) 45 | 46 | base_link=$(paste -d "/" <(echo "$p1") <(echo "$p2") <(echo "$p3") <(echo "$p4") <(echo "$p5") <(echo "$end_path")) 47 | 48 | else 49 | 50 | end_path=$(basename $base_link) 51 | 52 | fi 53 | 54 | curl --silent --retry 10 -o ${assembly}${my_ext} "${base_link}/${end_path}${ext}" 55 | 56 | # grabbing this to check if " XML " is in there 57 | # when this was first written, trying to download a link that wasn't there would fail 58 | # now it can download an xml-formated file saying the link wasn't found at NCBI 59 | # so this let's us check for that, and report and remove it if that's the case 60 | file_command_output=$(file ${assembly}${my_ext}) 61 | 62 | if [ -s ${assembly}${my_ext} ] && [[ ${file_command_output} != *" XML "* ]] && [[ ${file_command_output} != *" XHTML "* ]]; then 63 | 64 | printf "\r\t Successfully downloaded: $assembly" 65 | 66 | else 67 | 68 | printf "\n ${RED}******************************* ${NC}NOTICE ${RED}*******************************${NC} \n" 69 | printf "\t $assembly's $format file didn't download successfully.\n" 70 | printf "\t That file type may not exist for this accession.\n\n" 71 | printf "\t Written to \"NCBI-accessions-not-downloaded.txt\".\n" 72 | printf " ${RED}********************************************************************** ${NC}\n\n" 73 | 74 | echo ${assembly} >> NCBI-accessions-not-downloaded.txt 75 | 76 | rm -rf ${assembly}${my_ext} 77 | 78 | fi -------------------------------------------------------------------------------- /bit/helper-bit-get-ncbi-assembly-tables: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | This is a helper program of bit taken from my GToTree package (https://github.com/AstrobioMike/GToTree/wiki) 5 | to download the NCBI assembly summary tables if they are not present, or are more than 4 weeks old. 6 | """ 7 | 8 | import sys 9 | import os 10 | import urllib.request 11 | import argparse 12 | import shutil 13 | import textwrap 14 | from datetime import date, timedelta 15 | import filecmp 16 | import tarfile 17 | import gzip 18 | 19 | parser = argparse.ArgumentParser(description="This is a helper program to download and setup the NCBI assembly summary tables if they are \ 20 | not present, or are older than 4 weeks.", \ 21 | epilog="Ex. usage: helper-bit-get-ncbi-assembly-tables\n") 22 | 23 | parser.add_argument("-P", "--use-http", help='Use http instead of ftp', action = "store_true") 24 | parser.add_argument("-f", "--force-update", help='Force an update regardless of last date retrieved', action = "store_true") 25 | 26 | 27 | args = parser.parse_args() 28 | 29 | 30 | ################################################################################ 31 | 32 | def main(): 33 | 34 | NCBI_assembly_data_dir = check_location_var_is_set() 35 | 36 | data_present = check_if_data_present_and_less_than_4_weeks_old(NCBI_assembly_data_dir) 37 | 38 | if data_present and not args.force_update: 39 | exit() 40 | 41 | else: 42 | 43 | get_NCBI_assembly_summary_data(NCBI_assembly_data_dir) 44 | 45 | ################################################################################ 46 | 47 | 48 | # setting some colors 49 | tty_colors = { 50 | 'green' : '\033[0;32m%s\033[0m', 51 | 'yellow' : '\033[0;33m%s\033[0m', 52 | 'red' : '\033[0;31m%s\033[0m' 53 | } 54 | 55 | 56 | ### functions ### 57 | def color_text(text, color='green'): 58 | if sys.stdout.isatty(): 59 | return tty_colors[color] % text 60 | else: 61 | return text 62 | 63 | 64 | def wprint(text): 65 | print(textwrap.fill(text, width=80, initial_indent=" ", 66 | subsequent_indent=" ", break_on_hyphens=False)) 67 | 68 | 69 | def check_location_var_is_set(): 70 | 71 | # making sure there is a KO_data_dir env variable 72 | try: 73 | NCBI_data_dir = os.environ['NCBI_assembly_data_dir'] 74 | except: 75 | wprint(color_text("The environment variable 'NCBI_assembly_data_dir' does not seem to be set :(", "yellow")) 76 | wprint("This shouldn't happen, check on things with `bit-data-locations check`.") 77 | print("") 78 | sys.exit(0) 79 | 80 | return(NCBI_data_dir) 81 | 82 | 83 | def check_if_data_present_and_less_than_4_weeks_old(location): 84 | 85 | # seeing if present already and if it was downloaded less than 4 weeks ago 86 | # if this function returns True, then we don't do anything 87 | # if it returns False, then we need to download things 88 | table_path = os.path.join(str(location), "ncbi-assembly-info.tsv") 89 | date_retrieved_path = os.path.join(str(location), "date-retrieved.txt") 90 | 91 | # if either file is missing, we are going to download, we also package the date-retrieved file empty with conda to retain directory, so checking it's not empty as well 92 | if not os.path.isfile(table_path) or not os.path.isfile(date_retrieved_path) or not os.path.getsize(date_retrieved_path) > 0: 93 | 94 | if os.path.exists(table_path): 95 | os.remove(table_path) 96 | if os.path.isdir(date_retrieved_path): 97 | shutil.rmtree(date_retrieved_path) 98 | 99 | return(False) 100 | 101 | # if both files are present (and not empty), we are 
checking if it was downloaded more than 4 weeks ago 102 | # and will download if it was 103 | if os.path.isfile(table_path) and os.path.isfile(date_retrieved_path): 104 | 105 | # getting current date 106 | curr_date = date.today() 107 | 108 | # reading date it was downloaded 109 | with open(date_retrieved_path, 'r') as file: 110 | stored_date = file.read().strip() 111 | 112 | # setting to date object 113 | stored_date_list = stored_date.split(",") 114 | stored_date = date(int(stored_date_list[0]), int(stored_date_list[1]), int(stored_date_list[2])) 115 | 116 | # getting difference 117 | diff = curr_date - stored_date 118 | 119 | # checking if difference is greater than 28 days 120 | if diff.days > 28: 121 | 122 | return(False) 123 | 124 | else: 125 | 126 | return(True) 127 | 128 | else: 129 | 130 | return(True) 131 | 132 | 133 | def get_NCBI_assembly_summary_data(location): 134 | 135 | """ downloads the needed ncbi assembly summary tables and combines them """ 136 | 137 | # setting links 138 | if args.use_http: 139 | 140 | genbank_link = "https://ftp.ncbi.nlm.nih.gov/genomes/genbank/assembly_summary_genbank.txt" 141 | refseq_link = "https://ftp.ncbi.nlm.nih.gov/genomes/refseq/assembly_summary_refseq.txt" 142 | 143 | else: 144 | 145 | genbank_link = "ftp://ftp.ncbi.nlm.nih.gov/genomes/genbank/assembly_summary_genbank.txt" 146 | refseq_link = "ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/assembly_summary_refseq.txt" 147 | 148 | table_path = os.path.join(str(location), "ncbi-assembly-info.tsv") 149 | refseq_temp_path = os.path.join(str(location), "refseq-assembly-info.tmp") 150 | 151 | print(color_text(" Downloading NCBI assembly summaries (only done once, or updated after 4 weeks)...\n", "yellow")) 152 | 153 | urllib.request.urlretrieve(genbank_link, table_path) 154 | urllib.request.urlretrieve(refseq_link, refseq_temp_path) 155 | 156 | # combining 157 | with open (table_path, "a") as final_table: 158 | with open(refseq_temp_path, "r") as refseq: 159 | final_table.write(refseq.read()) 160 | 161 | # removing temp 162 | if os.path.exists(refseq_temp_path): 163 | os.remove(refseq_temp_path) 164 | 165 | # storing date retrieved 166 | date_retrieved = str(date.today()).replace("-", ",") 167 | date_retrieved.replace("-", ",") 168 | 169 | date_retrieved_path = os.path.join(str(location), "date-retrieved.txt") 170 | 171 | with open(date_retrieved_path, "w") as outfile: 172 | outfile.write(date_retrieved + "\n") 173 | 174 | ################################################################################ 175 | 176 | if __name__ == "__main__": 177 | main() -------------------------------------------------------------------------------- /bit/helper-bit-get-ncbi-tax-data: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | This is a helper program of mine taken from GToTree (https://github.com/AstrobioMike/GToTree/wiki) 5 | to download NCBI tax data for using TaxonKit (https://bioinf.shenwei.me/taxonkit/) with 6 | bit-get-lineage-from-taxids. 7 | """ 8 | 9 | import sys 10 | import os 11 | import urllib.request 12 | import argparse 13 | import shutil 14 | import textwrap 15 | import filecmp 16 | import tarfile 17 | import gzip 18 | 19 | parser = argparse.ArgumentParser(description = "This is a helper program to setup NCBI tax data for programs that use TaxonKit (bioinf.shenwei.me/taxonkit/) \ 20 | to retrieve taxonomy info.", \ 21 | epilog = "Ex. 
usage: helper-bit-get-ncbi-tax-data\n") 22 | 23 | args = parser.parse_args() 24 | 25 | 26 | ################################################################################ 27 | 28 | def main(): 29 | 30 | NCBI_data_dir = check_location_var_is_set() 31 | 32 | data_present = check_if_data_present(NCBI_data_dir) 33 | 34 | if data_present: 35 | exit() 36 | 37 | else: 38 | 39 | print("") 40 | print(color_text(" Downloading required NCBI taxonomy data (only needs to be done once)...\n", "yellow")) 41 | get_NCBI_tax_data(NCBI_data_dir) 42 | 43 | 44 | ################################################################################ 45 | 46 | 47 | # setting some colors 48 | tty_colors = { 49 | 'green' : '\033[0;32m%s\033[0m', 50 | 'yellow' : '\033[0;33m%s\033[0m', 51 | 'red' : '\033[0;31m%s\033[0m' 52 | } 53 | 54 | 55 | ### functions ### 56 | def color_text(text, color='green'): 57 | if sys.stdout.isatty(): 58 | return tty_colors[color] % text 59 | else: 60 | return text 61 | 62 | 63 | def wprint(text): 64 | print(textwrap.fill(text, width=80, initial_indent=" ", 65 | subsequent_indent=" ", break_on_hyphens=False)) 66 | 67 | 68 | def check_location_var_is_set(): 69 | 70 | # making sure there is a KO_data_dir env variable 71 | try: 72 | NCBI_data_dir = os.environ['TAXONKIT_DB'] 73 | except: 74 | wprint(color_text("The environment variable 'TAXONKIT_DB' does not seem to be set :(", "yellow")) 75 | wprint("This should have been handled automatically if things were installed with conda.") 76 | wprint("If you can't sort this out, please feel free to post an issue here:") 77 | print(" github.com/AstrobioMike/bit/issues\n\n") 78 | sys.exit(0) 79 | 80 | return(NCBI_data_dir) 81 | 82 | 83 | def check_if_data_present(location): 84 | 85 | # seeing if present already 86 | # if this function returns True, then data is present 87 | # if it returns False, then we need to download things 88 | names_path = os.path.join(str(location) + "/names.dmp") 89 | nodes_path = os.path.join(str(location) + "/nodes.dmp") 90 | 91 | 92 | if not os.path.isfile(names_path) or not os.path.isfile(nodes_path): 93 | 94 | if os.path.exists(names_path): 95 | os.remove(names_path) 96 | if os.path.isdir(nodes_path): 97 | shutil.rmtree(nodes_path) 98 | 99 | return(False) 100 | 101 | else: 102 | 103 | return(True) 104 | 105 | 106 | def get_NCBI_tax_data(location): 107 | """ downloads the needed ncbi tax data """ 108 | 109 | taxdump_path = os.path.join(str(location) + "taxdump.tar.gz") 110 | 111 | urllib.request.urlretrieve("http://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz", taxdump_path) 112 | 113 | # unpacking 114 | with tarfile.open(taxdump_path) as tarball: 115 | tarball.extractall(location) 116 | 117 | # removing tarball 118 | os.remove(taxdump_path) 119 | 120 | 121 | ################################################################################ 122 | 123 | if __name__ == "__main__": 124 | main() 125 | -------------------------------------------------------------------------------- /bit/helper-bit-parse-assembly-summary-file.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import argparse 5 | import os 6 | 7 | parser = argparse.ArgumentParser(description = 'This script is for parsing NCBI\'s assembly summary file down\ 8 | to the provided accessions. It is used by the `bit-dl-ncbi-assemblies`\ 9 | script. 
For version info, run `bit-version`.') 10 | 11 | required = parser.add_argument_group('required arguments') 12 | 13 | required.add_argument("-a", "--assembly-summary", metavar = "", help = "NCBI's assembly summary file", action = "store", dest = "all_assemblies", required = True) 14 | required.add_argument("-w", "--wanted-accessions", metavar = "", help = "Single-column file with wanted accessions", action = "store", dest = "wanted_accs", required = True) 15 | parser.add_argument("-o", "--output-file", help = 'Output file of wanted summary info only (default: "wanted.tsv")', action = "store", default = "wanted.tsv") 16 | 17 | if len(sys.argv)==1: 18 | parser.print_help(sys.stderr) 19 | sys.exit(0) 20 | 21 | args = parser.parse_args() 22 | 23 | wanted_dict = {} 24 | 25 | with open(args.wanted_accs, "r") as wanted_accs: 26 | 27 | for line in wanted_accs: 28 | root_acc = line.strip().split(".")[0] 29 | wanted_dict[str(root_acc)] = line.strip() 30 | 31 | out_file = open(args.output_file, "w") 32 | 33 | with open(args.all_assemblies) as assemblies: 34 | 35 | for line in assemblies: 36 | line = line.split("\t") 37 | 38 | if line[0].split(".")[0] in wanted_dict: 39 | dl_acc = str(line[0]) 40 | 41 | if not dl_acc: 42 | dl_acc = "NA" 43 | 44 | ass_name = str(line[15]) 45 | if not ass_name: 46 | ass_name = "NA" 47 | 48 | taxid = str(line[5]) 49 | if not taxid: 50 | taxid = "NA" 51 | 52 | org_name = str(line[7]) 53 | if not org_name: 54 | org_name = "NA" 55 | 56 | infra_name = str(line[8]) 57 | if not infra_name: 58 | infra_name = "NA" 59 | 60 | version_status = str(line[10]) 61 | if not version_status: 62 | version_status = "NA" 63 | 64 | ass_level = str(line[11]) 65 | if not ass_level: 66 | ass_level = "NA" 67 | 68 | ftp_path = str(line[19]) 69 | if not ftp_path: 70 | ftp_path = "NA" 71 | 72 | out_file.write(str(wanted_dict[str(line[0].split(".")[0])]) + "\t" + str(dl_acc) + "\t" + str(ass_name) + "\t" + str(taxid) + "\t" + str(org_name) + "\t" + str(infra_name) + "\t" + str(version_status) + "\t" + str(ass_level) + "\t" + str(ftp_path) + "\n") 73 | -------------------------------------------------------------------------------- /bit/helper-bit-setup-GO-dbs: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | ## This is a helper script to setup the GO databases if they aren't already present 5 | 6 | YELLOW='\033[0;33m' 7 | NC='\033[0m' 8 | 9 | # checking env var is set 10 | 11 | if [ -z ${GO_DB_DIR} ]; then 12 | 13 | printf "${YELLOW} The environment variable 'GO_DB_DIR' does not seem to be set :(${NC}\n" 14 | printf " This should have been handled automatically if things were installed with conda.\n" 15 | printf " If you can't sort this out, please feel free to post an issue here:" 16 | printf " github.com/AstrobioMike/bit/issues\n\n" 17 | 18 | exit 19 | 20 | fi 21 | 22 | if [ ! -s ${GO_DB_DIR}/go-basic.obo ] || [ ! 
-s ${GO_DB_DIR}/goslim_metagenomics.obo ]; then 23 | 24 | rm -rf ${GO_DB_DIR}/go-basic.obo ${GO_DB_DIR}/goslim_metagenomics.obo ${GO_DB_DIR}/conda-placeholder 25 | 26 | printf "\n ${YELLOW}Downloading required GO data (only needs to be done once)...${NC}\n" 27 | 28 | curl -L --silent --retry 10 -o ${GO_DB_DIR}/go-basic.obo http://purl.obolibrary.org/obo/go/go-basic.obo 29 | curl -L --silent --retry 10 -o ${GO_DB_DIR}/goslim_metagenomics.obo http://current.geneontology.org/ontology/subsets/goslim_metagenomics.obo 30 | 31 | fi 32 | -------------------------------------------------------------------------------- /bit/helper-bit-update-tax-table-for-seqscreen-go-tax-summary.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ### Helper script for `bit-cov-summarize-go-annots-with-domains`; for version info, run or see `bit-version` ### 4 | ### generates a table grouping all taxids from Euks, Bacteria, Archaea, and viruses 5 | 6 | # getting domain info for all taxids in ncbi files and storing in taxonkit data dir 7 | cut -f 1 ${TAXONKIT_DB}/nodes.dmp | taxonkit lineage | taxonkit reformat -r NA | cut -f 1,3 | tr ";" "\t" | cut -f 1,2 | grep -v "NA" > ${TAXONKIT_DB}/taxids-and-domains.tsv 8 | -------------------------------------------------------------------------------- /images/bit-cov-analyzer.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AstrobioMike/bit/c050fc3a649225c6f7a9a39e6e28f68831243b8f/images/bit-cov-analyzer.pdf -------------------------------------------------------------------------------- /images/bit-cov-analyzer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AstrobioMike/bit/c050fc3a649225c6f7a9a39e6e28f68831243b8f/images/bit-cov-analyzer.png -------------------------------------------------------------------------------- /images/bit-metagenomics-overview.afdesign: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AstrobioMike/bit/c050fc3a649225c6f7a9a39e6e28f68831243b8f/images/bit-metagenomics-overview.afdesign -------------------------------------------------------------------------------- /images/bit-metagenomics-overview.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AstrobioMike/bit/c050fc3a649225c6f7a9a39e6e28f68831243b8f/images/bit-metagenomics-overview.pdf -------------------------------------------------------------------------------- /images/bit-metagenomics-overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AstrobioMike/bit/c050fc3a649225c6f7a9a39e6e28f68831243b8f/images/bit-metagenomics-overview.png -------------------------------------------------------------------------------- /test-data/ez-screen-assembly.fasta: -------------------------------------------------------------------------------- 1 | >partial-NC_003131.1 2 | CCTTAATTTCCGCGAGGATGACCTCGCTATTCCAGACATTCTCTGCCAGGCGCATGTCGATGTAGTCCAT 3 | AAACGGTTTCAGCTTAACCATTTTGTGGCGAGTCTTTCTGGCTGGCGGTTCAGGGTATTTGAGGTAGCGT 4 | CTGACAGTTCGTTCAGAGCAACCCACCTGAGTCGCAATATCGATAATGTACGCCCCCTGCTGGCGCATTT 5 | GCTTTATCATGTAAAAGTCCTCTCTGCTCAGCATGTTGATGTCCTTTCTGGTGTGAGAACCTCAAGGAAA 6 | CAACATGTTGGGTGGAGCGGACAATACTAATGGTGAATTACCGTCTTATATCACTGGCGCTAACACCGTG 7 | 
AAGGGCTTCATGTTAATCATAAGCGCGTGTACCGGCTTTATCACCTCAGTGGCCTGGGCGTAAAACGCAG 8 | AAGGCGTCGGAAAGGGCTGGCAACAGAACGTCTGCCGCTGCTCCGTCCGGCGGCGCCCAATCTGACCTGG 9 | TCGATGGATTTCGTCATGGACGCATTGGCCACCGGTCGCAGGATAAAGTGCCTTACCTGCGTGGACGACT 10 | ACACGAAGGAATGCCTGACGGTCACTGTTGCCTTTGGGATTTCAGGCGTGCAGGTCACGCGTATTCTGGA 11 | CAGCATTGCGCTGTTTCGCGGCTATCCGGCGACGATAAGAACTGATCAGGGCCCGGAATTTACCTGCCGC 12 | GCGCTCGATCAATGGGCCTTTGAGCATGGCGTGGAACTGCGACTTATCCAGCCCGGCAAGCCGACACAGA 13 | ACGGATTTATTGAGAGTTTTAACGGACGCTTTCGCGATGAATGCCTGAATGAGCACTGGTTCAGTGACGT 14 | CAGTCATGCCAGGAAAACCATCAGTGAATGGCGTCAGGATTATAATGAGTGCCGCCCGCACTCTACGCTG 15 | AATTATCAGACGCCGTCTGAATTTGCGGCGGCCTGGAGAAAGGGTAATTCTGATAGTGAAGGATCCGACA 16 | TTACTAAGTGAGCGTTGTATCTAATCCTGGGGGCAGGTCATTCCGTATAATAAGGCAACAACCAAAAATC 17 | TACTCAACTAAATGACCGTGGTGGTGAGATTAGTGATGAGGTTTGTAGCCGTTCAGCCCCCTGCACCAGC 18 | ATCTCAAGCTGAGTATATAGTGAGTTATTATCCAGGCTGTTCAATGGTTGTCGATTCCATAACACTGGGT 19 | GCCCCCCAACCTCGTCCCAGGATAAGATGGGTTTTAATATATCTTGACTGAATATATTATGGCTAAGTAA 20 | GGTTTCCTTTTCATCATTATTGTCAAGAGAAGGTAGGGTAAACATTAATATTTGCCCGACAGGATGCTCT 21 | GTTATATGGCAGGCGAATTCCCCAACTTTGACACCGATAACCGGTTCAATAGTATCTGGAATAGACAACG 22 | AAAGTTGTTGAAATAATTGAGTGATAGCTTGTTCAAATGAATACATTATGATCTCATAATAGTTAGATAA 23 | AATATCAACTTAACCAAAGCACTCTCGGCAGACCATCAATTTTAGCCTATAATTTTTAGTTTTTGTTTTG 24 | TCTAATATAACAACAAAAACAGCAGCGATTTTTTATATAGCCATCGGCTATTTTCCCACTAAGATAACCT 25 | TGTTTTAATAGCCAAGGTAATAAATAGTCATGAAAATATCATCATTTATTTCTACATCACTGCCCCTGCC 26 | GACATCTGTGTCAGGATCTAGCAGCGTAGGAGAAATGTCTGGGCGCTCAGTCTCACAGCAAACAAGTGAT 27 | CAATATGCAAACAATCTGGCCGGGCGCACTGAAAGCCCTCAGGGTTCCAGCTTAGCCAGCCGTATCATTG 28 | AGAGGTTATCATCAGTGGCCCACTCTGTGATTGGGTTTATCCAACGCATGTTCTCGGAGGGGAGCCATAA 29 | ACCGGTGGTGACACCAGCACCCACACCTGCACAAATGCCAAGTCCTACGTCTTTCAGTGACAGTATCAAG 30 | CAACTTGCTGCTGAGACGCTGCCAAAATACATGCAGCAGTTGAATAGCTTGGATGCAGAGATGCTGCAGA 31 | AAAATCATGATCAGTTCGCTACGGGCAGCGGCCCTCTTCGTGGCAGTATCACTCAATGCCAAGGGCTGAT 32 | GCAGTTTTGTGGTGGGGAATTGCAAGCTGAGGCCAGTGCCATCTTAAACACGCCTGTTTGTGGTATTCCC 33 | TTCTCGCAGTGGGGAACTATTGGTGGGGCGGCCAGCGCGTACGTCGCCAGTGGCGTTGATCTAACGCAGG 34 | CAGCAAATGAGATCAAAGGGCTGGCGCAACAGATGCAGAAATTACTGTCATTGATGTGATATGGATAAAA 35 | ACAAGGGGATAGTGTTTCCCCCTTTTTCTATCAATATTGCGAATATCTTCGTCCCTGATCTTTCAGGGGC 36 | GAATCGTTTTTTAGCATGCTCATTGTTAGAATTTCTGACTTATCTCTCTTCTGTATTACTACTCATGCTC 37 | TGGAAAATCCTGAACATCTATATCTATGGATTGATGCAGCACTCGAGAAATCAAAATATCATTGCTAAGC 38 | GTTATATAGTATATACCGTGCTTTTTATACTGAAAACGGCGAATATCAGAGCAAATCCAGTTACACTCAG 39 | CCCCTAACTCTGGATTTTTAGCTAATAGCTCGAATACCTTTGCCAAGTTCTCATGGTATAACTTAGCCTG 40 | AGTCACACCGAAATGCCGGATAGTATAACTGGCAATATTATAAATATCCTCATCAGCTAGTTCAGACAGT 41 | TTATACACTAGTATCTTTCACCGCAGCAGAAAAAATCTCATCCATTAAACGATGGCTCACAGGTACATTT 42 | GTTCCTGCAAGCACCATATCGCGTACATGTTGAACACGCTGTTCACGTGCTTCCATTAGCCTTAAGGCAT 43 | CACGAAGCACTTCTGATATATTGCCATAACGACCAGACTGAATCATTTCCCCCACAAAACCTGTCAAATG 44 | CTCTCCAAGTGTTACGCTGGTTACGTGAGCCATATCCCCTCCGTTATGTATTACTGAGTAATACAATTAT 45 | -------------------------------------------------------------------------------- /test-data/ez-screen-targets.fasta: -------------------------------------------------------------------------------- 1 | >yopE 2 | ATGAAAATATCATCATTTATTTCTACATCACTGCCCCTGCCGACATCTGTGTCAGGATCTAGCAGCGTAG 3 | GAGAAATGTCTGGGCGCTCAGTCTCACAGCAAACAAGTGATCAATATGCAAACAATCTGGCCGGGCGCAC 4 | TGAAAGCCCTCAGGGTTCCAGCTTAGCCAGCCGTATCATTGAGAGGTTATCATCAGTGGCCCACTCTGTG 5 | ATTGGGTTTATCCAACGCATGTTCTCGGAGGGGAGCCATAAACCGGTGGTGACACCAGCACCCACACCTG 6 | CACAAATGCCAAGTCCTACGTCTTTCAGTGACAGTATCAAGCAACTTGCTGCTGAGACGCTGCCAAAATA 7 | CATGCAGCAGTTGAATAGCTTGGATGCAGAGATGCTGCAGAAAAATCATGATCAGTTCGCTACGGGCAGC 8 | 
GGCCCTCTTCGTGGCAGTATCACTCAATGCCAAGGGCTGATGCAGTTTTGTGGTGGGGAATTGCAAGCTG 9 | AGGCCAGTGCCATCTTAAACACGCCTGTTTGTGGTATTCCCTTCTCGCAGTGGGGAACTATTGGTGGGGC 10 | GGCCAGCGCGTACGTCGCCAGTGGCGTTGATCTAACGCAGGCAGCAAATGAGATCAAAGGGCTGGCGCAA 11 | CAGATGCAGAAATTACTGTCATTGATGTGA 12 | >yopK 13 | ATGTTTATTAAAGATACTTATAACATGCGTGCTTTATGTACCGCTCTTGAACAGTCGGCTCCTGATACAA 14 | TAATAAATACATCTAAAGAAGAAAATAACAGTTACTACTGCGCTACTGCTCATTTACTGAGAACGGATGT 15 | TTGTTCATTGGTCAATAGAGTAGGGATTGAACCACTTAAAAGTGGATCAATATTATCTACTTTAGAAGAG 16 | TTATGGCAGGCTGTTGGTATAGTATATCGCTTATACGAATGGCAACATGTCAGCGATATTGACACCAATT 17 | TTAAGAAACTACCCAATAATTCTGATTTTGGTCTTGTGTTTTCTGTATTAGATTGTGATATAGAGTATGT 18 | GTTCATAGGGAAAAAAGACAGTGAAGGGAATATAGAATTTTATGATCCGAAAAACTCTCTACTTATAGAG 19 | AATGATGACATAAAAAAATATTTATATGATGAAGATTTTCATCGTTTTTGTATTATGCTGATCATCTCTA 20 | AATCTGAGTTGGAGGAATTGAGTCGCGAATCCTGCGATCAAGAATGTATTATGGGATGA 21 | -------------------------------------------------------------------------------- /test-data/kraken-example-out.tsv: -------------------------------------------------------------------------------- 1 | U A00159:145:H75T2DMXX:1:1101:7735:13792 unclassified (taxid 0) 16 0:0 2 | U A00159:145:H75T2DMXX:1:1101:11216:13557 unclassified (taxid 0) 30 0:0 3 | U A00159:145:H75T2DMXX:1:1101:22688:14074 unclassified (taxid 0) 26 0:0 4 | U A00159:145:H75T2DMXX:1:1101:1325:14559 unclassified (taxid 0) 31 0:0 5 | U A00159:145:H75T2DMXX:1:1101:23719:15013 unclassified (taxid 0) 30 0:0 6 | C A00159:145:H75T2DMXX:1:1102:11388:8312 Ochrobactrum (taxid 528) 194 0:12 1224:16 28211:7 528:15 7 | U A00159:145:H75T2DMXX:1:1102:15465:8390 unclassified (taxid 0) 27 0:0 8 | U A00159:145:H75T2DMXX:1:1102:6343:7560 unclassified (taxid 0) 271 0:237 9 | U A00159:145:H75T2DMXX:1:1102:30101:11600 unclassified (taxid 0) 26 0:0 10 | U A00159:145:H75T2DMXX:1:1101:19678:2221 unclassified (taxid 0) 279 0:245 11 | -------------------------------------------------------------------------------- /workflows/genome-summarize-wf/README.md: -------------------------------------------------------------------------------- 1 | # [bit](https://github.com/AstrobioMike/bit) genome-summarize workflow 2 | This is a [snakemake](https://snakemake.github.io/) workflow for generating and combining genome assembly stats, quality estimates, and taxonomy info. Inputs are fasta files of genome assemblies. For all workflows available with _bit_, see [here](https://github.com/AstrobioMike/bit?tab=readme-ov-file#workflows). 
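As a quick illustration of the expected inputs, a run might start from nothing more than a directory of assembly fasta files like the following (the file names here are just hypothetical; the input directory and file extension are set in the config.yaml covered below):

```bash
genomes/
├── input_genome_A.fasta
└── input_MAG_B.fasta
```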
3 | 4 | --- 5 | 6 | * [**Overview**](#overview) 7 | * [**Usage**](#usage) 8 | * [Retrieving the workflow](#retrieving-the-workflow) 9 | * [Modifying the config.yaml](#modifying-the-configyaml) 10 | * [Running the workflow](#running-the-workflow) 11 | * [**Version info**](#version-info) 12 | 13 | --- 14 | 15 | ## Overview 16 | 17 | This workflow will summarize input genome assemblies, estimate quality, and assign taxonomy via the following programs: 18 | 19 | - [bit](https://github.com/AstrobioMike/bit#bioinformatics-tools-bit) for generating assembly summary stats 20 | - [checkm2](https://github.com/chklovski/CheckM2#checkm2) for estimating quality of bacteria/archaea 21 | - [GTDB-tk](https://github.com/Ecogenomics/GTDBTk#gtdb-tk) for assigning taxonomy of bacteria/archaea 22 | - [eukcc](https://github.com/Finn-Lab/EukCC#eukcc) for estimating quality of eukarya 23 | - [CAT](https://github.com/dutilh/CAT#cat-and-bat) with the NCBI nr database for assigning taxonomy of eukarya 24 | 25 | It ultimately produces an output table like this: 26 | 27 | ```bash 28 | Assembly Total contigs Total length Ambiguous characters GC content Maximum contig length Minimum contig length N50 L50 Est. Completeness (%) Est. Redundancy (%) Domain Phylum Class Order Family Genus Species 29 | input_genome_A 1 5,276,633 0 61.17 5,276,633 5,276,633 5,276,633 1 99.99 0.96 Bacteria Proteobacteria Gammaproteobacteria Pseudomonadales Pseudomonadaceae Pseudomonas_E Pseudomonas_E fulva 30 | input_MAG_B 4 2,702,105 0 33.15 2,601,030 30,881 2,601,030 1 90.22 2.78 Bacteria Firmicutes Bacilli Staphylococcales Staphylococcaceae Staphylococcus Staphylococcus saprophyticus 31 | ``` 32 | 33 | All required databases will be set up automatically by the workflow the first time they are used, if they don't already exist. 34 | 35 | --- 36 | 37 | ## Usage 38 | _bit_ should be installed via conda as described [here](https://github.com/AstrobioMike/bit?tab=readme-ov-file#conda-install). 39 | 40 | ### Retrieving the workflow 41 | 42 | ```bash 43 | bit-get-workflow genome-summarize 44 | ``` 45 | 46 | ### Modifying the config.yaml 47 | Before running the workflow, you first need to set some variables in the config.yaml file (there are notes in there). 48 | 49 | The workflow cannot currently run on a mix of input bacteria/archaea genomes and eukaryotic genomes; it can only run on bacteria/archaea by themselves, or eukarya by themselves (as set by a parameter in the config.yaml file). 50 | 51 | In the config.yaml file, you mostly just need to point to where the input fasta files are, specify what their extensions are, point to where the reference databases are stored (or where you want them to go if this is the first time running the workflow), and then you can run the snakemake workflow as exemplified below. 52 | 53 | ### Running the workflow 54 | After variables are set in the config.yaml, here's an example of how it could be run (note that it should still be run inside the _bit_ conda environment): 55 | 56 | ```bash 57 | snakemake --use-conda --conda-prefix ${CONDA_PREFIX}/envs -j 4 -p 58 | ``` 59 | 60 | - `--use-conda` – this specifies to use the conda environments included in the workflow 61 | - `--conda-prefix` – this allows us to point to where the needed conda environments should be stored. Including this means if we use the workflow on a different dataset somewhere else in the future, it will re-use the same conda environments rather than make new ones. 
The value listed here, `${CONDA_PREFIX}/envs`, is the default location for conda environments (the variable `${CONDA_PREFIX}` will be expanded to the appropriate location on whichever system it is run on). 62 | - `-j` – this lets us set how many jobs Snakemake should run concurrently (keep in mind that many of the thread and cpu parameters set in the config.yaml file will be multiplied by this) 63 | - `-p` – specifies to print out each command being run to the screen 64 | 65 | See `snakemake -h` for more options and details. 66 | 67 | --- 68 | 69 | ## Version info 70 | Note that the workflows are version independently of the _bit_ package. When you pull one with `bit-get-workflow`, the directory name will have the version, and it is also listed at the top of the Snakefile. 71 | 72 | All versions of programs used can be found in their corresponding conda yaml file in the envs/ directory. 73 | -------------------------------------------------------------------------------- /workflows/genome-summarize-wf/config.yaml: -------------------------------------------------------------------------------- 1 | ################################################################################################### 2 | ## Config file for the "bit" genome summarize workflow. ## 3 | ## bit: https://github.com/AstrobioMike/bit ## 4 | ## ## 5 | ## If you use this workflow in a publication, please consider citing :) ## 6 | ## Lee M. bit: a multipurpose collection of bioinformatics tools. F1000Research 2022, 11:122. ## 7 | ## https://doi.org/10.12688/f1000research.79530.1 ## 8 | ################################################################################################### 9 | 10 | 11 | # this is just a prefix for the final output table 12 | output_prefix: 13 | "Output" 14 | 15 | # set to True, with no quotes, if genomes are from eukarya, 16 | # set to False, with no quotes if genomes bacteria/archaea 17 | is_euk: 18 | False 19 | 20 | # path to where the genomes are located 21 | genomes_dir: 22 | "../genomes" 23 | 24 | # extension the fasta files have (must not be gzipped as currently written; include the period preceding, e.g., ".fasta", ".fna", ".fa") 25 | # gzip-compressed not accepted currently 26 | assembly_extension: 27 | ".fasta" 28 | 29 | ## reference database locations 30 | # these should be full paths to the directories that will hold the databases (more info below) 31 | CHECKM2_DATA_PATH: 32 | "/checkm2-ref-dir" 33 | 34 | GTDB_DATA_PATH: 35 | "/GTDB-tk-ref-dir" 36 | 37 | DIR_HOLDING_CAT_DIR: 38 | "/dir-holding-CAT-ref-dir" 39 | # actual directory name of CAT DB is below 40 | 41 | DIR_HOLDING_eukcc_DIR: 42 | "/dir-holding-eukcc-db-dir" 43 | # actual directory name of eukcc db is below 44 | 45 | ## number of threads or cpus (depending on how the program labeled them) to use per snakemake job (set with the -j parameter to the snakemake call) 46 | # passed to eukcc, CAT, checkm2, gtdb-tk 47 | threads: 48 | 20 49 | 50 | ## number of cpus used by pplacer by gtdb-tk 51 | # pplacer can have issues with memory with multiple cpus; see e.g. https://ecogenomics.github.io/GTDBTk/faq.html#gtdb-tk-reaches-the-memory-limit-pplacer-crashes 52 | gtdb_tk_pplacer_cpus: 53 | 4 54 | 55 | logs_dir: 56 | "logs/" 57 | 58 | checkm2_output_dir: 59 | "checkm2-output/" 60 | 61 | gtdbtk_output_dir: 62 | "gtdb-tk-output/" 63 | 64 | ## keep all files? 
65 | # set this to "YES" (all caps needed) if wanting to keep all produced files by all programs, anything else here 66 | # means all will be deleted upon completion except for the primary output summary table 67 | keep_all_files: 68 |     "no" 69 | 70 | ################################################################################################################ 71 | ##### Resource specifications that may need to be changed (mostly only necessary if using a job scheduler) ##### 72 | ####### Could leave these as-is to start, but they are here to be increased if a job fails due to memory ####### 73 | ################################################################################################################ 74 | 75 | ### these are all passed in the "resources" directive of their respective rules in the Snakefile, going to 76 | # the "mem_mb" argument (so should be provided in terms of megabytes) 77 | 78 | # passed to rule gtdbtk_classify 79 | gtdbtk_memory_resources: 80 |     100000 81 | 82 | # passed to rule run_checkm2 83 | checkm2_memory_resources: 84 |     50000 85 | 86 | # passed to the run_CAT rule 87 | CAT_memory_resources: 88 |     40000 89 | 90 | # passed to the run_eukcc rule 91 | eukcc_memory_resources: 92 |     50000 93 | 94 | ####################################################### 95 | ################# REFERENCE DATABASES ################# 96 | ####################################################### 97 | # The workflow will check the locations specified above for the corresponding reference databases, 98 | # and install them if they are not already there. It looks for the below "TRIGGER" filenames (they 99 | # all end with "*_DB_SETUP") in the directory of each database, which it creates when 100 | # it sets them up initially. 101 | # If we want to point to DBs that already exist on our setup, that were not prepared by this workflow, 102 | # we need to add these (empty) "TRIGGER" files to their respective directories. The 103 | # workflow just checks the file is there to know it doesn't need to set up the DB. This might be tricky 104 | # to figure out, and easiest would be to let the workflow do it so all DB versions match the program 105 | # versions for sure.
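# For illustration only (the paths below are just the placeholder directories set above, and the
# trigger filenames are the ones set further below): pre-existing checkm2 and GTDB-tk databases
# could be marked as already set up with something like:
#   touch /checkm2-ref-dir/CHECKM2_DB_SETUP
#   touch /GTDB-tk-ref-dir/GTDBTK_DB_SETUP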
106 | 107 | # there are some database filenames coded below that are noted as things that 108 | # we likely shouldn't change, so leave those unless you are sure you want to change them 109 | 110 | ### checkm2 ### 111 | # likely shouldn't change 112 | CHECKM2_DB_FILENAME: 113 |     "uniref100.KO.1.dmnd" 114 | 115 | # likely shouldn't change 116 | CHECKM2_TRIGGER_FILE: 117 |     "CHECKM2_DB_SETUP" 118 | 119 | ### gtdb-tk ### 120 | # likely shouldn't change 121 | GTDB_TRIGGER_FILE: 122 |     "GTDBTK_DB_SETUP" 123 | 124 | ### CAT ### 125 | # likely shouldn't change all of the below 126 | CAT_DIR: 127 |     "CAT_prepare_20210107" 128 | CAT_DB: 129 |     "2021-01-07_CAT_database" 130 | CAT_TAX: 131 |     "2021-01-07_taxonomy" 132 | CAT_DL_FILE: 133 |     "CAT_prepare_20210107.tar.gz" 134 | CAT_DL_LINK: 135 |     "https://tbb.bio.uu.nl/bastiaan/CAT_prepare/CAT_prepare_20210107.tar.gz" 136 | CAT_COMPRESSED_NR_FAA: 137 |     "2021-01-07.nr.gz" 138 | CAT_TRIGGER_FILE: 139 |     "CAT_DB_SETUP" 140 | 141 | ### eukcc ### 142 | # likely shouldn't change all below 143 | eukcc_db_dir: 144 |     "eukcc2_db_ver_1.1" 145 | eukcc_DL_FILE: 146 |     "eukcc2_db_ver_1.1.tar.gz" 147 | eukcc_DL_LINK: 148 |     "http://ftp.ebi.ac.uk/pub/databases/metagenomics/eukcc/eukcc2_db_ver_1.1.tar.gz" 149 | eukcc_TRIGGER_FILE: 150 |     "eukcc_DB_SETUP" 151 | 152 | ## example usage command ## 153 | # snakemake --use-conda --conda-prefix ${CONDA_PREFIX}/envs -j 4 -p 154 | 155 | # `--use-conda` – this specifies to use the conda environments included in the workflow 156 | # `--conda-prefix` – this allows us to point to where the needed conda environments should be stored. Including this means if we use the workflow on a different dataset somewhere else in the future, it will re-use the same conda environments rather than make new ones. The value listed here, `${CONDA_PREFIX}/envs`, is the default location for conda environments (the variable `${CONDA_PREFIX}` will be expanded to the appropriate location on whichever system it is run on). 157 | # `-j` – this lets us set how many jobs Snakemake should run concurrently (keep in mind that many of the thread and cpu parameters set in the config.yaml file will be multiplied by this) 158 | # `-p` – specifies to print out each command being run to the screen 159 | 160 | # See `snakemake -h` for more options and details.
161 | -------------------------------------------------------------------------------- /workflows/genome-summarize-wf/envs/bit.yaml: -------------------------------------------------------------------------------- 1 | name: bit 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | - astrobiomike 7 | dependencies: 8 | - bit=1.8.65 9 | -------------------------------------------------------------------------------- /workflows/genome-summarize-wf/envs/cat.yaml: -------------------------------------------------------------------------------- 1 | name: cat 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - cat=5.2.2 8 | -------------------------------------------------------------------------------- /workflows/genome-summarize-wf/envs/checkm2.yaml: -------------------------------------------------------------------------------- 1 | name: checkm2 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - checkm2=1.0.1 8 | -------------------------------------------------------------------------------- /workflows/genome-summarize-wf/envs/eukcc.yaml: -------------------------------------------------------------------------------- 1 | name: eukcc 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - eukcc=2.1.0 8 | -------------------------------------------------------------------------------- /workflows/genome-summarize-wf/envs/gtdb-tk.yaml: -------------------------------------------------------------------------------- 1 | name: gtdb-tk 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - gtdbtk=2.4.0 8 | - numpy=1.23.1 9 | -------------------------------------------------------------------------------- /workflows/genome-summarize-wf/scripts/combine-euk-outputs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import pandas as pd 4 | import re 5 | import argparse 6 | import sys 7 | 8 | ## contact: Michael D. 
Lee (Mike.Lee@nasa.gov) 9 | 10 | parser = argparse.ArgumentParser(description='This script combines the outputs in our GeneLab genome standard processing.') 11 | 12 | required = parser.add_argument_group('required arguments') 13 | 14 | required.add_argument("-s", "--input-summary-tsv", help="Input assembly summary stats file", action="store", required=True) 15 | required.add_argument("-c", "--input-eukcc-tsv", help="Input eukcc summaries", action="store", required=True) 16 | required.add_argument("-t", "--input-tax-tsv", help="Input CAT taxonomies", action="store", required=True) 17 | 18 | parser.add_argument("-o", "--output-tsv", help='Output table filename (default: "Genomes-summaries.tsv")', action="store", default="Genome-summaries.tsv") 19 | 20 | args = parser.parse_args() 21 | 22 | if len(sys.argv)==1: 23 | parser.print_help(sys.stderr) 24 | sys.exit(0) 25 | 26 | # reading in summary stats 27 | stats_df = pd.read_csv(args.input_summary_tsv, sep="\t", index_col=0) 28 | 29 | # slimming down to those we want 30 | wanted_summary_stats = ["Total contigs", "Total length", "Ambiguous characters", "GC content", "Maximum contig length", "Minimum contig length", "N50", "L50"] 31 | wanted_stats_df = stats_df.loc[wanted_summary_stats, ] 32 | 33 | # transposing 34 | trans_df = wanted_stats_df.T 35 | 36 | ## for the life of me i can't figure out how to do this the easy way, but formatting the numbers 37 | trans_df["Total contigs"] = trans_df["Total contigs"].map('{:,.0f}'.format) 38 | trans_df["Total length"] = trans_df["Total length"].map('{:,.0f}'.format) 39 | trans_df["Ambiguous characters"] = trans_df["Ambiguous characters"].map('{:,.0f}'.format) 40 | trans_df["Maximum contig length"] = trans_df["Maximum contig length"].map('{:,.0f}'.format) 41 | trans_df["Minimum contig length"] = trans_df["Minimum contig length"].map('{:,.0f}'.format) 42 | trans_df["N50"] = trans_df["N50"].map('{:,.0f}'.format) 43 | trans_df["L50"] = trans_df["L50"].map('{:,.0f}'.format) 44 | 45 | # reading in checkm results 46 | eukcc_df = pd.read_csv(args.input_eukcc_tsv, sep="\t", index_col=0) 47 | 48 | # slimming down to those we want 49 | wanted_eukcc_cols = ["Est. Comp.", "Est. Redund."] 50 | eukcc_df = eukcc_df.loc[:, wanted_eukcc_cols] 51 | 52 | # renaming columns 53 | eukcc_df.columns = ["Est. Completeness (%)", "Est. 
Redundancy (%)"] 54 | eukcc_df.index.names = ["Assembly"] 55 | 56 | # merging those two 57 | combined_df = trans_df.merge(eukcc_df, left_index=True, right_index=True) 58 | 59 | # creating a dictionary to hold lineage info from CAT 60 | tax_dict = {} 61 | 62 | ranks = ["Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"] 63 | 64 | # iterating through that input file 65 | with open(args.input_tax_tsv, "r") as tax: 66 | for line in tax: 67 | if line.strip().startswith("Assembly"): 68 | continue 69 | 70 | line = line.strip().split("\t") 71 | ID = line[0] 72 | tax_list = line[1:8] 73 | 74 | curr_dict = dict(zip(iter(ranks), iter(tax_list))) 75 | 76 | tax_dict[ID] = curr_dict 77 | 78 | # creating dataframe from our tax dictionary 79 | tax_df = pd.DataFrame.from_dict(tax_dict, orient="index") 80 | tax_df.index.names = ["Assembly"] 81 | 82 | # merging with summary stats table 83 | final_df = combined_df.merge(tax_df, left_index=True, right_index=True) 84 | final_df.index.names = ["Assembly"] 85 | 86 | # changing empties to "Not Assigned" 87 | final_df.replace({"": "Not Assigned"}, inplace=True) 88 | 89 | # writing out 90 | with open(args.output_tsv, "w") as out: 91 | out.write(final_df.to_csv(index=True, sep="\t")) 92 | -------------------------------------------------------------------------------- /workflows/genome-summarize-wf/scripts/combine-outputs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import pandas as pd 4 | import re 5 | import argparse 6 | import sys 7 | 8 | ## contact: Michael D. Lee (Mike.Lee@nasa.gov) 9 | 10 | parser = argparse.ArgumentParser(description = 'This script combines the taxonomic classification, quality estimates, and summary stats into one table.') 11 | 12 | required = parser.add_argument_group('required arguments') 13 | 14 | required.add_argument("-s", "--input-summary-tsv", help = "Input assembly summary stats file from bit", action = "store", required = True) 15 | required.add_argument("-c", "--input-checkm2-tsv", help = "Input summary from checkm2", action = "store", required = True) 16 | required.add_argument("-t", "--input-tax-tsv", help = "Input slimmed results from GTDB-tk", action = "store", required = True) 17 | 18 | parser.add_argument("-o", "--output-tsv", help = 'Output table filename (default: "Genomes-summaries.tsv")', action = "store", default = "Genome-summaries.tsv") 19 | 20 | args = parser.parse_args() 21 | 22 | if len(sys.argv)==1: 23 | parser.print_help(sys.stderr) 24 | sys.exit(0) 25 | 26 | # reading in summary stats 27 | stats_df = pd.read_csv(args.input_summary_tsv, sep = "\t", index_col = 0) 28 | 29 | # slimming down to those we want 30 | wanted_summary_stats = ["Total contigs", "Total length", "Ambiguous characters", "GC content", "Maximum contig length", "Minimum contig length", "N50", "L50"] 31 | wanted_stats_df = stats_df.loc[wanted_summary_stats, ] 32 | 33 | # transposing 34 | trans_df = wanted_stats_df.T 35 | 36 | ## for the life of me i can't figure out how to do this the easy way right now, but formatting the numbers here 37 | trans_df["Total contigs"] = trans_df["Total contigs"].map('{:,.0f}'.format) 38 | trans_df["Total length"] = trans_df["Total length"].map('{:,.0f}'.format) 39 | trans_df["Ambiguous characters"] = trans_df["Ambiguous characters"].map('{:,.0f}'.format) 40 | trans_df["Maximum contig length"] = trans_df["Maximum contig length"].map('{:,.0f}'.format) 41 | trans_df["Minimum contig length"] = trans_df["Minimum contig 
length"].map('{:,.0f}'.format) 42 | trans_df["N50"] = trans_df["N50"].map('{:,.0f}'.format) 43 | trans_df["L50"] = trans_df["L50"].map('{:,.0f}'.format) 44 | 45 | # reading in checkm2 results 46 | checkm2_df = pd.read_csv(args.input_checkm2_tsv, sep = "\t", index_col = 0) 47 | 48 | # slimming down to those we want 49 | wanted_checkm_cols = ["Completeness", "Contamination"] 50 | checkm2_df = checkm2_df.loc[:, wanted_checkm_cols] 51 | 52 | # renaming columns 53 | checkm2_df.columns = ["Est. Completeness (%)", "Est. Redundancy (%)"] 54 | checkm2_df.index.names = ["Assembly"] 55 | 56 | # merging those two 57 | combined_df = trans_df.merge(checkm2_df, left_index = True, right_index = True) 58 | 59 | # creating a dictionary to hold lineage info from gtdb-tk 60 | tax_dict = {} 61 | 62 | ranks = ["Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"] 63 | 64 | # iterating through that input file 65 | with open(args.input_tax_tsv, "r") as tax: 66 | for line in tax: 67 | if line.strip().startswith("user_genome"): 68 | continue 69 | 70 | line = line.strip().split("\t") 71 | ID = line[0] 72 | tax_str = line[1].replace(";", "") 73 | 74 | tax_list = re.split(".?__", tax_str)[1:8] 75 | 76 | # handling if nothing was at all classified 77 | if not tax_list: 78 | 79 | tax_list = ["Not Assigned"] * 7 80 | 81 | curr_dict = dict(zip(iter(ranks), iter(tax_list))) 82 | 83 | tax_dict[ID] = curr_dict 84 | 85 | # creating dataframe from our tax dictionary 86 | tax_df = pd.DataFrame.from_dict(tax_dict, orient = "index") 87 | tax_df.index.names = ["Assembly"] 88 | 89 | # merging with summary stats table 90 | final_df = combined_df.merge(tax_df, left_index = True, right_index = True) 91 | final_df.index.names = ["Assembly"] 92 | 93 | # changing empties to "Not Assigned" 94 | final_df.replace({"": "Not Assigned"}, inplace = True) 95 | 96 | # writing out 97 | with open(args.output_tsv, "w") as out: 98 | out.write(final_df.to_csv(index = True, sep = "\t")) 99 | -------------------------------------------------------------------------------- /workflows/genome-summarize-wf/scripts/slurm-status.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import subprocess 3 | import sys 4 | 5 | jobid = sys.argv[1] 6 | 7 | # if wanting to use, this should be added to the snakemake call from the root workflow dir: `--cluster-status scripts/slurm-status.py` 8 | 9 | output = str(subprocess.check_output("sacct -j %s --format State --noheader | head -1 | awk '{print $1}'" % jobid, shell=True).strip()) 10 | 11 | running_status=["PENDING", "CONFIGURING", "COMPLETING", "RUNNING", "SUSPENDED"] 12 | if "COMPLETED" in output: 13 | print("success") 14 | elif any(r in output for r in running_status): 15 | print("running") 16 | else: 17 | print("failed") 18 | -------------------------------------------------------------------------------- /workflows/metagenomics-wf/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Metagenomics workflow change log 2 | 3 | ## 1.0.2 4 | - pinned specific version of diamond (2.0.6) to the CAT environment 5 | 6 | ## 1.0.1 7 | - can optionally skip binning and MAG recovery and characterization with new option in config.yaml, "perform_binning_and_MAG_recovery" 8 | 9 | ## 1.0.0 10 | - initial workflow release 11 | -------------------------------------------------------------------------------- /workflows/metagenomics-wf/config/multiqc.config: 
-------------------------------------------------------------------------------- 1 | extra_fn_clean_exts: 2 | - "_raw" 3 | - "_HRremoved_raw" 4 | - "_filtered" 5 | 6 | show_analysis_paths: False 7 | show_analysis_time: False 8 | -------------------------------------------------------------------------------- /workflows/metagenomics-wf/envs/bit.yaml: -------------------------------------------------------------------------------- 1 | name: bit 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | - astrobiomike 7 | dependencies: 8 | - bit=1.8.65 9 | - numpy=1.26.4 10 | -------------------------------------------------------------------------------- /workflows/metagenomics-wf/envs/cat.yaml: -------------------------------------------------------------------------------- 1 | name: cat 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - cat=5.2.2 8 | - diamond=2.0.6 9 | -------------------------------------------------------------------------------- /workflows/metagenomics-wf/envs/checkm2.yaml: -------------------------------------------------------------------------------- 1 | name: checkm2 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - checkm2=1.0.1 8 | -------------------------------------------------------------------------------- /workflows/metagenomics-wf/envs/gtdb-tk.yaml: -------------------------------------------------------------------------------- 1 | name: gtdb-tk 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - gtdbtk=2.4.0 8 | - numpy=1.23.1 9 | -------------------------------------------------------------------------------- /workflows/metagenomics-wf/envs/keggdecoder.yaml: -------------------------------------------------------------------------------- 1 | name: keggdecoder 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.6 6 | - pip 7 | - pip: 8 | - KEGGDecoder==1.2.2 9 | -------------------------------------------------------------------------------- /workflows/metagenomics-wf/envs/kofamscan.yaml: -------------------------------------------------------------------------------- 1 | name: kofamscan 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | - astrobiomike 7 | dependencies: 8 | - kofamscan=1.3.0 9 | - hmmer=3.3.0 10 | - bit=1.8.65 11 | -------------------------------------------------------------------------------- /workflows/metagenomics-wf/envs/mapping.yaml: -------------------------------------------------------------------------------- 1 | name: mapping 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - bowtie2=2.3.5.1 8 | - tbb=2020.2 9 | - bbmap=38.86 10 | - samtools=1.9 11 | -------------------------------------------------------------------------------- /workflows/metagenomics-wf/envs/megahit.yaml: -------------------------------------------------------------------------------- 1 | name: megahit 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | - astrobiomike 7 | dependencies: 8 | - megahit=1.2.9 9 | - bit=1.8.65 10 | -------------------------------------------------------------------------------- /workflows/metagenomics-wf/envs/metabat.yaml: -------------------------------------------------------------------------------- 1 | name: metabat 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - metabat2=2.15 8 | -------------------------------------------------------------------------------- 
/workflows/metagenomics-wf/envs/prodigal.yaml: -------------------------------------------------------------------------------- 1 | name: prodigal 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | - astrobiomike 7 | dependencies: 8 | - prodigal=2.6.3 9 | - bit=1.8.65 10 | -------------------------------------------------------------------------------- /workflows/metagenomics-wf/envs/qc.yaml: -------------------------------------------------------------------------------- 1 | name: qc 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - fastqc=0.11.9 8 | - multiqc=1.11 9 | - bbmap=38.86 10 | - zip=3.0 11 | - python=3.8 12 | -------------------------------------------------------------------------------- /workflows/metagenomics-wf/scripts/combine-benchmarks.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | ls benchmarks/ > benchmark-filenames.tmp 5 | 6 | head -n 1 benchmarks/$( head -n 1 benchmark-filenames.tmp ) > benchmark-header.tmp 7 | 8 | paste <( printf "process" ) benchmark-header.tmp > building-tab.tmp 9 | 10 | for file in $(cat benchmark-filenames.tmp) 11 | do 12 | 13 | cat <( paste <( echo ${file} | sed 's/-benchmarks.tsv//' ) <( tail -n +2 benchmarks/${file} ) ) >> building-tab.tmp 14 | 15 | done 16 | 17 | mv building-tab.tmp benchmarks/ALL-benchmarks.tsv 18 | rm -rf benchmark-filenames.tmp benchmark-header.tmp 19 | -------------------------------------------------------------------------------- /workflows/metagenomics-wf/scripts/download-gtdbtk-refs.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | ### This is modified from the "download-db.sh" script that came with the conda install of gtdbtk v2.4.0. 5 | # The primary download site, link commented out below, had been consistently taking over a week to download for me and others. 6 | # So I added this to the workflow for now to pull from the mirror site (new DB_URL below). 7 | 8 | # Configuration 9 | N_FILES_IN_TAR=241860 10 | # DB_URL="https://data.gtdb.ecogenomic.org/releases/release220/220.0/auxillary_files/gtdbtk_package/full_package/gtdbtk_r220_data.tar.gz" 11 | DB_URL="https://data.ace.uq.edu.au/public/gtdb/data/releases/release220/220.0/auxillary_files/gtdbtk_package/full_package/gtdbtk_r220_data.tar.gz" 12 | TARGET_TAR_NAME="gtdbtk_r220_data.tar.gz" 13 | 14 | # Script variables (no need to configure) 15 | TARGET_DIR=${1:-$GTDBTK_DATA_PATH} 16 | TARGET_TAR="${TARGET_DIR}/${TARGET_TAR_NAME}" 17 | 18 | # Check if this is overriding an existing version 19 | mkdir -p "$TARGET_DIR" 20 | n_folders=$(find "$TARGET_DIR" -maxdepth 1 -type d | wc -l) 21 | if [ "$n_folders" -gt 1 ]; then 22 | echo "[ERROR] - The GTDB-Tk database directory must be empty, please empty it: $TARGET_DIR" 23 | exit 1 24 | fi 25 | 26 | # Start the download process 27 | # Note: When this URL is updated, ensure that the "--total" flag of TQDM below is also updated 28 | echo "[INFO] - Downloading the GTDB-Tk database to: ${TARGET_DIR}" 29 | wget $DB_URL -O "$TARGET_TAR" 30 | 31 | # Uncompress and pipe output to TQDM 32 | echo "[INFO] - Extracting archive..." 33 | tar xvzf "$TARGET_TAR" -C "${TARGET_DIR}" --strip 1 | tqdm --unit=file --total=$N_FILES_IN_TAR --smoothing=0.1 >/dev/null 34 | 35 | # Remove the file after successful extraction 36 | rm "$TARGET_TAR" 37 | echo "[INFO] - The GTDB-Tk database has been successfully downloaded and extracted." 
38 | 39 | # Set the environment variable 40 | if conda env config vars set GTDBTK_DATA_PATH="$TARGET_DIR"; then 41 | echo "[INFO] - Added GTDBTK_DATA_PATH ($TARGET_DIR) to the GTDB-Tk conda environment." 42 | else 43 | echo "[INFO] - Conda not found in PATH, please be sure to set the GTDBTK_DATA_PATH envrionment variable" 44 | echo "export GTDBTK_DATA_PATH=$TARGET_DIR before running GTDB-Tk. " 45 | fi 46 | 47 | exit 0 48 | -------------------------------------------------------------------------------- /workflows/metagenomics-wf/scripts/format-contig-tax-classifications.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | awk -F $'\t' ' BEGIN { OFS = FS } { if ( $2 == "classification" ) { print $1,$4,$6,$7,$8,$9,$10,$11,$12 } \ 4 | else if ( $2 == "no taxid assigned" ) { print $1,"NA","NA","NA","NA","NA","NA","NA","NA" } \ 5 | else { n=split($4,lineage,";"); print $1,lineage[n],$6,$7,$8,$9,$10,$11,$12 } } ' ${1} \ 6 | | sed 's/no support/NA/g' | sed 's/superkingdom/domain/' | sed 's/^# contig/contig_ID/' | sed 's/lineage/taxid/' > ${2} 7 | -------------------------------------------------------------------------------- /workflows/metagenomics-wf/scripts/format-gene-tax-classifications.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | awk -F $'\t' ' BEGIN { OFS = FS } { if ( $3 == "lineage" ) { print $1,$3,$5,$6,$7,$8,$9,$10,$11 } \ 4 | else if ( $2 == "ORF has no hit to database" || $2 ~ /^no taxid found/ ) { print $1,"NA","NA","NA","NA","NA","NA","NA","NA" } \ 5 | else { n=split($3,lineage,";"); print $1,lineage[n],$5,$6,$7,$8,$9,$10,$11 } } ' ${1} \ 6 | | sed 's/no support/NA/g' | sed 's/superkingdom/domain/' | sed 's/# ORF/gene_ID/' | sed 's/lineage/taxid/' > ${2} 7 | -------------------------------------------------------------------------------- /workflows/metagenomics-wf/scripts/generate-assembly-based-overview-table.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | sample_IDs_file=${1} 4 | assemblies_dir=${2} 5 | genes_dir=${3} 6 | mapping_dir=${4} 7 | bins_dir=${5} 8 | MAGs_dir=${6} 9 | output=${7} 10 | 11 | # starting output file 12 | printf "Sample_ID\tassembly_produced\tgene_calls_identified\tread_mapping_successful\tbins_recovered\tMAGs_recovered\n" > ${output} 13 | 14 | # looping through all input files and generating columns for final table 15 | for sample in $(cat ${sample_IDs_file}) 16 | do 17 | 18 | # checking assembly 19 | if [ ! -s ${assemblies_dir}/${sample}-assembly.fasta ]; then 20 | printf "No\n" >> assembly-status.tmp 21 | 22 | # removing empty output fasta 23 | rm -rf ${assemblies_dir}/${sample}-assembly.fasta 24 | 25 | else 26 | printf "Yes\n" >> assembly-status.tmp 27 | fi 28 | 29 | # checking gene calls 30 | if [ ! -s ${genes_dir}/${sample}-genes.faa ]; then 31 | printf "No\n" >> genes-status.tmp 32 | 33 | # removing empty output files 34 | rm -rf ${genes_dir}/${sample}-genes.faa ${genes_dir}/${sample}-genes.fasta ${genes_dir}/${sample}-genes.gff 35 | 36 | else 37 | printf "Yes\n" >> genes-status.tmp 38 | fi 39 | 40 | # checking read-mapping outputs 41 | if [ ! 
-s ${mapping_dir}/${sample}.bam ]; then 42 | printf "No\n" >> mapping-status.tmp 43 | 44 | # removing empty output files 45 | rm -rf ${mapping_dir}/${sample}.bam ${mapping_dir}/${sample}-metabat-assembly-depth.tsv 46 | 47 | else 48 | printf "Yes\n" >> mapping-status.tmp 49 | fi 50 | 51 | # getting number of bins recovered if any produced 52 | if compgen -G "${bins_dir}*.fasta" > /dev/null; then 53 | num_bins=$(ls ${bins_dir}*.fasta | grep -c "${sample}-bin.[0-9]*.fasta") 54 | printf "${num_bins}\n" >> bins-status.tmp 55 | else 56 | printf "0\n" >> bins-status.tmp 57 | fi 58 | 59 | # getting number of MAGs recovered 60 | if compgen -G "${MAGs_dir}*.fasta" >/dev/null; then 61 | num_MAGs=$(ls ${MAGs_dir}*.fasta | grep -c "${sample}-MAG-[0-9]*.fasta") 62 | printf "${num_MAGs}\n" >> MAGs-status.tmp 63 | else 64 | printf "0\n" >> MAGs-status.tmp 65 | fi 66 | 67 | done 68 | 69 | # combining, adding to output file and removing intermediates 70 | cat <( paste ${sample_IDs_file} assembly-status.tmp \ 71 | genes-status.tmp mapping-status.tmp \ 72 | bins-status.tmp MAGs-status.tmp ) >> ${output} 73 | 74 | rm assembly-status.tmp genes-status.tmp mapping-status.tmp bins-status.tmp MAGs-status.tmp 75 | -------------------------------------------------------------------------------- /workflows/metagenomics-wf/scripts/parse-MAG-annots.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser(description='This script does whatever it needs to do.') 6 | 7 | required = parser.add_argument_group('required arguments') 8 | 9 | required.add_argument("-i", "--input-tsv", help='no help for you, come back, 2 years!', action="store", required=True) 10 | required.add_argument("-w", "--wanted-things", help="what'd i tell you?", action="store", required=True) 11 | required.add_argument("-M", "--MAG-ID", action="store", required=True) 12 | 13 | parser.add_argument("-o", "--output_tsv", help='Default: "out.tsv"', action="store", dest="output_tsv", default="out.tsv") 14 | 15 | args = parser.parse_args() 16 | 17 | targets_set = set(line.strip() for line in open(args.wanted_things)) 18 | 19 | out_tab = open(args.output_tsv, "a") 20 | 21 | for line in open(args.input_tsv): 22 | line = line.strip().split("\t") 23 | if line[2] != "NA": 24 | 25 | # dropping last coding seq # field so matches contig ID 26 | if line[0].rsplit('_', 1)[0] in targets_set: 27 | 28 | out_tab.write(str(args.MAG_ID) + "\t" + line[2] + "\n") 29 | 30 | out_tab.close() 31 | -------------------------------------------------------------------------------- /workflows/metagenomics-wf/scripts/slurm-status.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import subprocess 3 | import sys 4 | 5 | jobid = sys.argv[1] 6 | 7 | # if wanting to use, this should be added to the snakemake call from the root workflow dir: `--cluster-status scripts/slurm-status.py` 8 | 9 | output = str(subprocess.check_output("sacct -j %s --format State --noheader | head -1 | awk '{print $1}'" % jobid, shell=True).strip()) 10 | 11 | running_status=["PENDING", "CONFIGURING", "COMPLETING", "RUNNING", "SUSPENDED"] 12 | if "COMPLETED" in output: 13 | print("success") 14 | elif any(r in output for r in running_status): 15 | print("running") 16 | else: 17 | print("failed") 18 | -------------------------------------------------------------------------------- /workflows/metagenomics-wf/scripts/swap-MAG-IDs.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import pandas as pd 5 | 6 | parser = argparse.ArgumentParser(description='This script swaps the MAG IDs back to what they were prior to running KEGGDecoder.') 7 | 8 | required = parser.add_argument_group('required arguments') 9 | 10 | required.add_argument("-i", "--input-tsv", help='Output table from KEGGDecoder', action="store", required=True) 11 | required.add_argument("-m", "--map-tsv", help='Tab-delimited map with 1st column holding original name, and 2nd column holding modified name', action="store", required=True) 12 | 13 | parser.add_argument("-o", "--output-tsv", help='Output table with adjusted MAG IDs (Default: "out.tsv")', action="store", default="out.tsv") 14 | 15 | args = parser.parse_args() 16 | 17 | # reading in mapping file into dictionary 18 | map_dict = {} 19 | with open(args.map_tsv) as mapping: 20 | for line in mapping: 21 | line = line.strip().split("\t") 22 | map_dict[line[1]] = line[0] 23 | 24 | # reading in output table from KEGGDecoder 25 | in_tab = pd.read_csv(args.input_tsv, sep = "\t", index_col = 0) 26 | 27 | # renaming back to what they were before modifying to be compliant with KEGGDecoder 28 | mod_tab = in_tab.rename(index = map_dict) 29 | 30 | # writing out modified file 31 | mod_tab.to_csv(args.output_tsv, sep = "\t") 32 | -------------------------------------------------------------------------------- /workflows/sra-download-wf/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # SRA-download workflow change log 2 | 3 | ## 1.1.0 4 | - workflow can now also handle sra objects that hold single-end data 5 | 6 | ## 1.0.1 7 | - updates to `scripts/combine-sra-accessions.sh` 8 | - more efficient now by not cat'ing if there is only one SRR for a sample 9 | - default is to remove original files now, and `-k` needs to be added in order to keep them 10 | 11 | ## 1.0.0 12 | - initial workflow release 13 | -------------------------------------------------------------------------------- /workflows/sra-download-wf/README.md: -------------------------------------------------------------------------------- 1 | # [bit](https://github.com/AstrobioMike/bit) sra-download workflow 2 | This is a [snakemake](https://snakemake.github.io/) workflow for downloading reads from [NCBI's SRA](https://www.ncbi.nlm.nih.gov/sra) in fastq format. For all workflows available with _bit_, see [here](https://github.com/AstrobioMike/bit?tab=readme-ov-file#workflows). 3 | 4 | --- 5 | 6 | * [**Overview**](#overview) 7 | * [**Usage**](#usage) 8 | * [Retrieving the workflow](#retrieving-the-workflow) 9 | * [Creating the input file and modifying the config.yaml](#creating-the-input-file-and-modifying-the-configyaml) 10 | * [Running the workflow](#running-the-workflow) 11 | * [Combining SRRs if needed](#combining-srrs-if-needed) 12 | * [**Version info**](#version-info) 13 | 14 | --- 15 | 16 | ## Overview 17 | 18 | This workflow will download reads from SRA based on input run accessions (i.e., the accessions starting with ERR..., SRR, or DRR) using prefetch and fasterq-dump. 19 | 20 | --- 21 | 22 | ## Usage 23 | _bit_ should be installed via conda as described [here](https://github.com/AstrobioMike/bit?tab=readme-ov-file#conda-install). 
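For reference, a conda environment with _bit_ can likely be created using the same channels listed in this workflow's envs/bit.yaml, along these lines (the linked install instructions above are the authoritative source, and the exact channels/versions recommended there may differ):

```bash
# creating and activating a conda environment holding bit (sketch only; see the linked instructions)
conda create -n bit -c conda-forge -c bioconda -c defaults -c astrobiomike bit
conda activate bit
```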
24 | 25 | ### Retrieving the workflow 26 | 27 | ```bash 28 | bit-get-workflow sra-download 29 | ``` 30 | 31 | ### Creating the input file and modifying the config.yaml 32 | Before running it, you first need to make a file holding the target run accessions, one per line in a single column. 33 | 34 | The path to that file needs to be set for the "target_sra_accessions_file" variable in the config.yaml. 35 | 36 | ### Running the workflow 37 | After the target run accessions file has been created and set in the config.yaml, here's an example of how it could be run (note that it should still be run inside the _bit_ conda environment): 38 | 39 | ```bash 40 | snakemake --use-conda --conda-prefix ${CONDA_PREFIX}/envs -j 4 -p 41 | ``` 42 | 43 | - `--use-conda` – this specifies to use the conda environments included in the workflow 44 | - `--conda-prefix` – this allows us to point to where the needed conda environments should be stored. Including this means if we use the workflow on a different dataset somewhere else in the future, it will re-use the same conda environments rather than make new ones. The value listed here, `${CONDA_PREFIX}/envs`, is the default location for conda environments (the variable `${CONDA_PREFIX}` will be expanded to the appropriate location on whichever system it is run on). 45 | - `-j` – this lets us set how many jobs Snakemake should run concurrently (keep in mind that many of the thread and cpu parameters set in the config.yaml file will be multiplied by this) 46 | - `-p` – specifies to print out each command being run to the screen 47 | 48 | See `snakemake -h` for more options and details. 49 | 50 | ### Combining SRRs if needed 51 | 52 | Sometimes multiple "runs" belong to the same sample, but we still need to download the runs independently from SRA. A helper script is included with the workflow to facilitate combining those multiple read files into one forward and one reverse for a given sample. We first need to prepare a tab-delimited mapping file with 2 columns that lists: 53 | 1. The ultimate sample name we want to have 54 | 2. The SRR accessions that belong with each sample name 55 | 56 | Here is an example: 57 | 58 | ```bash 59 | cat map.tsv 60 | ``` 61 | 62 | ```bash 63 | Sample-1 SRR123456 64 | Sample-1 SRR123457 65 | Sample-2 SRR123458 66 | Sample-3 SRR123459 67 | Sample-3 SRR123460 68 | ``` 69 | 70 | For example, SRR123456 and SRR123457 read files would be combined (via `cat`) into one forward and one reverse read file called "Sample-1_R1.fastq.gz" and "Sample-1_R2.fastq.gz". Since Sample-2 only has one input, those files would just be renamed. 71 | 72 | The helper script takes two arguments: `-i`, the tsv mapping file; and `-d`, the path to the directory holding all the starting fastq files. 73 | 74 | Example usage: 75 | ```bash 76 | bash scripts/combine-sra-accessions.sh -i map.tsv -d fastq-files/ 77 | ``` 78 | 79 | Note that by default the original files will be removed after they are combined or renamed. If you want to keep them, provide the `-k` flag also. See `bash scripts/combine-sra-accessions.sh -h` for more info. This helper script is only suitable for paired-end data. 80 | 81 | --- 82 | 83 | ## Version info 84 | Note that the workflows are versioned independently of the _bit_ package. When you pull one with `bit-get-workflow`, the directory name will have the version, and it is also listed at the top of the Snakefile.
85 | 86 | All versions of programs used can be found in their corresponding conda yaml file in the envs/ directory. 87 | -------------------------------------------------------------------------------- /workflows/sra-download-wf/Snakefile: -------------------------------------------------------------------------------- 1 | ################################################################################################### 2 | ## Snakefile for the "bit" SRA download workflow ## 3 | ## Version 1.1.0 ## 4 | ## bit: https://github.com/AstrobioMike/bit ## 5 | ## ## 6 | ## If you use this workflow in a publication, please consider citing :) ## 7 | ## Lee M. bit: a multipurpose collection of bioinformatics tools. F1000Research 2022, 11:122. ## 8 | ## https://doi.org/10.12688/f1000research.79530.1 ## 9 | ################################################################################################### 10 | 11 | import os 12 | import pandas as pd 13 | 14 | configfile: "config.yaml" 15 | 16 | 17 | ######################################## 18 | ############# General Info ############# 19 | ######################################## 20 | 21 | """ 22 | See the corresponding 'config.yaml' file for general use information. 23 | Variables that may need to be adjusted should usually be changed there, not here. 24 | """ 25 | 26 | 27 | ######################################## 28 | ######## Some colors and helpers ####### 29 | ######################################## 30 | 31 | tty_colors = { 32 | 'green' : '\033[0;32m%s\033[0m', 33 | 'yellow' : '\033[0;33m%s\033[0m', 34 | 'red' : '\033[0;31m%s\033[0m' 35 | } 36 | 37 | def color_text(text, color='green'): 38 | if sys.stdout.isatty(): 39 | return(tty_colors[color] % text) 40 | else: 41 | return(text) 42 | 43 | 44 | ################################################ 45 | #### Reading target SRA accesions into list #### 46 | ################################################ 47 | target_sra_accessions_list = [line.strip() for line in open(config["target_sra_accessions_file"])] 48 | 49 | ## when i want to try integrating combinging runs that belong to the same sample given an input table, revisit what i did here: 50 | # https://github.com/AstrobioMike/NASA-Exo-N-project/blob/main/metagenomics/workflow/Snakefile 51 | 52 | ################################################ 53 | ############## Pre-flight checks ############### 54 | ################################################ 55 | 56 | # making sure there are all unique names 57 | if len(set(target_sra_accessions_list)) != len(target_sra_accessions_list): 58 | 59 | print(color_text(f"\n Not all sample IDs in the '{config['target_sra_accessions_file']}' file are unique :(\n", "yellow")) 60 | print(" Exiting for now.\n") 61 | exit(1) 62 | 63 | # making sure they all start with an expected prefix 64 | expected_prefixes = ["SRR", "ERR", "DRR"] 65 | for acc in target_sra_accessions_list: 66 | if not any([acc.startswith(prefix) for prefix in expected_prefixes]): 67 | 68 | print(color_text(f"\n At least one of the sample IDs in the '{config['target_sra_accessions_file']}' file (e.g., '{acc}') does not start with an expected prefix :(\n", "yellow")) 69 | print(f" Acceptable SRA prefixes are: {', '.join(expected_prefixes)}\n") 70 | print(" Exiting for now.\n") 71 | exit(1) 72 | 73 | 74 | ######################################## 75 | ######## Setting up directories ######## 76 | ######################################## 77 | 78 | triggers_dir = "logs/triggers" 79 | dirs_to_create = ["fastq-files", "logs", 80 | "benchmarks", 
triggers_dir] 81 | 82 | if config["keep_sra_files"] == "TRUE": 83 |     dirs_to_create.append("sra-files") 84 | 85 | for dir in dirs_to_create: 86 |     try: 87 |         os.mkdir(dir) 88 |     except: 89 |         pass 90 | 91 | 92 | ######################################## 93 | ############# Rules start ############## 94 | ######################################## 95 | 96 | 97 | rule all: 98 |     input: 99 |         expand(f"{triggers_dir}/{{acc}}/all.done", acc = target_sra_accessions_list) 100 |     shell: 101 |         """ 102 |         bash scripts/combine-benchmarks.sh 103 |         """ 104 | 105 | 106 | rule prefetch: 107 |     """ 108 |     This rule runs prefetch on all target SRA accessions. 109 |     """ 110 |     conda: 111 |         "envs/sra-dl.yaml" 112 |     params: 113 |         max_size = config["prefetch_max_size"] 114 |     output: 115 |         "{acc}-tmp/{acc}/{acc}.sra" 116 |     benchmark: 117 |         "benchmarks/{acc}-prefetch-benchmarks.tsv" 118 |     log: 119 |         "logs/prefetch-{acc}.log" 120 |     shell: 121 |         """ 122 |         prefetch --max-size {params.max_size} --progress -O {wildcards.acc}-tmp {wildcards.acc} > {log} 2>&1 123 |         """ 124 | 125 | 126 | rule fasterq_dump: 127 |     """ 128 |     This rule runs fasterq-dump on all target SRA accessions. 129 |     """ 130 |     conda: 131 |         "envs/sra-dl.yaml" 132 |     input: 133 |         "{acc}-tmp/{acc}/{acc}.sra" 134 |     output: 135 |         touch(f"{triggers_dir}/{{acc}}/fq-dump.done") 136 |     params: 137 |         num_threads = config["num_threads"] 138 |     benchmark: 139 |         "benchmarks/{acc}-fasterq-dump-benchmarks.tsv" 140 |     log: 141 |         "logs/fasterq-dump-{acc}.log" 142 |     shell: 143 |         """ 144 |         fasterq-dump --progress -O {wildcards.acc}-tmp/ --seq-defline '@$ac.$si/$ri $sn' --qual-defline '+' --threads {params.num_threads} {input} > {log} 2>&1 145 | 146 |         # renaming the files to have R1/R2 in their names if they are paired end 147 |         if [ -f {wildcards.acc}-tmp/{wildcards.acc}_1.fastq ]; then 148 |             mv {wildcards.acc}-tmp/{wildcards.acc}_1.fastq {wildcards.acc}-tmp/{wildcards.acc}_R1.fastq 149 |             mv {wildcards.acc}-tmp/{wildcards.acc}_2.fastq {wildcards.acc}-tmp/{wildcards.acc}_R2.fastq 150 |         fi 151 |         """ 152 | 153 | 154 | rule gzip_fastq_files: 155 |     """ 156 |     This rule gzips the fastq files. 157 |     """ 158 |     conda: 159 |         "envs/sra-dl.yaml" 160 |     input: 161 |         f"{triggers_dir}/{{acc}}/fq-dump.done" 162 |     output: 163 |         touch(f"{triggers_dir}/{{acc}}/all.done") 164 |     params: 165 |         num_threads = config["num_threads"], 166 |         initial_sra_dir = "{acc}-tmp/", 167 |         keep_sra_files = config["keep_sra_files"] 168 |     shell: 169 |         """ 170 |         pigz -p {params.num_threads} {wildcards.acc}-tmp/{wildcards.acc}*.fastq 171 | 172 |         # moving files to the final directory 173 |         mv {wildcards.acc}-tmp/{wildcards.acc}*.fastq.gz fastq-files/ 174 | 175 |         # moving sra files out if keeping them (as specified in config.yaml), then removing the initial temp directory 176 |         if [ "{params.keep_sra_files}" == "TRUE" ]; then 177 |             mv $(find {wildcards.acc}-tmp -name "*.sra") sra-files/ 178 |         fi 179 | 180 |         rm -rf {params.initial_sra_dir} 181 |         """ 182 | -------------------------------------------------------------------------------- /workflows/sra-download-wf/config.yaml: -------------------------------------------------------------------------------- 1 | ################################################################################################### 2 | ## Config file for the "bit" SRA download workflow                                                ## 3 | ## bit: https://github.com/AstrobioMike/bit                                                       ## 4 | ##                                                                                                ## 5 | ## If you use this workflow in a publication, please consider citing :)                           ## 6 | ## Lee M. bit: a multipurpose collection of bioinformatics tools. F1000Research 2022, 11:122.
## 7 | ## https://doi.org/10.12688/f1000research.79530.1 ## 8 | ################################################################################################### 9 | 10 | ############################################################ 11 | ##################### VARIABLES TO SET ##################### 12 | ############################################################ 13 | 14 | ## single-column file with target sra accessions (these should start with SRR, ERR, or DRR) 15 | target_sra_accessions_file: 16 | "target-sra-accs.txt" 17 | 18 | 19 | ###################################################################### 20 | ###### These only need to be altered if we want to change them ####### 21 | ###################################################################### 22 | 23 | ## for more info on prefetch and fasterq-dump options, see: https://github.com/ncbi/sra-tools/wiki/08.-prefetch-and-fasterq-dump 24 | 25 | ## number of threads to use PER snakemake job (which is set with the -j parameter passed to snakemake call) 26 | # passed to fasterq-dump and pigz (many may be running concurrently) 27 | num_threads: 28 | 8 29 | 30 | ## prefetch --max-size argument 31 | prefetch_max_size: 32 | "500G" 33 | 34 | ## keep sra objects after download (TRUE for yes, anything else is treated as no) 35 | keep_sra_files: 36 | "FALSE" 37 | 38 | ## example usage command ## 39 | # snakemake --use-conda --conda-prefix ${CONDA_PREFIX}/envs -j 2 -p 40 | 41 | # `--use-conda` – this specifies to use the conda environments included in the workflow 42 | # `--conda-prefix` – this allows us to point to where the needed conda environments should be stored. Including this means if we use the workflow on a different dataset somewhere else in the future, it will re-use the same conda environments rather than make new ones. The value listed here, `${CONDA_PREFIX}/envs`, is the default location for conda environments (the variable `${CONDA_PREFIX}` will be expanded to the appropriate location on whichever system it is run on). 43 | # `-j` – this lets us set how many jobs Snakemake should run concurrently (keep in mind that many of the thread and cpu parameters set in the config.yaml file will be multiplied by this) 44 | # `-p` – specifies to print out each command being run to the screen 45 | 46 | # See `snakemake -h` for more options and details. 
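# As a quick illustration, the file pointed to by "target_sra_accessions_file" above is just a
# plain-text, single-column list of run accessions, one per line (the accessions below are made up),
# e.g.:
#   SRR123456
#   SRR123457
#   DRR123458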
47 | -------------------------------------------------------------------------------- /workflows/sra-download-wf/envs/sra-dl.yaml: -------------------------------------------------------------------------------- 1 | name: sra-dl 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - sra-tools=3.1.0 8 | - pigz=2.8 9 | -------------------------------------------------------------------------------- /workflows/sra-download-wf/scripts/combine-benchmarks.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | ls benchmarks/ > benchmark-filenames.tmp 5 | 6 | head -n 1 benchmarks/$( head -n 1 benchmark-filenames.tmp ) > benchmark-header.tmp 7 | 8 | paste <( printf "process" ) benchmark-header.tmp > building-tab.tmp 9 | 10 | for file in $(cat benchmark-filenames.tmp) 11 | do 12 | 13 | cat <( paste <( echo ${file} | sed 's/-benchmarks.tsv//' ) <( tail -n +2 benchmarks/${file} ) ) >> building-tab.tmp 14 | 15 | done 16 | 17 | mv building-tab.tmp benchmarks/ALL-benchmarks.tsv 18 | rm -rf benchmark-filenames.tmp benchmark-header.tmp 19 | -------------------------------------------------------------------------------- /workflows/sra-download-wf/scripts/combine-sra-accessions.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | RED='\033[0;31m' 4 | YELLOW='\033[0;33m' 5 | GREEN='\033[0;32m' 6 | NC='\033[0m' 7 | 8 | # a function for providing help 9 | print_help() { 10 | 11 | 12 | printf "\n ${YELLOW}HELP MENU${NC}" 13 | printf "\n ${YELLOW}********************************************************************${NC}\n" 14 | 15 | printf "\n Sometimes multiple SRA accessions comprise 1 sample. This script\n" 16 | printf " is a helper to combine multiple SRA fastq files together.\n\n" 17 | printf " It expects as input a tsv, with no header, where the first column\n" 18 | printf " is the sample name, and the second column is the SRA accession, e.g.:\n\n" 19 | 20 | printf " Sample-1\tSRR123456\n" 21 | printf " Sample-1\tSRR123457\n" 22 | printf " Sample-2\tSRR123458\n" 23 | printf " Sample-3\tSRR123459\n" 24 | printf " Sample-3\tSRR123460\n\n" 25 | 26 | printf " It takes two positional arguments, the first being the tsv mapping file,\n" 27 | printf " and the second being the path to the directory holding all the fastq files.\n\n" 28 | 29 | printf " Ex. Usage:\n\t bash scripts/combine-sra-accessions.sh -i map.tsv -d fastq-files/ \n\n" 30 | 31 | printf " Note that this is a simple bash script, it is for paired-end sample-sets only,\n" 32 | printf " and there is not much checked to catch human error on the input table.\n\n" 33 | 34 | printf " By default it will remove the initial fastq files. Provide the '-k' flag if you want to\n" 35 | printf " keep them.\n" 36 | 37 | printf "\n ${YELLOW}********************************************************************${NC}\n\n" 38 | 39 | exit 40 | 41 | } 42 | 43 | if [ "$#" == 0 ] || [ $1 == "-h" ]; then 44 | print_help 45 | fi 46 | 47 | 48 | ######################################## 49 | ### Setting up and parsing arguments ### 50 | ######################################## 51 | remove_original_fastqs="true" 52 | 53 | while getopts ":i:d:k" args; do 54 | case "${args}" 55 | in 56 | i) map_file=$OPTARG;; 57 | d) fastq_dir=$OPTARG;; 58 | k) remove_original_fastqs="false";; 59 | \?) 
printf "\n ${RED}Invalid argument: -${OPTARG}${NC}\n" 1>&2 60 | print_help 61 | ;; 62 | :) 63 | echo "Invalid option: $OPTARG requires an argument" 1>&2 64 | print_help 65 | ;; 66 | esac 67 | done 68 | 69 | ################################################## 70 | ## Making sure required arguments were provided ## 71 | ################################################## 72 | if [ ! -n "${map_file}" ] || [ ! -n "${fastq_dir}" ]; then 73 | 74 | printf "\n ${RED}ERROR${NC}: The required arguments were not provided. See help below.\n" 75 | print_help 76 | 77 | fi 78 | 79 | 80 | ######################################## 81 | ########### Pre-flight checks ########## 82 | ######################################## 83 | 84 | 85 | # check that the first positional argument is a file 86 | if [ ! -f ${map_file} ]; then 87 | 88 | printf "\n ${RED}ERROR${NC}: The file '${map_file}' does not exist.\n" 89 | print_help 90 | 91 | fi 92 | 93 | # check that the second positional argument is a directory 94 | if [ ! -d ${fastq_dir} ]; then 95 | 96 | printf "\n ${RED}ERROR${NC}: The directory '${fastq_dir}' does not exist.\n" 97 | print_help 98 | 99 | fi 100 | 101 | # checking input table has 2 columns 102 | if [ $(head -n 1 ${map_file} | awk '{print NF}') -ne 2 ]; then 103 | 104 | printf "\n ${RED}ERROR${NC}: The input table must have 2 columns. See help below.\n" 105 | print_help 106 | 107 | fi 108 | 109 | 110 | ######################################## 111 | ########## Getting to work ############# 112 | ######################################## 113 | 114 | starting_dir=$(pwd) 115 | path_to_map=$(realpath ${map_file}) 116 | 117 | # moving into directory to make it easier to run cat 118 | cd ${fastq_dir} 119 | 120 | printf "\n" 121 | 122 | for sample in $(cut -f 1 ${path_to_map} | sort -u) 123 | do 124 | 125 | printf " Currently working on: ${sample} ...\r" 126 | 127 | target_R1s=$(grep ${sample} ${path_to_map} | cut -f 2 | sed 's/$/_R1.fastq.gz/' | tr '\n' ' ' | sed 's/ $//') 128 | target_R2s=$(grep ${sample} ${path_to_map} | cut -f 2 | sed 's/$/_R2.fastq.gz/' | tr '\n' ' ' | sed 's/ $//') 129 | 130 | 131 | if [ ${remove_original_fastqs} == "true" ]; then 132 | 133 | # checking if there are multiple, if so we cat them; if just one, we just mv/rename it 134 | if printf "${target_R1s}" | grep -q " "; then 135 | 136 | cat ${target_R1s} > ${sample}_R1.fastq.gz 137 | cat ${target_R2s} > ${sample}_R2.fastq.gz 138 | 139 | rm ${target_R1s} 140 | rm ${target_R2s} 141 | 142 | else 143 | 144 | mv ${target_R1s} ${sample}_R1.fastq.gz 145 | mv ${target_R2s} ${sample}_R2.fastq.gz 146 | 147 | fi 148 | 149 | else 150 | 151 | cat ${target_R1s} > ${sample}_R1.fastq.gz 152 | cat ${target_R2s} > ${sample}_R2.fastq.gz 153 | 154 | fi 155 | 156 | done 157 | 158 | # moving back to initial dir 159 | cd ${starting_dir} 160 | 161 | printf "\n\n ${GREEN}DONE!${NC}\n\n" 162 | printf " ${YELLOW}The combined fastq files are in the directory: ${fastq_dir}${NC}\n\n" 163 | 164 | if [ ${remove_original_fastqs} == "true" ]; then 165 | 166 | printf " ${YELLOW}The original fastq files were removed because the '-k' flag was not provided.${NC}\n\n" 167 | 168 | else 169 | 170 | printf " ${YELLOW}Note that the original fastq files were left because the '-k' flag was provided.${NC}\n\n" 171 | 172 | fi 173 | --------------------------------------------------------------------------------