├── .gitignore
├── LICENSE
├── README.md
├── bit
    ├── bit-GL-combine-KO-and-tax-tables
    ├── bit-GL-combine-contig-tax-tables
    ├── bit-calc
    ├── bit-calc-gc-per-sequence
    ├── bit-calc-gc-sliding-window
    ├── bit-calc-variation-in-msa
    ├── bit-check-for-fastq-dup-headers
    ├── bit-colnames
    ├── bit-combine-bracken-and-add-lineage
    ├── bit-combine-go-summaries
    ├── bit-combine-kraken2-taxon-summaries
    ├── bit-count-bases
    ├── bit-count-bases-per-seq
    ├── bit-cov-analyzer
    ├── bit-data-locations
    ├── bit-dedupe-fasta-headers
    ├── bit-dl-ncbi-assemblies
    ├── bit-extract-seqs-by-coords
    ├── bit-ez-screen
    ├── bit-fasta-to-bed
    ├── bit-fasta-to-genbank
    ├── bit-figshare-upload
    ├── bit-filter-KOFamScan-results
    ├── bit-filter-seqs-by-length
    ├── bit-filter-table
    ├── bit-gen-iToL-binary-dataset
    ├── bit-gen-iToL-colorstrip
    ├── bit-gen-iToL-map
    ├── bit-gen-iToL-text-dataset
    ├── bit-gen-kraken2-tax-plots
    ├── bit-gen-reads
    ├── bit-genbank-locus-clean-slate
    ├── bit-genbank-to-AA-seqs
    ├── bit-genbank-to-cds-table
    ├── bit-genbank-to-fasta
    ├── bit-get-accessions-from-GTDB
    ├── bit-get-cov-stats
    ├── bit-get-go-term-info
    ├── bit-get-lineage-from-taxids
    ├── bit-get-test-data
    ├── bit-get-workflow
    ├── bit-gff-to-anvio
    ├── bit-kraken2-to-taxon-summaries
    ├── bit-lineage-to-tsv
    ├── bit-mutate-seqs
    ├── bit-normalize-table
    ├── bit-parse-fasta-by-headers
    ├── bit-parse-fastq-by-headers
    ├── bit-prot-acc-to-taxid
    ├── bit-remove-wraps
    ├── bit-rename-fasta-headers
    ├── bit-reorder-fasta
    ├── bit-slim-down-go-terms
    ├── bit-split-multifasta
    ├── bit-summarize-assembly
    ├── bit-summarize-column
    ├── bit-summarize-go-annotations
    ├── bit-update-go-dbs
    ├── bit-update-ncbi-taxonomy
    ├── bit-version
    ├── helper-bit-check-or-setup-GTDB-files.py
    ├── helper-bit-combine-bracken.py
    ├── helper-bit-dl-ncbi-assemblies-parallel.sh
    ├── helper-bit-get-ncbi-assembly-tables
    ├── helper-bit-get-ncbi-tax-data
    ├── helper-bit-parse-assembly-summary-file.py
    ├── helper-bit-setup-GO-dbs
    └── helper-bit-update-tax-table-for-seqscreen-go-tax-summary.sh
├── images
    ├── bit-cov-analyzer.pdf
    ├── bit-cov-analyzer.png
    ├── bit-metagenomics-overview.afdesign
    ├── bit-metagenomics-overview.pdf
    └── bit-metagenomics-overview.png
├── test-data
    ├── ez-screen-assembly.fasta
    ├── ez-screen-targets.fasta
    ├── kraken-example-out.tsv
    └── kraken-example.report
└── workflows
    ├── genome-summarize-wf
        ├── README.md
        ├── Snakefile
        ├── config.yaml
        ├── envs
        │   ├── bit.yaml
        │   ├── cat.yaml
        │   ├── checkm2.yaml
        │   ├── eukcc.yaml
        │   └── gtdb-tk.yaml
        └── scripts
        │   ├── combine-euk-outputs.py
        │   ├── combine-outputs.py
        │   └── slurm-status.py
    ├── metagenomics-wf
        ├── CHANGELOG.md
        ├── README.md
        ├── Snakefile
        ├── config.yaml
        ├── config
        │   └── multiqc.config
        ├── envs
        │   ├── bit.yaml
        │   ├── cat.yaml
        │   ├── checkm2.yaml
        │   ├── gtdb-tk.yaml
        │   ├── keggdecoder.yaml
        │   ├── kofamscan.yaml
        │   ├── mapping.yaml
        │   ├── megahit.yaml
        │   ├── metabat.yaml
        │   ├── prodigal.yaml
        │   └── qc.yaml
        └── scripts
        │   ├── combine-benchmarks.sh
        │   ├── download-gtdbtk-refs.sh
        │   ├── format-contig-tax-classifications.sh
        │   ├── format-gene-tax-classifications.sh
        │   ├── generate-assembly-based-overview-table.sh
        │   ├── parse-MAG-annots.py
        │   ├── slurm-status.py
        │   └── swap-MAG-IDs.py
    └── sra-download-wf
        ├── CHANGELOG.md
        ├── README.md
        ├── Snakefile
        ├── config.yaml
        ├── envs
            └── sra-dl.yaml
        └── scripts
            ├── combine-benchmarks.sh
            └── combine-sra-accessions.sh


/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | 


--------------------------------------------------------------------------------
/bit/bit-calc:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | if [ "$#" == 0 ] || [ $1 == "-h" ]; then
 4 |     printf "\n  Uses \`awk\` for quick calculations at the command line. For version\n"
 5 |     printf "  info run \`bit-version\`.\n\n"
 6 |     printf "    Usage:\n\t bit-calc \"(5+5)/2\"\n\n"
 7 |     exit
 8 | fi
 9 | 
10 | awk "BEGIN { print $1 }"
11 | 


--------------------------------------------------------------------------------
/bit/bit-calc-gc-per-sequence:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | from Bio import SeqIO
 4 | import sys
 5 | import argparse
 6 | 
 7 | parser = argparse.ArgumentParser(description = "This script takes a nucleotide multifasta and returns \
 8 |                                              a tab-delimited file with 3 columns: header, sequence length, \
 9 |                                              and GC. For version info, run `bit-version`.")
10 | 
11 | required = parser.add_argument_group('required arguments')
12 | 
13 | required.add_argument("-i", "--input-fasta", metavar = "<FILE>", help = "fasta file", action = "store", required = True)
14 | parser.add_argument("-o", "--output-file", metavar = "<FILE>", help = 'Name of output tsv file (default: "GC-out.tsv")',
15 |                     action = "store", default = "GC-out.tsv")
16 | 
17 | if len(sys.argv)==1:
18 |     parser.print_help(sys.stderr)
19 |     sys.exit(0)
20 | 
21 | args = parser.parse_args()
22 | 
23 | with open(args.input_fasta, "r") as in_fasta:
24 | 
25 |     with open(args.output_file, "w") as out_file:
26 | 
27 |         out_file.write("header" + "\t" + "length" + "\t" + "gc" + "\n")
28 | 
29 |         for cur_record in SeqIO.parse(in_fasta, "fasta"):
30 |             gene_name = cur_record.name
31 |             A_count = cur_record.seq.count('A')
32 |             C_count = cur_record.seq.count('C')
33 |             G_count = cur_record.seq.count('G')
34 |             T_count = cur_record.seq.count('T')
35 |             length = len(cur_record.seq)
36 |             gc_percentage = float(G_count + C_count) / length
37 |             gc_percentage = round(gc_percentage,2)
38 |             out_file.write(str(gene_name)+"\t"+str(length)+"\t"+str(gc_percentage)+"\n")
39 | 


--------------------------------------------------------------------------------
/bit/bit-calc-gc-sliding-window:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | from Bio import SeqIO
 4 | import sys
 5 | import argparse
 6 | 
 7 | parser = argparse.ArgumentParser(description = "This script is for nucleotide multifastas and will return \
 8 |                                               a tab-delimited file with 4 columns: header, sequence length, \
 9 |                                               gc of whole sequence, and gc of each window of the specified \
10 |                                               window size (-w) for each step of the specified step size (-s). \
11 |                                               For version info, run `bit-version`.")
12 | 
13 | required = parser.add_argument_group('required arguments')
14 | 
15 | required.add_argument("-i", "--input-fasta", metavar = "<FILE>", help = "fasta file", action = "store", required = True)
16 | required.add_argument("-o", "--output-file", metavar = "<FILE>", help = "Name of output tsv file", action = "store", required = True)
17 | parser.add_argument("-w", "--window-size", metavar = "<INT>", help = "Desired size of sliding window (default: 100)", action = "store", dest = "window", default = 100)
18 | parser.add_argument("-s", "--step-size", metavar = "<INT>", help = "Desired size of steps between each window (default: 1)", action = "store", dest = "step", default = 1)
19 | 
20 | if len(sys.argv)==1:
21 |     parser.print_help(sys.stderr)
22 |     sys.exit(0)
23 | 
24 | args = parser.parse_args()
25 | 
26 | window = int(args.window)
27 | step = int(args.step)
28 | 
29 | half_window = int(window / 2)
30 | 
31 | with open(args.input_fasta, "r") as in_fasta:
32 | 
33 |     with open(args.output_file, "w") as out_file:
34 | 
35 |         out_file.write(f"header\tlength\tgc\tgc_per_window_of_size_{window}_with_step_of_size_{step}\n")
36 | 
37 |         for cur_record in SeqIO.parse(in_fasta, "fasta"):
38 |             gene_name = cur_record.name
39 |             cur_record.seq = cur_record.seq.upper()
40 |             A_count = cur_record.seq.count('A')
41 |             C_count = cur_record.seq.count('C')
42 |             G_count = cur_record.seq.count('G')
43 |             T_count = cur_record.seq.count('T')
44 |             length = len(cur_record.seq)
45 |             gc_percentage = float(G_count + C_count) / length
46 |             gc_percentage = round(gc_percentage,2)
47 | 
48 |             values = []
49 | 
50 |             for i in range(0, len(cur_record.seq), step):
51 | 
52 |                 s = cur_record.seq[i - half_window : i + half_window]
53 |                 s = s.upper()
54 |                 g = s.count('G')
55 |                 c = s.count('C')
56 |                 try:
57 |                     window_gc_perc = float(g + c) / window
58 |                 except ZeroDivisionError:
59 |                     window_gc_perc = 0.0
60 |                 values.append(window_gc_perc)
61 | 
62 |             out_file.write(f"{gene_name}\t{length}\t{gc_percentage}\t{values}\n")
63 | 


--------------------------------------------------------------------------------
/bit/bit-calc-variation-in-msa:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | from skbio import TabularMSA, DNA, Protein
 4 | import pandas as pd
 5 | import argparse
 6 | import sys
 7 | 
 8 | parser = argparse.ArgumentParser(description='This script takes an alignment in fasta format as input and returns the Shannon uncertainty values for each column \
 9 |                                               using: http://scikit-bio.org/docs/0.5.3/generated/skbio.alignment.TabularMSA.conservation.html. In output "variation" column: 0 is \
10 |                                               same character in all sequences for that position (highest conservation); 1 is equal probability of any character \
11 |                                               (greatest variability). "Conservation" column is inverse. As written, any ambiguous bases or residues are converted to gap characters. \
12 |                                               For version info, run `bit-version`.')
13 | 
14 | required = parser.add_argument_group('required arguments')
15 | 
16 | required.add_argument("-i", "--input_alignment_fasta", metavar = "<FILE>", help = "Input alignment fasta file", action = "store", dest = "input_alignment_fasta", required = True)
17 | 
18 | parser.add_argument("-g", "--gap_treatment", metavar = "<STR>", help = 'How to treat gaps, either "nan", "ignore", "error", "include" (default: "ignore")', choices = ["nan", "ignore", "error", "include"], action = "store", dest = "gap_treatment", default = "ignore")
19 | parser.add_argument("-t", "--type", metavar = "<STR>", help = 'Either "DNA" or "Protein" (default: "Protein")', choices = ["DNA", "Protein"], action = "store", dest = "type", default = "Protein")
20 | parser.add_argument("-o", "--output_file", metavar = "<FILE>", help = 'Name of output tab-separated file (default: "variation.tsv")', action = "store", dest = "output_tsv", default = "variation.tsv")
21 | 
22 | if len(sys.argv)==1:
23 |     parser.print_help(sys.stderr)
24 |     sys.exit(0)
25 | 
26 | args = parser.parse_args()
27 | 
28 | # i'm not certain unequal alignments are all that would throw this error, so i'm leaving this out for now so skbio just spits out their problem if they have one reading in the alignment
29 | # try:
30 |     # msa = TabularMSA.read(args.input_alignment_fasta, constructor=DNA)
31 | # except ValueError:
32 | #     print('\n\tSorry, it seems not all sequences in the alignment are the same length... :(\n')
33 | #     sys.exit(1)
34 | 
35 | msa = TabularMSA.read(args.input_alignment_fasta, constructor=eval(args.type), lowercase=True)
36 | 
37 | list_of_cleaned_seqs = []
38 | 
39 | # converting degenerate bases to gaps
40 | for seq in msa:
41 | 
42 |     seq = seq.replace(seq.degenerates(), "-")
43 |     list_of_cleaned_seqs.append(seq)
44 | 
45 | clean_msa = TabularMSA(list_of_cleaned_seqs)
46 | 
47 | conserved = clean_msa.conservation(gap_mode=args.gap_treatment)
48 | indexes = list(range(1,clean_msa.shape[1] + 1))
49 | 
50 | df = pd.DataFrame({"position": indexes, "variation":1 - conserved, "conservation": conserved})
51 | 
52 | df.to_csv(args.output_tsv, sep="\t", index=False)
53 | 


--------------------------------------------------------------------------------
/bit/bit-check-for-fastq-dup-headers:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | from Bio import SeqIO
 4 | import sys
 5 | import argparse
 6 | import gzip
 7 | 
 8 | class colors:
 9 |     GREEN = '\033[0;32m'
10 |     YELLOW = '\033[0;33m'
11 |     NC = '\033[0m'
12 | 
13 | parser = argparse.ArgumentParser(description = 'This script is just for checking if there are any duplicate headers in a fastq file. \
14 |                                  For version info, run `bit-version`.')
15 | 
16 | required = parser.add_argument_group('required arguments')
17 | 
18 | required.add_argument("-i", "--input-fastq", metavar = "<FILE>", help = "Fastq file", action = "store", required = True)
19 | parser.add_argument("--not-gzipped", help = "Add this flag if the input fastq is not gzipped (program expects they are gzipped by default)", action = "store_true")
20 | parser.add_argument("--write-dupes", help = "Add this flag if you want duplicate headers written to a file (will write to 'duplicate-headers.txt')", action = "store_true")
21 | 
22 | if len(sys.argv)==1:
23 |     parser.print_help(sys.stderr)
24 |     sys.exit(0)
25 | 
26 | args = parser.parse_args()
27 | 
28 | headers_dict = {}
29 | seq_count = 0
30 | 
31 | if args.not_gzipped:
32 | 
33 |     with open(args.input_fastq, "rt") as fastq_in:
34 | 
35 |         for seq_record in SeqIO.parse(fastq_in, "fastq"):
36 | 
37 |             seq_count += 1
38 | 
39 |             if seq_record.id in headers_dict:
40 |                 headers_dict[seq_record.id] += 1
41 | 
42 |             else:
43 |                 headers_dict[seq_record.id] = 1
44 | 
45 | else:
46 | 
47 |     with gzip.open(args.input_fastq, "rt") as fastq_in:
48 | 
49 |         for seq_record in SeqIO.parse(fastq_in, "fastq"):
50 | 
51 |             seq_count += 1
52 | 
53 |             if seq_record.id in headers_dict:
54 |                 headers_dict[seq_record.id] += 1
55 | 
56 |             else:
57 |                 headers_dict[seq_record.id] = 1
58 | 
59 | 
60 | dup_keys = [k for k,v in headers_dict.items() if v > 1]
61 | 
62 | if len(dup_keys) > 0:
63 | 
64 |     if len(dup_keys) == 1:
65 |         print(colors.YELLOW + "\n  There was 1 duplicate header among the " + str(seq_count) + " input fastq entries:\n\n    " + colors.NC + str(dup_keys[0]) + "\n")
66 | 
67 |     else:
68 | 
69 |         print(colors.YELLOW + "\n  There were " + str(len(dup_keys)) + " duplicate headers among the " + str(seq_count) + " input fastq entries." + colors.NC)
70 | 
71 |         if not args.write_dupes:
72 |             print("  If you'd like to know which ones, add the `--write-dups` flag.\n")
73 | 
74 |         else:
75 | 
76 |             with open("duplicate-headers.txt", "w") as out:
77 |                 out.write("\n".join(dup_keys))
78 |                 out.write("\n")
79 | 
80 |             print("  They were written to 'duplicate-headers.txt'.\n")
81 | 
82 | else:
83 | 
84 |     print(colors.GREEN + "\n  There were no duplicate headers detected among the " + str(seq_count) + " input fastq entries :)\n" + colors.NC)
85 | 


--------------------------------------------------------------------------------
/bit/bit-colnames:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | GREEN='\033[0;32m'
 4 | RED='\033[0;31m'
 5 | NC='\033[0m'
 6 | 
 7 | if [ "$#" == 0 ] || [ $1 == "-h" ]; then
 8 |     printf "\n  This script returns the column names (with number) from a tab-separated file.\n"
 9 |     printf "  For version info, run \`bit-version\`.\n\n"
10 |     printf "    Usage:\n\t bit-colnames input.tsv\n\n"
11 |     exit
12 | fi
13 | 
14 | if [ -f $1 ]; then
15 |     
16 |     head -n1 $1 | tr "\t" "\n" | cat -n
17 | 
18 | else
19 |     echo -e "     ${RED}Input file not found :/${NC}" >&2
20 |     exit 1
21 | fi
22 | 


--------------------------------------------------------------------------------
/bit/bit-combine-bracken-and-add-lineage:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env bash
  2 | 
  3 | # setting some colors
  4 | RED='\033[0;31m'
  5 | GREEN='\033[0;32m'
  6 | NC='\033[0m'
  7 | 
  8 | 
  9 | ### HELP INFO
 10 | ## called by program name with no arguments or with "-h" as only positional argument ##
 11 | if [ "$#" == 0 ] || [ $1 == "-h" ] || [ $1 == "help" ]; then
 12 | 
 13 |     printf "\n --------------------------------  HELP INFO  --------------------------------- \n\n"
 14 |     printf "  This program combines multiple bracken sample outputs and adds full lineage info.\n"
 15 |     printf "  The local NCBI taxonomy database that \`taxonkit\` uses can be updated at anytime\n"
 16 |     printf "  with \`bit-update-ncbi-taxonomy\`. Recommended at least weekly. For version info,\n"
 17 |     printf "  run \`bit-version\`."
 18 | 
 19 |     printf "\n    Required:\n\n"
 20 |     printf "      – [-i <file>] An input file holding either a single-column holding the input\n"
 21 |     printf "                    file names, or a tab-delimited 2-column file holding input\n"
 22 |     printf "                    filenames in column 1 and the wanted sample names in column 2.\n"
 23 | 
 24 |     printf "\n    Optional:\n\n"
 25 | 
 26 |     printf '      - [-o <str>]  Specify the output file name. Default: "Combined-taxonomy.tsv"\n'
 27 |     printf '      - [-d <str>]  Specify the taxonkit database location. Default: "~/.taxonkit"\n'
 28 | 
 29 |     printf "\n    Example usage:\n\n\t bit-combine-bracken-and-add-lineage -i input-files.tsv\n\n"
 30 | 
 31 |     exit
 32 | fi
 33 | 
 34 | ### PARSING ARGUMENTS
 35 | ## setting defaults
 36 | output_file="Combined-taxonomy.tsv"
 37 | database="~/.taxonkit"
 38 | 
 39 | while getopts :i:o:d: args
 40 | do
 41 |     case "${args}"
 42 |     in
 43 |         i) input_file=${OPTARG};;
 44 |         o) output_file=${OPTARG};;
 45 |         d) database=${OPTARG};;
 46 |         \?) printf "\n  ${RED}Invalid argument: -${OPTARG}${NC}\n\n    Run with no arguments or '-h' only to see help menu.\n\n" >&2 && exit
 47 |     esac
 48 | done
 49 | 
 50 | 
 51 | ### CHECKING REQUIRED INPUT WAS PROVIDED
 52 | if [ ! -n "$input_file" ]; then
 53 |     printf "\n  ${RED}You need to provide an input file to '-i' :(${NC}\n"
 54 |     printf "\nExiting for now.\n\n"
 55 |     exit
 56 | fi
 57 | 
 58 | 
 59 | ### COMBINING MULTIPLE BRACKEN OUTPUT TABLES ###
 60 | 
 61 | printf "\n\t${GREEN}Combining tables...${NC}\n\n"
 62 | 
 63 | # this `helper-bit-combine-bracken.py` script was modified from the `combine_bracken_outputs.py` script provided by Jennifer Lu (jlu26@jhmi.edu) that comes with bracken
 64 | 
 65 |     # checking if there are 2 columns in input file (and therefore we are providing sample names too, otherwise base of filename is used)
 66 | if grep -q '\t' ${input_file}; then
 67 | 
 68 |     helper-bit-combine-bracken.py -i $(cut -f 1 ${input_file}) -n $(cut -f 2 ${input_file}  | tr "\n" "," | sed 's/,$//') -o combined-bracken.tmp
 69 | 
 70 | else
 71 | 
 72 |     helper-bit-combine-bracken.py -i $(cut -f 1 ${input_file}) -o combined-bracken.tmp
 73 | 
 74 | fi
 75 | 
 76 | 
 77 | ### GETTING FULL LINEAGE INFO WITH TAXONKIT ###
 78 | printf "\n\t${GREEN}Getting full lineage info...${NC}\n\n"
 79 | tail -n +2 combined-bracken.tmp | cut -f 2 | taxonkit lineage --data-dir ${database} | taxonkit reformat --data-dir ${database} -r NA | cut -f 3 | tr ";" "\t" | cut -f 1-7 > lineages.tmp
 80 | 
 81 |     # adding a header
 82 | cat <(printf "domain\tphylum\tclass\torder\tfamily\tgenus\tspecies\n") lineages.tmp > lineages-tab.tmp
 83 | 
 84 |     # combining lineage info and bracken combined-sample table
 85 | paste lineages-tab.tmp <(cut -f 2- combined-bracken.tmp) > ${output_file}
 86 | 
 87 |     # clearing intermediate files
 88 | rm combined-bracken.tmp lineages.tmp lineages-tab.tmp
 89 | 
 90 | printf "\n\t\t${GREEN}DONE!${NC}\n\n"
 91 | printf "\tOutput written to: $output_file\n\n"
 92 | 
 93 | 
 94 | ### FINAL OUTPUT TABLE FORMAT ###
 95 | 
 96 | ## columns in "Combined-bracken-species-taxonomy-for-other-microbes.tsv"
 97 | # 1.  domain
 98 | # 2.  phylum
 99 | # 3.  class
100 | # 4.  order
101 | # 5.  family
102 | # 6.  genus
103 | # 7.  species (which is genus and species in NCBI)
104 | # 8.  taxonomy_id
105 | # 9.  taxonomy_lvl
106 | # 10. ..._num (sample info starts here, this first one is number of reads classified)
107 | # 11. ..._frac (This one is the fraction normalized to 1 of the same sample. 
108 |     # Importantly, bracken only considers those classified. So this does not include unclassified reads. 
109 |     # They will always sum to 1, or very near 1, and there is no row currently accounting for unclassified.)
110 |     # The rest of the columns are samples just like 10 and 11 above, 2 columns, first is read counts, second is fraction.
111 | 
112 | # We can get "Unclassified" from the kraken report, but i didn't do that currently because those numbers don't add up to the total starting reads either (as all that was included in that run were things that were already filtered)
113 | 


--------------------------------------------------------------------------------
/bit/bit-combine-go-summaries:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | import os
  4 | import argparse
  5 | import sys
  6 | import pandas as pd
  7 | 
  8 | parser = argparse.ArgumentParser(description='This script is for combining GO summary tables produced by\
  9 |                                               `bit-summarize-go-annotations`. For version info, run `bit-version`.')
 10 | 
 11 | 
 12 | required = parser.add_argument_group('required arguments')
 13 | 
 14 | required.add_argument("-i", "--input-files", metavar = "<FILE(s)>", nargs = "+",
 15 |                       type = str, help = "space-delimited list of `bit-summarize-go-annotations` output files",
 16 |                       action = "store", required = True)
 17 | parser.add_argument("-n", "--sample-names", metavar = "<NAME(s)>",
 18 |                     help = 'Sample names provided as a comma-delimited list, be sure it matches the order of the input files (by default will use basename of input files up to last period)',
 19 |                     action = "store", default = '')
 20 | parser.add_argument("-o", "--output-file", metavar = "<FILE>",
 21 |                     help = 'Output combined summaries (default: "combined-GO-summaries.tsv")',
 22 |                     action = "store", default = "combined-GO-summaries.tsv")
 23 | 
 24 | if len(sys.argv)==1:
 25 |   parser.print_help(sys.stderr)
 26 |   sys.exit(0)
 27 | 
 28 | args = parser.parse_args()
 29 | 
 30 | # setting up variables
 31 | sample_counts = {}
 32 | total_counts = {}
 33 | # file name is key, and sample name is value
 34 | all_samples = {}
 35 | 
 36 | # setting sample names and intializing counts
 37 | if len(args.sample_names) == 0:
 38 |     for file in args.input_files:
 39 |         curr_sample = os.path.basename(file).rsplit('.', 1)[0]
 40 |         total_counts[curr_sample] = 0
 41 | 
 42 |         if file in all_samples:
 43 |             print('\n    It seems the file "' + file + '" is trying to get in here twice.')
 44 |             print("\n    That's not gonna fly :(\n")
 45 |             sys.exit(1)
 46 | 
 47 |         all_samples[file] = curr_sample
 48 | 
 49 | else:
 50 | 
 51 |     # checking if sample names provided the length equals the number of input files
 52 |     if len(args.sample_names.split(",")) != len(args.input_files):
 53 |         print("\n    It seems the number of provided sample names doesn't match the number of provided input files :(")
 54 |         print("\n    Check usage with `bit-combine-go-summaries -h`.\n")
 55 |         sys.exit(0)
 56 | 
 57 |     # setting iterator
 58 |     i = 0
 59 | 
 60 |     for curr_sample in args.sample_names.split(","):
 61 |         total_counts[curr_sample] = 0
 62 |         all_samples[args.input_files[i]] = curr_sample
 63 |         i += 1
 64 | 
 65 | # keeping a nested dictionary of info for all GO terms that show up in any table
 66 | GO_dict = {}
 67 | 
 68 | # building counts/percents table
 69 | building_tab = pd.DataFrame(columns=["GO_term"])
 70 | 
 71 | ## working on each file
 72 | for sample_key in all_samples:
 73 | 
 74 |     # reading current file into pandas dataframe
 75 |     curr_tab = pd.read_csv(sample_key, sep="\t")
 76 | 
 77 | 
 78 |     # adding to building GO dictionary of all GO terms in the input tables
 79 |     for row in curr_tab.itertuples():
 80 | 
 81 |         if row[1] not in GO_dict:
 82 |             GO_dict[row[1]] = {'namespace': row[2], 'depth': row[3], 'name': row[4]}
 83 | 
 84 |     # trimming down current table
 85 |     curr_sub_tab = curr_tab[["GO_term", "counts", "percent_of_annotated"]]
 86 |     # and changing names to match sample ID
 87 |     curr_sub_tab.columns = ['GO_term', str(all_samples[sample_key]) + "_counts", str(all_samples[sample_key]) + "_perc_of_annotated"]
 88 | 
 89 |     # merging with master tab on GO_term
 90 |     building_tab = building_tab.merge(curr_sub_tab, on="GO_term", how="outer")
 91 | 
 92 | ## replacing NAs with 0s
 93 | building_tab = building_tab.fillna(0)
 94 | 
 95 | ## making GO info dict into dataframe and merging into final table
 96 | go_df = pd.DataFrame.from_dict(GO_dict, orient="index")
 97 | # moving index to column and renaming
 98 | go_df.reset_index(inplace=True)
 99 | go_df.rename(columns = {'index': 'GO_term'}, inplace=True)
100 | # merging
101 | final_tab = go_df.merge(building_tab, on="GO_term", how="outer")
102 | # sorting
103 | final_tab.sort_values(by=["namespace", "depth"], inplace=True)
104 | 
105 | ## writing out
106 | with open(args.output_file, "w") as out:
107 |     out.write(final_tab.to_csv(index=False, sep="\t"))
108 | 


--------------------------------------------------------------------------------
/bit/bit-combine-kraken2-taxon-summaries:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | """
  4 | This script combines the outputs from the `bit-kraken2-to-taxon-summaries` program.
  5 | """
  6 | 
  7 | import sys
  8 | import argparse
  9 | import pandas as pd
 10 | import os
 11 | 
 12 | parser = argparse.ArgumentParser(description="This script combines the outputs from the `bit-kraken2-to-taxon-summaries` program. \
 13 |                                               For version info, run `bit-version`.")
 14 | 
 15 | required = parser.add_argument_group('required arguments')
 16 | 
 17 | required.add_argument("-i", "--input-files", metavar = "<FILE(s)>", nargs = "+", type = str,
 18 |                       help = "space-delimited list of `bit-kraken2-to-taxon-summaries` output files, can be provided with shell wildcards",
 19 |                       action = "store", required = True)
 20 | parser.add_argument("-n", "--sample-names", metavar = "<NAME(s)>",
 21 |                     help = 'Sample names provided as a comma-delimited list, be sure it matches the order of the input files (by default will use basename of input files up to last period)',
 22 |                     action = "store", default = '')
 23 | parser.add_argument("-o", "--output-file", metavar = "<FILE>",
 24 |                     help = 'Output combined summaries (default: "combined-kraken2-taxon-summaries.tsv")',
 25 |                     action = "store", default = "combined-kraken2-taxon-summaries.tsv")
 26 | 
 27 | if len(sys.argv)==1:
 28 |   parser.print_help(sys.stderr)
 29 |   sys.exit(0)
 30 | 
 31 | args = parser.parse_args()
 32 | 
 33 | # setting up variable
 34 | # file name is key, and sample name is value
 35 | all_samples = {}
 36 | 
 37 | # setting sample names and intializing counts
 38 | if len(args.sample_names) == 0:
 39 |     for file in args.input_files:
 40 |         curr_sample = os.path.basename(file).rsplit('.', 1)[0]
 41 | 
 42 |         if file in all_samples:
 43 |             print('\n    It seems the file "' + file + '" is trying to get in here twice.')
 44 |             print("\n    That's not gonna fly :(\n")
 45 |             sys.exit(1)
 46 | 
 47 |         all_samples[file] = curr_sample
 48 | 
 49 | else:
 50 | 
 51 |     # checking if sample names provided the length equals the number of input files
 52 |     if len(args.sample_names.split(",")) != len(args.input_files):
 53 |         print("\n    It seems the number of provided sample names doesn't match the number of provided input files :(")
 54 |         print("\n    Check usage with `bit-combine-kraken2-taxon-summaries -h`.\n")
 55 |         sys.exit(0)
 56 | 
 57 |     # setting iterator
 58 |     i = 0
 59 | 
 60 |     for curr_sample in args.sample_names.split(","):
 61 | 
 62 |         all_samples[args.input_files[i]] = curr_sample
 63 |         i += 1
 64 | 
 65 | # keeping dictionary of all taxids (keys) and full lineages (values)
 66 | taxid_dict = {}
 67 | 
 68 | # building final table
 69 | building_tab = pd.DataFrame(columns=["taxid"])
 70 | 
 71 | ## working on each file
 72 | for sample_key in all_samples:
 73 | 
 74 |     # reading current file into pandas dataframe
 75 |     curr_tab = pd.read_csv(sample_key, sep="\t")
 76 | 
 77 |     # adding to building taxid dictionary
 78 |     for row in curr_tab.itertuples():
 79 |         if row[1] not in taxid_dict:
 80 |             taxid_dict[row[1]] = {'domain': row[2], 'phylum': row[3], 'class': row[4], 'order': row[5], 'family': row[6], 'genus': row[7], 'species': row[8]}
 81 | 
 82 |     # trimming down current table
 83 |     curr_sub_tab = curr_tab[["taxid", "read_counts", "percent_of_reads"]]
 84 | 
 85 |     # and changing count and percent column names to match sample ID
 86 |     curr_sub_tab.columns = ['taxid', str(all_samples[sample_key]) + "_read_counts", str(all_samples[sample_key]) + "_perc_of_reads"]
 87 | 
 88 |     # merging with master tab on GO_term
 89 |     building_tab = building_tab.merge(curr_sub_tab, on="taxid", how="outer")
 90 | 
 91 | ## replacing NAs with 0s
 92 | building_tab = building_tab.fillna(0)
 93 | 
 94 | ## making taxid dictionary into dataframe and merging into final table
 95 | taxid_df = pd.DataFrame.from_dict(taxid_dict, orient="index")
 96 | 
 97 | # moving index to column and renaming
 98 | taxid_df.reset_index(inplace=True)
 99 | taxid_df.rename(columns = {'index': 'taxid'}, inplace=True)
100 | 
101 | # merging
102 | final_tab = taxid_df.merge(building_tab, on="taxid", how="outer")
103 | 
104 | # sorting
105 | final_tab.sort_values(by=["taxid"], inplace=True)
106 | 
107 | # changing NAs to "NA"
108 | final_tab = final_tab.fillna("NA")
109 | 
110 | ## writing out
111 | with open(args.output_file, "w") as out:
112 |     out.write(final_tab.to_csv(index=False, sep="\t"))
113 | 
114 | 


--------------------------------------------------------------------------------
/bit/bit-count-bases:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | RED='\033[0;31m'
 4 | NC='\033[0m'
 5 | 
 6 | if [ "$#" == 0 ] || [ $1 == "-h" ]; then
 7 |     printf "\n  This script returns the total number of bases (or amino acids) in a fasta file.\n"
 8 |     printf "  For version info, run \`bit-version\`.\n\n"
 9 |     printf "    Usage:\n\t bit-count-bases input.fasta\n\n"
10 |     exit
11 | fi
12 | 
13 | if [ -f $1 ]; then
14 |     echo $(grep -v ">" $1 | wc | awk '{print $3-$1}')
15 | 
16 | else
17 |     echo -e "     ${RED}Input file not found :/${NC}" >&2
18 |     exit 1
19 | fi
20 | 


--------------------------------------------------------------------------------
/bit/bit-count-bases-per-seq:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | from Bio import SeqIO
 4 | import argparse
 5 | import sys
 6 | import os
 7 | from statistics import mean, median
 8 | 
 9 | parser = argparse.ArgumentParser(description = "This script takes a multifasta as input and returns a tab-delimited file with two columns, \
10 |                                               header and number of bases or amino acids for each sequence. It also \
11 |                                               prints out some general stats. For version info, run `bit-version`.")
12 | 
13 | required = parser.add_argument_group('required arguments')
14 | 
15 | required.add_argument("-i", "--input-fasta", metavar = "<FILE>", help = "Original fasta file", action = "store", required = True)
16 | parser.add_argument("-o", "--output-file", metavar = "<FILE>", help = 'Name of output tab-delimited file (default: "Num-bps.tsv")', action = "store", \
17 |                     default = "Num-bps.tsv")
18 | 
19 | if len(sys.argv) == 1:
20 |     parser.print_help(sys.stderr)
21 |     sys.exit(0)
22 | 
23 | args = parser.parse_args()
24 | 
25 | 
26 | # starting list to hold seq lengths so we can print out some summary stats
27 | lengths_list = []
28 | 
29 | # counting number of seqs
30 | n = 0
31 | 
32 | with open(args.input_fasta, "r") as in_fasta:
33 | 
34 |     with open(args.output_file, "w") as out_file:
35 | 
36 |         for seq_record in SeqIO.parse(in_fasta, "fasta"):
37 | 
38 |             out_file.write(seq_record.id + "\t" + str(len(seq_record.seq)) + "\n")
39 | 
40 |             lengths_list.append(len(seq_record.seq))
41 | 
42 |             n += 1
43 | 
44 | print("\n    Number of seqs:  " + str(n))
45 | print("    Min. length:     " + str(min(lengths_list)))
46 | print("    Max length:      " + str(max(lengths_list)))
47 | print("    Mean length:     " + str(round(mean(lengths_list), 2)))
48 | print("    Median length:   " + str(round(median(lengths_list), 2)) + "\n")
49 | print("  All seq lengths written to: '" + args.output_file + "'\n")
50 | 


--------------------------------------------------------------------------------
/bit/bit-dedupe-fasta-headers:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | from Bio import SeqIO
 4 | import sys
 5 | import argparse
 6 | import os
 7 | 
 8 | parser = argparse.ArgumentParser(description='This script will append a number to headers if that exact ID has already appeared in the fasta file. For version info, run `bit-version`.')
 9 | 
10 | required = parser.add_argument_group('required arguments')
11 | 
12 | required.add_argument("-i", "--input-fasta", metavar = "<FILE>", help = "Starting fasta file",
13 |                       action = "store", required = True)
14 | parser.add_argument("-o", "--output-fasta", metavar = "<FILE>", help = 'Output fasta file (default: "Renamed.fasta").', default = "Renamed.fasta")
15 | 
16 | if len(sys.argv)==1:
17 |     parser.print_help(sys.stderr)
18 |     sys.exit(0)
19 | 
20 | args = parser.parse_args()
21 | 
22 | in_fasta = open(args.input_fasta, "r")
23 | out_fasta = open(args.output_fasta, "w")
24 | 
25 | ids = {}
26 | 
27 | for seq_record in SeqIO.parse(in_fasta, "fasta"):
28 | 
29 |     if seq_record.id not in ids:
30 |         ids[seq_record.id] = 1
31 |         out_fasta.write(">" + seq_record.id + "\n")
32 |         out_fasta.write(str(seq_record.seq) + "\n")
33 | 
34 |     else:
35 |         count = ids[seq_record.id] + 1
36 |         ids[seq_record.id] = count
37 |         out_fasta.write(">" + seq_record.id + "_" + str(count) + "\n")
38 |         out_fasta.write(str(seq_record.seq) + "\n")
39 | 
40 | in_fasta.close()
41 | out_fasta.close()
42 | 


--------------------------------------------------------------------------------
/bit/bit-extract-seqs-by-coords:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | from pybedtools import BedTool
 4 | import sys
 5 | import argparse
 6 | 
 7 | parser = argparse.ArgumentParser(description = 'This script takes a multifasta file and tab-delimited file specifying which \
 8 |                                                 contigs and coordinates are wanted and returns a multifasta of the chopped out \
 9 |                                                 sequences."For version info, run `bit-version`.')
10 | 
11 | required = parser.add_argument_group('required arguments')
12 | 
13 | required.add_argument("-i", "--input-fasta", metavar = "<FILE>", help = "Starting fasta file", action = "store", required = True)
14 | required.add_argument("-b", "--bed-file", metavar = "<FILE>", help = "Bed file of desired contigs and coordinates (3 columns - contig, start, end - no header, 0-based counting", required = True)
15 | parser.add_argument("-o", "--output-fasta", metavar = "<FILE>", help = 'Name of output fasta file (default: "extracted-seqs.fasta")', action = "store", default = "extracted-seqs.fasta")
16 | 
17 | if len(sys.argv)==1:
18 |     parser.print_help(sys.stderr)
19 |     sys.exit(0)
20 | 
21 | args = parser.parse_args()
22 | 
23 | coordinates_file = BedTool(args.bed_file)
24 | fasta = BedTool(args.input_fasta)
25 | seq = coordinates_file.sequence(fi = fasta)
26 | 
27 | with open(args.output_fasta, "w") as out_fasta:
28 |     out_fasta.write(open(seq.seqfn).read())
29 | 


--------------------------------------------------------------------------------
/bit/bit-fasta-to-bed:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | from Bio import SeqIO
 4 | import sys
 5 | import argparse
 6 | import os
 7 | 
 8 | parser = argparse.ArgumentParser(description="This script takes a nucleotide multifasta and returns a tab-delimited bed file (see: https://bedtools.readthedocs.io/en/latest/content/general-usage.html). For version info, run `bit-version`.")
 9 | 
10 | required = parser.add_argument_group('required arguments')
11 | 
12 | required.add_argument("-i", "--input-fasta", metavar = "<FILE>", help = "input fasta file", action = "store", required = True)
13 | parser.add_argument("-o", "--output-bed-file", metavar = "<FILE>", help = 'Name of output bed file (default: "Output.bed")',
14 |                     action = "store", default = "Output.bed")
15 | 
16 | if len(sys.argv)==1:
17 |     parser.print_help(sys.stderr)
18 |     sys.exit(0)
19 | 
20 | args = parser.parse_args()
21 | 
22 | # in_fasta = open(args.input_fasta, "r")
23 | # out_file = open(args.output_file, "w")
24 | 
25 | with open(args.output_bed_file, "w") as out:
26 |     with open(args.input_fasta, "r") as in_fasta:
27 | 
28 |         for record in SeqIO.parse(in_fasta, "fasta"):
29 |             name = record.name
30 |             length = len(record.seq) - 1
31 |             out.write(str(name) + "\t" "0" + "\t" + str(length) + "\n")
32 | 


--------------------------------------------------------------------------------
/bit/bit-fasta-to-genbank:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import argparse
 4 | import sys
 5 | from Bio import SeqIO
 6 | from Bio.Alphabet import generic_dna
 7 | import os
 8 | 
 9 | parser = argparse.ArgumentParser(description = "This script takes a fasta file and converts it into genbank format. For version info, run `bit-version`.")
10 | 
11 | required = parser.add_argument_group('required arguments')
12 | 
13 | required.add_argument('-i', '--input-fasta', metavar = "<FILE>", action = 'store', help='input fasta file', required = True)
14 | parser.add_argument("-o", "--output-genbank-file", metavar = "<FILE>", action = "store", dest = "output_gb", default = "new.gb",
15 |                     help = 'Output genbank file (default: "new.gb")')
16 | 
17 | if len(sys.argv)==1:
18 |     parser.print_help(sys.stderr)
19 |     sys.exit(0)
20 | 
21 | args = parser.parse_args()
22 | 
23 | input_fasta = open(args.input_fasta, "r")
24 | 
25 | output_gb = open(args.output_gb, "w")
26 | 
27 | sequences = list(SeqIO.parse(input_fasta, "fasta"))
28 | 
29 | for seq in sequences:
30 |     seq.seq.alphabet = generic_dna
31 | 
32 | SeqIO.write(sequences, output_gb, "genbank")
33 | 
34 | input_fasta.close()
35 | output_gb.close()
36 | 


--------------------------------------------------------------------------------
/bit/bit-filter-KOFamScan-results:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | """
 4 | This script filters the "detail-tsv"-formatted output of KOFamScan to retain only those above the
 5 | KO-specific score threshold, and retain only the best hit for each gene.
 6 | Outputs a 3-column tab-delimited table with: gene_ID, KO_ID, and KO_annotation
 7 | 
 8 | KOFamScan e.g. usage prior to input here:
 9 |     exec_annotation -p profiles/ -k ko_list --cpu 15 -f detail-tsv -o 5492-KO-tab.tmp 5492-genes.faa --tmp-dir 5492-tmp-KO --report-unannotated
10 | 
11 | Then would be:
12 |     bit-filter-KOFamScan-results -i 5492-KO-tab.tmp -o 5492-annotations.tsv
13 | """
14 | 
15 | import sys
16 | import argparse
17 | import pandas as pd
18 | 
19 | parser = argparse.ArgumentParser(description = "This script filters the 'detail-tsv'-formatted output file from KOFamScan to retain only those above the KO-specific score threshold, and retains only the hit with the lowest e-value for each gene if there are multiple. It outputs a 3-column tab-delimited file with: gene_ID, KO_ID, and KO_annotation. For version info, run `bit-version`")
20 | 
21 | required = parser.add_argument_group('required arguments')
22 | 
23 | required.add_argument("-i", "--input-file", help = "Input annotation table", metavar = "<FILE>", action = "store", required = True)
24 | parser.add_argument("-o", "--output-file", metavar = "<FILE>", help = 'Output table filename (default: "output.tsv")', action = "store", default = "output.tsv")
25 | 
26 | if len(sys.argv)==1:
27 |     parser.print_help(sys.stderr)
28 |     sys.exit(0)
29 | 
30 | args = parser.parse_args()
31 | 
32 | # initializing dictionaries
33 | annot_dict = {}
34 | e_value_dict = {}
35 | 
36 | # looping through input file
37 |   # input table looks like this:
38 | 
39 | """
40 | #   gene name     KO         thrshld score    E-value   "KO definition"
41 | #   ---------     ------     ------- ------   --------- -------------
42 |     k119_6520_1   K01999     126.63  44.1     6.7e-11   "branched-chain amino acid transport system substrate-binding protein"
43 |     k119_6520_1   K11954     433.33  39.0     1.8e-09   "neutral amino acid transport system substrate-binding protein"
44 |     k119_6520_1   K04615     290.13  24.9     1.7e-05   "gamma-aminobutyric acid type B receptor"
45 |     k119_6520_1   K11959     388.40  18.0     0.0035    "urea transport system substrate-binding protein"
46 |     k119_6520_1   K05387     467.63  11.1     0.25      "glutamate receptor, ionotropic, plant"
47 |     k119_19560_1  K02014     164.80  144.6    2.1e-41   "iron complex outermembrane recepter protein"
48 |     k119_19560_1  K15721     575.63  122.5    7.3e-35   "pesticin/yersiniabactin receptor"
49 |     k119_19560_1  K16090     578.13  90.5     3.3e-25   "catecholate siderophore receptor"
50 | """
51 | 
52 | with open(args.input_file, "r") as annots:
53 | 
54 |     for line in annots:
55 | 
56 |         if line.startswith("#"):
57 |             continue
58 | 
59 |         line = line.lstrip("*").strip().split("\t")
60 | 
61 |         # adding gene ID if not present, to ensure all end up in final table
62 |         if line[0] not in annot_dict:
63 |             annot_dict[line[0]] = {"KO_ID":"NA", "KO_function":"NA"}
64 | 
65 |         # nothing there if no annotations for current gene, skipping
66 |         if len(line) == 1:
67 |             continue
68 | 
69 |         else:
70 | 
71 |             # only considering if its score is above the threshold
72 |               # some, though very few, like K15869, don't have a threshold score due to having too few representatives, so if no threshold, just taking
73 |             if line[2] == "" or float(line[3]) > float(line[2]):
74 | 
75 |                 # adding to e_value_dict if not represented already, adding annotation to annot_dict, and moving on
76 |                 if line[0] not in e_value_dict:
77 | 
78 |                     annot_dict[line[0]] = {"KO_ID":line[1], "KO_function":line[5].strip('"')}
79 |                     e_value_dict[line[0]] = line[4]
80 |                     continue
81 | 
82 |                 else:
83 | 
84 |                     # replacing current annotation only if e-value is lower than current
85 |                     if float(line[4]) < float(e_value_dict[line[0]]):
86 | 
87 |                         annot_dict[line[0]] = {"KO_ID":line[1], "KO_function":line[5].strip('"')}
88 |                         e_value_dict[line[0]] = line[4]
89 | 
90 | annot_tab = pd.DataFrame.from_dict(annot_dict, orient="index")
91 | annot_tab.reset_index(inplace=True)
92 | annot_tab.rename(columns = {'index':'gene_ID'}, inplace=True)
93 | 
94 | with open(args.output_file, "w") as out:
95 |     out.write(annot_tab.to_csv(index=False, sep="\t"))
96 | 


--------------------------------------------------------------------------------
/bit/bit-filter-seqs-by-length:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | from Bio import SeqIO
 4 | import argparse
 5 | import sys
 6 | import os
 7 | 
 8 | parser = argparse.ArgumentParser(description="This script takes a multifasta as input and filters out sequences based on length. For version info, run `bit-version`.")
 9 | 
10 | required = parser.add_argument_group('required arguments')
11 | 
12 | required.add_argument("-i", "--input-fasta", help = "Original fasta file", metavar = "<FILE>", action = "store", required = True)
13 | required.add_argument("-m", "--min-length", metavar = "<INT>", help = "minimum length retained", action = "store", dest = "min_len", required = True)
14 | parser.add_argument("-M", "--max-length", metavar = "<INT>", help = "maximum length retained", action = "store", dest = "max_len", default = "9223372036854775807")
15 | parser.add_argument("-o", "--output-file", metavar = "<FILE>", help = 'Name of output fasta file (default: "filtered.fasta")', action = "store", default = "filtered.fasta")
16 | 
17 | if len(sys.argv)==1:
18 |     parser.print_help(sys.stderr)
19 |     sys.exit(0)
20 | 
21 | args = parser.parse_args()
22 | 
23 | in_fasta = open(args.input_fasta, "r")
24 | out_file = open(args.output_file, "w")
25 | min_len = args.min_len
26 | max_len = args.max_len
27 | 
28 | total=0
29 | kept=0
30 | 
31 | for seq_record in SeqIO.parse(in_fasta, "fasta"):
32 | 
33 |     total+=1
34 | 
35 |     if len(seq_record.seq) >= int(min_len) and len(seq_record.seq) <= int(max_len):
36 | 
37 |         kept+=1
38 |         out_file.write(">" + str(seq_record.description) + "\n" + str(seq_record.seq) + "\n")
39 | 
40 | 
41 | perc = round(float(kept) / float(total) * 100, 2)
42 | print("\n\tRetained " + str(kept) + " sequences of the initial " + str(total) + " (" + str(perc) + "%).\n")
43 | 
44 | in_fasta.close()
45 | out_file.close()
46 | 


--------------------------------------------------------------------------------
/bit/bit-filter-table:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import argparse
 4 | import gzip
 5 | 
 6 | parser = argparse.ArgumentParser(description = 'Ad hoc script for filtering a table based on values in a specified column. For version info, run `bit-version`.')
 7 | 
 8 | required = parser.add_argument_group('required arguments')
 9 | 
10 | required.add_argument("-i", "--input-table", metavar = "<FILE>", help = 'Input table', action = "store", dest = "in_tab", required = True)
11 | required.add_argument("-w", "--wanted-values", metavar = "<FILE>", help = 'Wanted values', action = "store", dest = "wanted", required = True)
12 | 
13 | parser.add_argument("-o", "--output-file", metavar = "<FILE>", help='Output table filename (default: "Output.tsv")', action = "store", dest = "out_tab", default = "Output.tsv")
14 | parser.add_argument("-d", "--delimiter", metavar = "<STR>", help = 'Delimiter (default: "\\t")', action = "store", default = "\t")
15 | parser.add_argument("-c", "--column", metavar = "<INT>", help = 'Index of column to filter on (default: 1)', action = "store", default = 1, type = int)
16 | parser.add_argument("--no-header", help='Add if there is no header', action = "store_true")
17 | parser.add_argument("--gz", help = 'Add if the input is gzipped (output will not be)', action = "store_true")
18 | 
19 | args = parser.parse_args()
20 | 
21 | targets = set(line.strip() for line in open(args.wanted))
22 | 
23 | output = open(args.out_tab, "w")
24 | 
25 | if not args.gz:
26 |     input = open(args.in_tab, "r")
27 | else:
28 |     input = gzip.open(args.in_tab, "rt")
29 | 
30 | target_column = args.column - 1
31 | 
32 | 
33 | with open(args.out_tab, "w") as output:
34 | 
35 |     if not args.no_header:
36 |         # only doing this firstline variable because i can't figure out a better way to just print the first line when the header is included (and still be just iterating over the file contents)
37 |         firstline = True
38 | 
39 |         for line in input:
40 | 
41 |             if firstline:
42 |                 output.write(line)
43 |                 firstline = False
44 |                 continue
45 | 
46 |             split_line = line.strip().split(args.delimiter)
47 | 
48 |             if split_line[target_column] in targets:
49 |                 output.write(line)
50 | 
51 |     else:
52 | 
53 |         for line in input:
54 | 
55 |             split_line = line.strip().split(args.delimiter)
56 | 
57 |             if split_line[target_column] in targets:
58 |                 output.write(line)
59 | 
60 | input.close()
61 | 


--------------------------------------------------------------------------------
/bit/bit-gen-iToL-binary-dataset:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | import sys
  4 | import argparse
  5 | import os
  6 | 
  7 | parser = argparse.ArgumentParser(description='This script is for creating a standard iToL binary dataset. For version info, run `bit-version`.')
  8 | 
  9 | required = parser.add_argument_group('required arguments')
 10 | 
 11 | required.add_argument("-g", "--target-genomes", metavar = "<FILE>", help = 'Single-column file with the genomes to color (need to match the IDs in the tree file)', action = "store", required = True)
 12 | parser.add_argument("-d", "--dataset-label", metavar = "<STR>", help = 'Label of the dataset (default: "data")', action = "store", default = "data")
 13 | parser.add_argument("-s", "--shape-to-add", metavar = "<STR>", help = 'Shape to add, must be one of: "square", "circle", "star", "rtriangle", "ltriangle", or "check" (default: "square")', action = "store", dest = "shape", default = "square")
 14 | parser.add_argument("-c", "--color", metavar = "<STR>", help='Color to use of either: "blue", "green", "red", "purple", or "black" (default: "blue", of course, \'cause it\'s the best)', action = "store", default = "blue")
 15 | parser.add_argument("-H", "--height-factor", metavar = "<INT>", help = 'Increase or decrease symbol size. Values below 1 will decrease the standard size, above 1 will increase it (default: "1")', action = "store", dest = "height", default = "1")
 16 | 
 17 | parser.add_argument("-o", "--output-file", metavar = "<FILE>", help = 'Output file for iToL (default: "iToL-binary-dataset.txt")', action = "store", default = "iToL-binary-dataset.txt")
 18 | 
 19 | if len(sys.argv)==1:
 20 |     parser.print_help(sys.stderr)
 21 |     sys.exit(0)
 22 | 
 23 | args = parser.parse_args()
 24 | 
 25 | if args.color == "blue":
 26 |     col = "#434da7"
 27 | elif args.color == "green":
 28 |     col = "#48a743"
 29 | elif args.color == "red":
 30 |     col = "#c01820"
 31 | elif args.color == "purple":
 32 |     col = "#512f9c"
 33 | elif args.color == "black":
 34 |     col = "#000000"
 35 | else:
 36 |     print("\n\tSorry, we're not prepared to handle \"" + str(args.color) + "\" as the color... :(\n")
 37 |     parser.print_help(sys.stderr)
 38 |     sys.exit(1)
 39 | 
 40 | if args.shape not in ["square", "circle", "star", "rtriangle", "ltriangle", "check"]:
 41 |     print("\n\tSorry, we're not prepared to handle \"" + str(args.shape) + "\" as the argument for what shape to use... :(\n")
 42 |     parser.print_help(sys.stderr)
 43 |     sys.exit(1)
 44 | 
 45 | try:
 46 |     height = float(args.height)
 47 | except ValueError:
 48 |     print("\n\tSorry, " + str(args.height) + " doesn't appear to be a number... :(\n")
 49 |     parser.print_help(sys.stderr)
 50 |     sys.exit(1)
 51 | 
 52 | target_list = []
 53 | 
 54 | with open(args.target_genomes, "r") as target_genomes:
 55 |     for genome in target_genomes:
 56 |         target_list.append(genome.strip())
 57 | 
 58 | out_file = open(args.output_file, "w")
 59 | 
 60 | out_file.write("DATASET_BINARY\nSEPARATOR TAB\n\n")
 61 | 
 62 | # setting DATASET_LABEL
 63 | out_file.write("DATASET_LABEL" + "\t" + str(args.dataset_label) + "\n\n")
 64 | 
 65 | # setting dataset main color
 66 | out_file.write("COLOR\t" + str(col) + "\n\n")
 67 | 
 68 | # setting FIELD_LABELS
 69 | out_file.write("FIELD_LABELS\tf1\n\n")
 70 | 
 71 | # setting FIELD_SHAPES
 72 | 
 73 | if args.shape == "square":
 74 |     shape = "1"
 75 | elif args.shape == "circle":
 76 |     shape = "2"
 77 | elif args.shape == "star":
 78 |     shape = "3"
 79 | elif args.shape == "rtriangle":
 80 |     shape = "4"
 81 | elif args.shape == "ltriangle":
 82 |     shape = "5"
 83 | else:
 84 |     shape = "6"
 85 | 
 86 | out_file.write("FIELD_SHAPES\t" + str(shape) + "\n\n")
 87 | 
 88 | # writing out FIELD_COLORS
 89 | out_file.write("FIELD_COLORS\t" + str(col) + "\n\n")
 90 | 
 91 | # writing out HEIGHT_FACTOR
 92 | out_file.write("HEIGHT_FACTOR\t" + str(height) + "\n\n")
 93 | 
 94 | # writing lines for each labels
 95 | out_file.write("DATA\n")
 96 | 
 97 | for target in target_list:
 98 |     out_file.write(str(target) + "\t" + str(shape) + "\n")
 99 | 
100 | out_file.close()
101 | 


--------------------------------------------------------------------------------
/bit/bit-gen-iToL-colorstrip:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | import sys
 3 | import argparse
 4 | import os
 5 | 
 6 | parser = argparse.ArgumentParser(description = 'This script is for creating a standard iToL colorstrip dataset file when given the IDs of the genomes we want to color (formatting comes from iToL help page here: https://itol.embl.de/help/dataset_color_strip_template.txt). For version info, run `bit-version`.')
 7 | 
 8 | required = parser.add_argument_group('required arguments')
 9 | 
10 | required.add_argument("-g", "--target-genomes", metavar = "<FILE>", help = 'Single-column file with the genomes to color (need to match the IDs in the tree file, with no ">")', action = "store", required = True)
11 | parser.add_argument("-l", "--label", metavar = "<STR>", help = 'Label used in the legend table (default: "label1")', action = "store", default = "label1")
12 | parser.add_argument("-c", "--color", metavar = "<STR>", help = 'Color to use, pre-baked options include "blue", "green", "red", "purple", or "black", or can provide the hexcode (default: "blue", of course, \'cause it\'s the best)', action = "store", default = "blue")
13 | parser.add_argument("-w", "--width", metavar = "<INT>", help = 'width of the colorstrip (default: 25)', action = "store", default = "25")
14 | parser.add_argument("--color-branches-too", help = "Add this flag if wanting to color branches also", action = "store_true")
15 | parser.add_argument("-o", "--output-file", metavar = "<FILE>", help = 'Output file for iToL (default: "iToL-colorstrip.txt")', action = "store", default = "iToL-colorstrip.txt")
16 | 
17 | 
18 | if len(sys.argv)==1:
19 |     parser.print_help(sys.stderr)
20 |     sys.exit(0)
21 | 
22 | args = parser.parse_args()
23 | 
24 | if args.color == "blue":
25 |     col = "#434da7"
26 | elif args.color == "green":
27 |     col = "#48a743"
28 | elif args.color == "red":
29 |     col = "#c01820"
30 | elif args.color == "purple":
31 |     col = "#512f9c"
32 | elif args.color == "black":
33 |     col = "#000000"
34 | else:
35 |     if not args.color.startswith("#"):
36 |         print("\n\tSorry, we're not prepared to handle \"" + str(args.color) + "\" as the color... :(\n")
37 |         parser.print_help(sys.stderr)
38 |         sys.exit(1)
39 |     else:
40 |         col = args.color
41 | 
42 | target_list = []
43 | 
44 | with open(args.target_genomes, "r") as target_genomes:
45 |     for genome in target_genomes:
46 |         target_list.append(genome.strip())
47 | 
48 | out_file = open(args.output_file, "w")
49 | 
50 | out_file.write("DATASET_COLORSTRIP" + "\n" + "SEPARATOR TAB" + "\n\n" + "DATASET_LABEL" + "\t" + str(args.label) + "\n" + "COLOR" + "\t" + str(col) + "\n\n")
51 | 
52 | if args.color_branches_too:
53 |     out_file.write("COLOR_BRANCHES\t1\n\n")
54 | else:
55 |     out_file.write("COLOR_BRANCHES\t0\n\n")
56 | 
57 | out_file.write("STRIP_WIDTH" + "\t" + str(args.width) + "\n\n")
58 | 
59 | out_file.write("BORDER_WIDTH" + "\t" + "1" + "\n")
60 | out_file.write("BORDER_COLOR" + "\t" + "#999999" + "\n\n")
61 | 
62 | out_file.write("DATA\n\n")
63 | 
64 | # writing out primary data lines
65 | for target in target_list:
66 |     out_file.write(str(target) + "\t" + str(col) + "\t" + str(args.label) + "\n")
67 | 
68 | out_file.close()
69 | 


--------------------------------------------------------------------------------
/bit/bit-gen-iToL-map:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import sys
 4 | import argparse
 5 | import os
 6 | 
 7 | parser = argparse.ArgumentParser(description = 'This script is for creating a standard iToL "label" and/or "branch" color file when given the IDs of the genomes you want to color. For version info, run `bit-version`.')
 8 | 
 9 | required = parser.add_argument_group('required arguments')
10 | 
11 | required.add_argument("-g", "--target-genomes", metavar = "<FILE>", help = 'Single-column file with the genomes to color (need to match the IDs in the tree file, with no ">")', action = "store", required = True)
12 | parser.add_argument("-w", "--what-to-color", metavar = "<STR>", help = 'What to color, must be: "branches", "labels", or "both" (default: "both")', action = "store", dest = "to_color", default = "both")
13 | parser.add_argument("-c", "--color", metavar = "<STR>", help = 'Color to use, pre-baked options include "blue", "green", "red", "purple", or "black", or can provide the hexcode (default: "blue", of course, \'cause it\'s the best)', action = "store", dest = "color", default = "blue")
14 | parser.add_argument("-l", "--line-weight", metavar = "<INT>", help = 'Line weight if coloring branches (default: "2")', action = "store", default = "2")
15 | parser.add_argument("-o", "--output-file", help = 'Output file for iToL (default: "iToL-colors.txt")', action = "store", default = "iToL-colors.txt")
16 | 
17 | if len(sys.argv)==1:
18 |     parser.print_help(sys.stderr)
19 |     sys.exit(0)
20 | 
21 | args = parser.parse_args()
22 | 
23 | if args.color == "blue":
24 |     col = "#434da7"
25 | elif args.color == "green":
26 |     col = "#48a743"
27 | elif args.color == "red":
28 |     col = "#c01820"
29 | elif args.color == "purple":
30 |     col = "#512f9c"
31 | elif args.color == "black":
32 |     col = "#000000"
33 | else:
34 |     if not args.color.startswith("#"):
35 |         print("\n\tSorry, we're not prepared to handle \"" + str(args.color) + "\" as the color... :(\n")
36 |         parser.print_help(sys.stderr)
37 |         sys.exit(1)
38 |     else:
39 |         col = args.color
40 | 
41 | if args.to_color not in ["both", "branches", "labels"]:
42 |     print("\n\tSorry, we're not prepared to handle \"" + str(args.to_color) + "\" as the argument for what to color... :(\n")
43 |     parser.print_help(sys.stderr)
44 |     sys.exit(1)
45 | 
46 | try:
47 |     line_weight = float(args.line_weight)
48 | except ValueError:
49 |     print("\n\tSorry, " + str(args.line_weight) + " doesn't appear to be a number... :(\n")
50 |     parser.print_help(sys.stderr)
51 |     sys.exit(1)
52 | 
53 | target_list = []
54 | 
55 | with open(args.target_genomes, "r") as target_genomes:
56 |     for genome in target_genomes:
57 |         target_list.append(genome.strip())
58 | 
59 | out_file = open(args.output_file, "w")
60 | 
61 | out_file.write("TREE_COLORS\nSEPARATOR TAB\nDATA\n\n")
62 | 
63 | # writing lines for coloring labels if needed
64 | if args.to_color in ["both", "labels"]:
65 | 
66 |     for target in target_list:
67 |         out_file.write(str(target) + "\tlabel\t" + str(col) + "\tbold\n")
68 | 
69 | # writing lines for coloring branches if needed
70 | if args.to_color in ["both", "branches"]:
71 | 
72 |     for target in target_list:
73 |         out_file.write(str(target) + "\tbranch\t" + str(col) + "\tnormal\t" + str(line_weight) + "\n")
74 | 
75 | out_file.close()
76 | 


--------------------------------------------------------------------------------
/bit/bit-gen-iToL-text-dataset:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import sys
 4 | import argparse
 5 | import os
 6 | 
 7 | parser = argparse.ArgumentParser(description = 'This script is for creating a standard iToL text dataset. For version info, run `bit-version`.')
 8 | 
 9 | required = parser.add_argument_group('required arguments')
10 | 
11 | required.add_argument("-g", "--target-genomes", metavar = "<FILE>", help = 'Single-column file with the genomes to color (need to match the IDs in the tree file)', action = "store", required = True)
12 | required.add_argument("-l", "--text-to-add", metavar = "<STR>", help = 'Text to add to the target genomes', action = "store", dest = "text", required = True)
13 | parser.add_argument("-c", "--color", metavar = "<STR>", help = 'Color to use of either: "blue", "green", "red", "purple", or "black" (default: "blue", of course, \'cause it\'s the best)', action = "store", default = "blue")
14 | 
15 | parser.add_argument("-o", "--output-file", metavar = "<FILE>", help = 'Output file for iToL (default: "iToL-text-dataset.txt")', action = "store", default = "iToL-text-dataset.txt")
16 | 
17 | if len(sys.argv)==1:
18 |     parser.print_help(sys.stderr)
19 |     sys.exit(0)
20 | 
21 | args = parser.parse_args()
22 | 
23 | if args.color == "blue":
24 |     col = "#434da7"
25 | elif args.color == "green":
26 |     col = "#48a743"
27 | elif args.color == "red":
28 |     col = "#c01820"
29 | elif args.color == "purple":
30 |     col = "#512f9c"
31 | elif args.color == "black":
32 |     col = "#000000"
33 | else:
34 |     print("\n\tSorry, we're not prepared to handle \"" + str(args.color) + "\" as the color... :(\n")
35 |     parser.print_help(sys.stderr)
36 |     sys.exit(1)
37 | 
38 | target_list = []
39 | 
40 | with open(args.target_genomes, "r") as target_genomes:
41 |     for genome in target_genomes:
42 |         target_list.append(genome.strip())
43 | 
44 | out_file = open(args.output_file, "w")
45 | 
46 | out_file.write("DATASET_TEXT\nSEPARATOR TAB\n\n")
47 | 
48 | # setting DATASET_LABEL
49 | out_file.write("DATASET_LABEL\tdata\n\n")
50 | 
51 | # setting dataset main color
52 | out_file.write("COLOR\t" + str(col) + "\n\n")
53 | 
54 | # writing lines for each labels
55 | out_file.write("DATA\n")
56 | 
57 | for target in target_list:
58 |     out_file.write(str(target) + "\t" + str(args.text) + "\t" + "-1" + "\t" + str(col) + "\t" + "normal" + "\t" + "1" + "\t" + "0" + "\n")
59 | 
60 | out_file.close()
61 | 


--------------------------------------------------------------------------------
/bit/bit-genbank-locus-clean-slate:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import warnings
 4 | warnings.filterwarnings("ignore")
 5 | 
 6 | from Bio import SeqIO
 7 | import argparse
 8 | import sys
 9 | import os
10 | import subprocess
11 | 
12 | parser = argparse.ArgumentParser(description = "Clean slate for LOCUS names in genbank files that are problematic (can be the case, for example, if annotated by NCBI but not officially released yet). This is only helpful if the original LOCUS names don't matter to us, of course. For version info, run `bit-version`.")
13 | 
14 | required = parser.add_argument_group('required arguments')
15 | 
16 | required.add_argument("-i", "--input-gb", metavar = "<FILE>", help = 'Input Genbank file (e.g. "*.gbk", "*.gb", "*.gbff")', action = "store", required = True)
17 | parser.add_argument("-w", "--wanted-name", metavar = "<STR>", help = 'New locus name prefix  (default: "Unknown")', action = "store", default = "Unknown")
18 | parser.add_argument("-o", "--output-gb", metavar = "<FILE>", help = 'Output genbank file (default: "clean.gb")', action = "store", default = "clean.gb")
19 | 
20 | if len(sys.argv)==1:
21 |     parser.print_help(sys.stderr)
22 |     sys.exit(0)
23 | 
24 | args = parser.parse_args()
25 | 
26 | tmp_file = args.input_gb + ".tmp"
27 | new_name = args.wanted_name
28 | 
29 | tmp = open(tmp_file, "w")
30 | 
31 | subprocess.call(['sed', 's/^LOCUS.*$/LOCUS       noname          0 bp    DNA     linear   BCT 00-MIK-0000/', args.input_gb], stdout=tmp)
32 | tmp.close()
33 | 
34 | output_gb = open(args.output_gb, "w")
35 | 
36 | recs = [rec for rec in SeqIO.parse(args.input_gb + ".tmp", "genbank")]
37 | 
38 | num = 0
39 | 
40 | for rec in recs:
41 |     num += 1
42 |     rec.name = new_name + "_" + str(num)
43 | 
44 |     output_gb.write(rec.format("genbank"))
45 | 
46 | output_gb.close()
47 | os.remove(tmp_file)
48 | 


--------------------------------------------------------------------------------
/bit/bit-genbank-to-AA-seqs:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | from Bio import SeqIO
 4 | import argparse
 5 | import re
 6 | import sys
 7 | import os
 8 | 
 9 | parser = argparse.ArgumentParser(description = "This script takes a genbank file and returns amino acid sequences for all coding sequences. For version info, run `bit-version`.")
10 | 
11 | required = parser.add_argument_group('required arguments')
12 | 
13 | required.add_argument("-i", "--input-gb", metavar = "<FILE>", help = 'input Genbank file (e.g. "*.gbk", "*.gb", "*.gbff")', action = "store", required = True)
14 | parser.add_argument("-o", "--output-fasta", metavar = "<FILE>", help = 'Output fasta file (default: "output.faa")', action = "store", default = "output.faa")
15 | 
16 | if len(sys.argv)==1:
17 |     parser.print_help(sys.stderr)
18 |     sys.exit(0)
19 | 
20 | args = parser.parse_args()
21 | 
22 | input_gb = open(args.input_gb, "r")
23 | 
24 | output_fasta = open(args.output_fasta, "w")
25 | 
26 | recs = [rec for rec in SeqIO.parse(input_gb, "genbank")]
27 | 
28 | note_terms_to_exclude = ["frameshifted", "internal stop", "incomplete"] # dumping gene if noted as these in the "note" section of the call to keep only complete genes
29 | location_terms_to_exclude = ["join", "<", ">"] # dumping gene if "location" section contains any of these: "join" means the gene call spans multiple contigs; "<" or ">" means the gene call runs off a contig
30 | 
31 | for rec in recs:
32 | 
33 |     genes = [gene for gene in rec.features if gene.type =="CDS"] # focusing on features annotated as "CDS"
34 | 
35 |     for gene in genes:
36 | 
37 |         location = str(gene.location)
38 | 
39 |         # dumping gene if "location" section contains any of these terms set above: "join" means the gene call spans multiple contigs; "<" or ">" means the gene call runs off a contig
40 |         if any(exclusion_term in location for exclusion_term in location_terms_to_exclude):
41 |             continue
42 | 
43 |         if "note" in gene.qualifiers:
44 |             note = str(gene.qualifiers["note"][0])
45 |         else:
46 |             note = ""
47 | 
48 |         # dumping gene if noted as any of these in the "note" section set above
49 |         if any(exclusion_term in note for exclusion_term in note_terms_to_exclude):
50 |             continue
51 | 
52 |         # dumping if overlapping translation frame
53 |         if "transl_except" in gene.qualifiers:
54 |             continue
55 | 
56 |         # dumping if noted a pseudo gene
57 |         if "pseudo" in gene.qualifiers:
58 |             continue
59 | 
60 |         if "locus_tag" in gene.qualifiers:
61 |             locus_tag = str(gene.qualifiers["locus_tag"][0])
62 |         else:
63 |             locus_tag = "No_locus_tag"
64 | 
65 |         if "protein_id" in gene.qualifiers:
66 |             protein_id = str(gene.qualifiers["protein_id"][0])
67 |         else:
68 |             protein_id = "No_protein_id"
69 | 
70 |         if "product" in gene.qualifiers:
71 |             product = str(gene.qualifiers["product"][0])
72 |         else:
73 |             product = "No_product"
74 | 
75 |         output_fasta.write(f">{product}_{locus_tag}_{protein_id}\n{gene.qualifiers['translation'][0]}\n")
76 | 
77 | input_gb.close()
78 | output_fasta.close()
79 | 


--------------------------------------------------------------------------------
/bit/bit-genbank-to-cds-table:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import sys
 4 | import argparse
 5 | import pandas as pd
 6 | from Bio import SeqIO
 7 | 
 8 | parser = argparse.ArgumentParser(description = "This script takes a genbank-formatted file and extracts basic info \
 9 |                                  for every 'CDS' feature including: 'gene', 'protein_id', 'locus_tag', and 'product'. \
10 |                                  It then writes those out to a tab-delimited file. For version info, run `bit-version`.",
11 |                                  epilog="Ex. usage: bit-genbank-to-cds-table -i input.gb -o output.tsv")
12 | 
13 | required = parser.add_argument_group('REQUIRED PARAMETERS')
14 | optional = parser.add_argument_group('OPTIONAL PARAMETERS')
15 | 
16 | required.add_argument("-i", "--input-gb", help = "input genbank file",
17 |                       metavar = "<FILE>", required = True)
18 | 
19 | optional.add_argument("-o", "--output-tsv", help = 'output tsv (default: "output.tsv")', action = "store",
20 |                       metavar = "<FILE>", default = "output.tsv")
21 | 
22 | if len(sys.argv)==1:
23 |     parser.print_help(sys.stderr)
24 |     sys.exit(0)
25 | 
26 | args = parser.parse_args()
27 | 
28 | ################################################################################
29 | 
30 | def main():
31 | 
32 |     cds_dataframe = parse_genbank_cds_to_dataframe(args.input_gb)
33 | 
34 |     save_dataframe_to_tsv(cds_dataframe, args.output_tsv)
35 | 
36 |     print(f"\n    CDS table written to '{args.output_tsv}'!\n")
37 | 
38 | ################################################################################
39 | 
40 | 
41 | def parse_genbank_cds_to_dataframe(file_path):
42 |     cds_entries = []
43 | 
44 |     with open(file_path, "r") as handle:
45 | 
46 |         for record in SeqIO.parse(handle, "genbank"):
47 | 
48 |             for feature in record.features:
49 |                 if feature.type == "CDS":
50 | 
51 |                     gene = feature.qualifiers.get("gene", ["NA"])[0]
52 |                     locus_tag = feature.qualifiers.get("locus_tag", ["NA"])[0]
53 |                     product = feature.qualifiers.get("product", ["NA"])[0]
54 |                     protein_id = feature.qualifiers.get("protein_id", ["NA"])[0]
55 | 
56 |                     cds_entries.append({
57 |                         "gene": gene,
58 |                         "protein_id": protein_id,
59 |                         "locus_tag": locus_tag,
60 |                         "product": product,
61 |                     })
62 | 
63 |     cds_df = pd.DataFrame(cds_entries)
64 | 
65 |     return cds_df
66 | 
67 | 
68 | def save_dataframe_to_tsv(df, output_file):
69 | 
70 |     df.to_csv(output_file, sep = '\t', index = False)
71 | 
72 | ################################################################################
73 | 
74 | if __name__ == "__main__":
75 |     main()


--------------------------------------------------------------------------------
/bit/bit-genbank-to-fasta:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | from Bio import SeqIO
 4 | import argparse
 5 | import sys
 6 | import os
 7 | 
 8 | parser = argparse.ArgumentParser(description="Parse nucleotide sequences from GenBank file into fasta file. For version info, run `bit-version`.")
 9 | 
10 | required = parser.add_argument_group('required arguments')
11 | 
12 | required.add_argument("-i", "--input-gb", metavar = "<FILE>", help = 'input Genbank file (e.g. "*.gbk", "*.gb", "*.gbff")', action = "store", required = True)
13 | parser.add_argument("-o", "--output-fasta", metavar = "<FILE>", help = 'Output fasta file (default: "genbank.fa")', action = "store", default = "genbank.fa")
14 | 
15 | if len(sys.argv)==1:
16 |     parser.print_help(sys.stderr)
17 |     sys.exit(0)
18 | 
19 | args = parser.parse_args()
20 | 
21 | input_gb = open(args.input_gb, "r")
22 | 
23 | output = open(args.output_fasta, "w")
24 | 
25 | recs = [rec for rec in SeqIO.parse(input_gb, "genbank")]
26 | 
27 | for rec in recs:
28 |     output.write(">" + rec.name  + "\n" + str(rec.seq) + "\n")
29 | 
30 | input_gb.close()
31 | output.close()
32 | 


--------------------------------------------------------------------------------
/bit/bit-get-cov-stats:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | import argparse
  4 | import sys
  5 | from pathlib import Path
  6 | import gzip
  7 | from dataclasses import dataclass, field
  8 | from Bio import SeqIO #type: ignore
  9 | 
 10 | parser = argparse.ArgumentParser(
 11 |     description="This script generates whole-reference detection and coverage info\
 12 |         for specified references given the reference fasta(s) and a mosdepth-produced\
 13 |         *per-base.bed.gz file. For version info, run `bit-version`.",
 14 |     epilog="Ex. usage: bit-get-cov-stats -r reference.fasta -b asm-per-base.bed.gz",
 15 | )
 16 | required = parser.add_argument_group("REQUIRED PARAMETERS")
 17 | optional = parser.add_argument_group("OPTIONAL PARAMETERS")
 18 | 
 19 | required.add_argument(
 20 |     "-r",
 21 |     "--reference-fastas",
 22 |     metavar="<STR>",
 23 |     help='Path to reference fasta file(s)',
 24 |     required=True,
 25 |     nargs="+",
 26 | )
 27 | required.add_argument(
 28 |     "-b",
 29 |     "--bed-file",
 30 |     metavar="<PATH(s)>",
 31 |     help="Path to mosdepth-produced *per-base.bed.gz file",
 32 |     required=True,
 33 | )
 34 | optional.add_argument(
 35 |     "-o",
 36 |     "--outpath",
 37 |     metavar="<STR>",
 38 |     help='Name of the output file (default: "coverage-stats.tsv")',
 39 |     default="coverage-stats.tsv",
 40 | )
 41 | 
 42 | 
 43 | def main(reference_fastas, bed_file, outpath):
 44 | 
 45 |     preflight_checks(reference_fastas, bed_file)
 46 | 
 47 |     refs = parse_refs(reference_fastas)
 48 | 
 49 |     refs = parse_bed_file(refs, bed_file)
 50 | 
 51 |     generate_output(refs, outpath)
 52 | 
 53 | 
 54 | def preflight_checks(reference_fastas, bed_file):
 55 |     paths_list = reference_fastas + [bed_file]
 56 |     check_files_are_found(paths_list)
 57 | 
 58 | 
 59 | def check_files_are_found(paths_list):
 60 |     for path in paths_list:
 61 |         if not Path(path).is_file():
 62 |             print(f"\n    We were not able to find the input file: {path}")
 63 |             notify_premature_exit()
 64 | 
 65 | 
 66 | def notify_premature_exit():
 67 |     print("\n    Exiting for now :(\n")
 68 |     sys.exit(1)
 69 | 
 70 | 
 71 | def parse_refs(reference_fastas):
 72 |     refs = []
 73 |     for fasta in reference_fastas:
 74 |         ref = RefData(fasta)
 75 |         ref.load_fasta()
 76 |         refs.append(ref)
 77 | 
 78 |     return refs
 79 | 
 80 | 
 81 | @dataclass
 82 | class RefData:
 83 |     path: str
 84 |     headers: set = field(default_factory=set)
 85 |     total_length: int = 0
 86 |     total_coverage_count: int = 0
 87 |     total_bases_detected_at_all: int = 0
 88 |     total_bases_detected_at_10x: int = 0
 89 | 
 90 |     def load_fasta(self):
 91 |         with open(self.path, "r") as f:
 92 |             for record in SeqIO.parse(f, "fasta"):
 93 |                 self.headers.add(record.id)
 94 |                 self.total_length += len(record.seq)
 95 | 
 96 |     def update_from_bed_line(self, header: str, start: int, end: int, num_reads: int):
 97 |         if header in self.headers:
 98 |             cur_range_covered = end - start
 99 |             self.total_coverage_count += num_reads * cur_range_covered
100 |             if num_reads > 0:
101 |                 self.total_bases_detected_at_all += cur_range_covered
102 |             if num_reads >= 10:
103 |                 self.total_bases_detected_at_10x += cur_range_covered
104 | 
105 |     def compute_metrics(self):
106 |         detection = round(self.total_bases_detected_at_all / self.total_length, 4)
107 |         detection_at_10x = round(self.total_bases_detected_at_10x / self.total_length, 4)
108 |         average_coverage = round(self.total_coverage_count / self.total_length, 4)
109 |         return detection, detection_at_10x, average_coverage
110 | 
111 | 
112 | def parse_bed_file(refs, bed_file):
113 | 
114 |     with gzip.open(bed_file, "rt") as f:
115 |         for line in f:
116 |             header, start, end, num_reads = line.strip().split("\t")
117 |             for ref in refs:
118 |                 ref.update_from_bed_line(header, int(start), int(end), int(num_reads))
119 | 
120 |     return(refs)
121 | 
122 | 
123 | def generate_output(refs, outpath):
124 |     with open(outpath, "w") as f:
125 |         f.write("Ref\tDetection\tDetection_at_10x\tAverage_coverage\n")
126 |         for ref in refs:
127 |             detection, detection_at_10x, average_coverage = ref.compute_metrics()
128 |             f.write(f"{ref.path}\t{detection}\t{detection_at_10x}\t{average_coverage}\n")
129 | 
130 | 
131 | if __name__ == "__main__":
132 |     if len(sys.argv) == 1:  # pragma: no cover
133 |         parser.print_help(sys.stderr)
134 |         sys.exit(0)
135 |     args = parser.parse_args()
136 | 
137 |     main(
138 |         args.reference_fastas,
139 |         args.bed_file,
140 |         args.outpath,
141 |     )
142 | 


--------------------------------------------------------------------------------
/bit/bit-get-go-term-info:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | ## lots of this is from this great tutorial: GO Tutorial in Python - Solutions.ipynb, which comes from here: http://gohandbook.org/doku.php ; https://nbviewer.jupyter.org/urls/dessimozlab.github.io/go-handbook/GO%20Tutorial%20in%20Python%20-%20Solutions.ipynb
  4 | 
  5 | from goatools import obo_parser
  6 | import os
  7 | import argparse
  8 | import pandas as pd
  9 | import sys
 10 | import subprocess
 11 | 
 12 | parser = argparse.ArgumentParser(description = "Get quick information on individual GO terms. For version info, run `bit-version`.")
 13 | parser.add_argument('GO-term', metavar = "<STR>", help =  'GO term you want to investigate, e.g. "GO:0010501"')
 14 | 
 15 | parser.add_argument("-g", "--GO-obo-file", metavar = "<STR>", help = 'GO obo file to use (e.g. from: geneontology.org/docs/download-ontology/). By default will \
 16 |                                                  use "go-basic.obo". "goslim_metagenomics.obo" is also a pre-packaged option (enter `-g goslim_metagenomics` to specify it). Or \
 17 |                                                  a different obo-formatted file can be specified here.',
 18 |                     action = "store", dest = "obo", default = "go_basic")
 19 | 
 20 | parser.add_argument("--parents-only", help = "Add this flag to report parents only, and no children.", action = "store_true")
 21 | 
 22 | 
 23 | if len(sys.argv)==1:
 24 |     parser.print_help(sys.stderr)
 25 |     sys.exit(0)
 26 | 
 27 | args = parser.parse_args()
 28 | 
 29 | pd.set_option('display.max_colwidth', None)
 30 | 
 31 | ### checking and setting up obo file location
 32 | go_data_dir = os.environ["GO_DB_DIR"]
 33 | 
 34 | ## downloading default GO databases if they are not present already
 35 | checking_db_dir = subprocess.run(["helper-bit-setup-GO-dbs"])
 36 | 
 37 | if args.obo == "goslim_metagenomics":
 38 |     go_obo = go_data_dir + "goslim_metagenomics.obo"
 39 | 
 40 | elif args.obo == "go_basic":
 41 |     go_obo = go_data_dir + "go-basic.obo"
 42 | 
 43 | else:
 44 |     go_obo = args.obo
 45 | 
 46 | ## loading GO database
 47 | print("\n\tGO obo file being used:")
 48 | go = obo_parser.GODag(go_obo)
 49 | print("")
 50 | 
 51 | input_go_id = args.GO_term
 52 | 
 53 | # adding "GO:" if not in input
 54 | if not input_go_id.startswith("GO:"):
 55 |     input_go_id = "GO:" + input_go_id
 56 | 
 57 | # trying to pull GO id from database, if not, quitting and reporting
 58 | try:
 59 |     input_go_term = go[input_go_id]
 60 | except:
 61 |     print(str(input_go_id) + " does not seem to be in the GO database :(\n")
 62 |     sys.exit()
 63 | 
 64 | def get_general_info(go_id):
 65 |     go_term = go[go_id]
 66 |     name = go_term.name
 67 |     namespace = go_term.namespace
 68 |     depth = go_term.depth
 69 | 
 70 |     go_term_info = [go_id, namespace, depth, name]
 71 |     return go_term_info
 72 | 
 73 | curr_go_info = get_general_info(input_go_id)
 74 | 
 75 | header = ["GO id", "namespace", "depth", "name"]
 76 | 
 77 |   # getting current term info
 78 | input_df = pd.DataFrame([curr_go_info], columns = header)
 79 | 
 80 | print("Input GO term info:")
 81 | print(input_df.to_string(index=False))
 82 | 
 83 |   # getting parent terms and their info
 84 | parents = input_go_term.get_all_parents()
 85 | 
 86 | if parents:
 87 |     parent_df = pd.DataFrame([])
 88 | 
 89 |     for term in parents:
 90 |         curr_parent_info = get_general_info(term)
 91 |         parent_df = parent_df.append([curr_parent_info])
 92 | 
 93 |     print("\nParent terms info:")
 94 |     print(parent_df.to_string(index=False, header = header))
 95 | 
 96 | else:
 97 |     print("\nThere are no parent terms for " + str(input_go_id) + ".")
 98 | 
 99 |   # getting child terms and their info unless --parents-only flag was specified
100 | if not args.parents_only:
101 |     children = input_go_term.get_all_children()
102 | 
103 |     if children:
104 |         child_df = pd.DataFrame([])
105 | 
106 |         for term in children:
107 |             curr_child_info = get_general_info(term)
108 |             child_df = child_df.append([curr_child_info])
109 | 
110 |         print("\nChild terms info:")
111 |         print(child_df.to_string(index=False, header = header))
112 | 
113 |     else:
114 |         print("\nThere are no child terms for " + str(input_go_id) + ".")
115 | 
116 | print("")
117 | 


--------------------------------------------------------------------------------
/bit/bit-get-lineage-from-taxids:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | set -e
 3 | 
 4 | # setting colors to use
 5 | GREEN='\033[0;32m'
 6 | RED='\033[0;31m'
 7 | NC='\033[0m'
 8 | 
 9 | 
10 | if [ "$#" == 0 ] || [ $1 == "-h" ] || [ $1 == "help" ] || [ $1 == "--help" ]; then
11 |     printf "\n  This script uses taxonkit in a standard fashion to get lineage info from NCBI taxids.\n"
12 |     printf "  It expects a single column file of taxids with no header, return table in the same order.\n"
13 |     printf "  Thanks go to taxonkit, don't forget to cite that if using: https://bioinf.shenwei.me/taxonkit/.\n"
14 |     printf "  Add the '-s' flag to include strain info if available. For version info, run \`bit-version\`.\n\n"
15 |     printf "    Usage:\n\t bit-get-lineage-from-taxids -i taxids.txt -o lineages.tsv\n\n"
16 |     exit
17 | fi
18 | 
19 | # setting defaults
20 | output_file="lineages.tsv"
21 | include_strain=false
22 | ## parsing arguments
23 | while getopts :i:o:s args
24 | do
25 |     case "${args}"
26 |     in
27 |         i) taxids_file=${OPTARG};;
28 |         o) output_file=${OPTARG};;
29 |         s) include_strain=true;;
30 |         \?) printf "\n  ${RED}Invalid argument: -${OPTARG}${NC}\n\n    Run 'bit-get-lineage-from-taxids' with no arguments or '-h' only to see help menu.\n\n" >&2 && exit
31 |     esac
32 | done
33 | 
34 | ## checking variables are good
35 | if [ -z $taxids_file ]; then
36 |     printf "\n  Please specify an input taxid file to the '-i' argument.\n"
37 |     printf "\nExiting for now.\n\n"
38 |     exit
39 | fi
40 | 
41 | 
42 | if [ ! -f $taxids_file ]; then
43 |     printf "\n  The specified input file, $taxids_file, doesn't seem to be where we think it is :( \n"
44 |     printf "\nExiting for now.\n\n"
45 |     exit
46 | fi
47 | 
48 | 
49 | ### checking that ncbi tax data is present already, and downloading if it isn't
50 | helper-bit-get-ncbi-tax-data
51 | 
52 | if [ "${include_strain}" = true ]; then
53 |     taxonkit_reformat_pattern='{domain|superkingdom}\t{phylum}\t{class}\t{order}\t{family}\t{genus}\t{species}\t{strain|subspecies|no rank}'
54 |     header="taxid\tdomain\tphylum\tclass\torder\tfamily\tgenus\tspecies\tstrain\n"
55 | else
56 |     taxonkit_reformat_pattern='{domain|superkingdom}\t{phylum}\t{class}\t{order}\t{family}\t{genus}\t{species}'
57 |     header="taxid\tdomain\tphylum\tclass\torder\tfamily\tgenus\tspecies\n"
58 | fi
59 | 
60 | cat $taxids_file | taxonkit lineage | taxonkit reformat2 -r NA -f "${taxonkit_reformat_pattern}" | cut -f 1,3- | tr ";" "\t" > lineages.tmp
61 | 
62 | cat <(printf "${header}") lineages.tmp > $output_file
63 | 
64 | rm lineages.tmp
65 | 


--------------------------------------------------------------------------------
/bit/bit-get-test-data:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | """
 4 | This is a program for downloading test data files.
 5 | """
 6 | 
 7 | import sys
 8 | import os
 9 | import argparse
10 | import textwrap
11 | 
12 | parser = argparse.ArgumentParser(description = "This is a helper program for downloading test data for use with bit workflows and programs. For bit verison info run `bit-version`. ",
13 |                                  epilog = "Ex. usage: bit-get-test-data metagenomics\n")
14 | 
15 | required = parser.add_argument_group('required arguments')
16 | 
17 | required.add_argument('datatype', choices = ['metagenomics'],
18 |                                            help = "The first positional argument should be what type of test data you'd like to download")
19 | 
20 | if len(sys.argv)==1:
21 |     parser.print_help(sys.stderr)
22 |     sys.exit(0)
23 | 
24 | args = parser.parse_args()
25 | 
26 | 
27 | ################################################################################
28 | 
29 | def main():
30 | 
31 |     dl_test_data()
32 | 
33 | ################################################################################
34 | 
35 | ### variables and functions ###
36 | 
37 | tty_colors = {
38 |     'green' : '\033[0;32m%s\033[0m',
39 |     'yellow' : '\033[0;33m%s\033[0m',
40 |     'red' : '\033[0;31m%s\033[0m'
41 | }
42 | 
43 | def color_text(text, color='green'):
44 |     if sys.stdout.isatty():
45 |         return tty_colors[color] % text
46 |     else:
47 |         return text
48 | 
49 | 
50 | def wprint(text):
51 |     print(textwrap.fill(text, width=80, initial_indent="  ",
52 |           subsequent_indent="  ", break_on_hyphens=False))
53 | 
54 | 
55 | def report_message(message, color = "yellow"):
56 |     print("")
57 |     wprint(color_text(message, color))
58 | 
59 | 
60 | def dl_test_data():
61 | 
62 |     """ main function for downloading test data """
63 | 
64 |     if args.datatype == "metagenomics":
65 | 
66 |         report_message("Downloading and unpacking 2 paired-end Illumina metagenomics test samples (4 files, ~800 MB total; they are kinda large for test data so MAGs can be recovered):")
67 |         print("")
68 | 
69 |         # getting the metagenomics test data
70 |         os.system("curl -L -o test-metagenomics-reads.zip https://figshare.com/ndownloader/files/46096083")
71 | 
72 |         # extracting
73 |         os.system("unzip -qo test-metagenomics-reads.zip")
74 | 
75 |         # removing archive
76 |         os.system("rm test-metagenomics-reads.zip")
77 | 
78 |         report_message("Pulled metagenomics (Illumina) reads for two test samples from here:", "green")
79 |         print("    https://figshare.com/account/projects/203736/articles/25750935\n")
80 | 
81 |     else:
82 | 
83 |         report_message("The data type you requested is not currently available.", "red")
84 | 
85 |         print("\n    Please check the currently available data types with 'bit-get-test-data --help'\n")
86 | 
87 | 
88 | ################################################################################
89 | 
90 | if __name__ == "__main__":
91 |     main()
92 | 


--------------------------------------------------------------------------------
/bit/bit-kraken2-to-taxon-summaries:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | """
  4 | This script parses the regular read-based output of a kraken2 run, producing a table of full standard taxonomic lineages of all
  5 | classifications (filling in NAs for lower ranks), with counts of how many reads went to that specific taxon. Depends on another `bit` program
  6 | and taxonkit, if `bit` was installed with conda, all should be swell :)
  7 | 
  8 | Input table expected to look like this:
  9 | 
 10 | U  A00159:145:H75T2DMXX:1:1101:7735:13792   unclassified (taxid 0)    16   0:0
 11 | U  A00159:145:H75T2DMXX:1:1101:11216:13557  unclassified (taxid 0)    30   0:0
 12 | U  A00159:145:H75T2DMXX:1:1101:22688:14074  unclassified (taxid 0)    26   0:0
 13 | U  A00159:145:H75T2DMXX:1:1101:1325:14559   unclassified (taxid 0)    31   0:0
 14 | U  A00159:145:H75T2DMXX:1:1101:23719:15013  unclassified (taxid 0)    30   0:0
 15 | C  A00159:145:H75T2DMXX:1:1102:11388:8312   Ochrobactrum (taxid 528)  194  0:12 1224:16 28211:7 528:15
 16 | U  A00159:145:H75T2DMXX:1:1102:15465:8390   unclassified (taxid 0)    27   0:0
 17 | U  A00159:145:H75T2DMXX:1:1102:6343:7560    unclassified (taxid 0)    271  0:237
 18 | U  A00159:145:H75T2DMXX:1:1102:30101:11600  unclassified (taxid 0)    26   0:0
 19 | U  A00159:145:H75T2DMXX:1:1101:19678:2221   unclassified (taxid 0)    279  0:245
 20 | 
 21 | Unclassified are reported on one row with "Unclassified" specified at each rank. If names are included, like the example above, you would need to add the '--names-included' flag when running the program.
 22 | """
 23 | 
 24 | import sys
 25 | import argparse
 26 | import pandas as pd
 27 | import subprocess
 28 | import os
 29 | import re
 30 | 
 31 | parser = argparse.ArgumentParser(description = "This script parses the regular read-based output of a kraken2 run, producing a table of full standard taxonomic lineages of all \
 32 |                                               classifications (filling in NAs for lower ranks), with counts of how many reads went to that specific taxon. For version info, run `bit-version`.")
 33 | 
 34 | required = parser.add_argument_group('required arguments')
 35 | 
 36 | required.add_argument("-i", "--input-tsv", metavar = "<FILE>", help = "Input table produced by kraken2 run", action = "store", dest = "input_file", required = True)
 37 | parser.add_argument("-o", "--output-tsv", metavar = "<FILE>", help = 'Output table name (default: "output.tsv")', action = "store", dest = "output_file", default = "output.tsv")
 38 | parser.add_argument("--names-included", help = 'Add this flag if kraken2 was run with the `--use-names` flag', action = "store_true")
 39 | 
 40 | if len(sys.argv)==1:
 41 |     parser.print_help(sys.stderr)
 42 |     sys.exit(0)
 43 | 
 44 | args = parser.parse_args()
 45 | 
 46 | # initializing stuff
 47 | unclassified_count = 0
 48 | taxid_counts_dict = {}
 49 | 
 50 | # iterating through read classifications
 51 | with open(args.input_file, "r") as classifications:
 52 |     for line in classifications:
 53 | 
 54 |         # adding to unclassified count and moving on if unclassified
 55 |         if line.startswith("U"):
 56 |             unclassified_count += 1
 57 |             continue
 58 | 
 59 |         # gettig taxid classification of current read
 60 |         if args.names_included:
 61 |             classification = line.strip().split("\t")[2]
 62 |             taxid = re.split('\(taxid ', classification)[1].rstrip(")")
 63 |         else:
 64 |             taxid = line.strip().split("\t")[2]
 65 | 
 66 |         # adding count to taxid in taxid dictionary if present
 67 |         if taxid in taxid_counts_dict:
 68 |             taxid_counts_dict[taxid] += 1
 69 | 
 70 |         # adding to taxid dictionary if current taxid not yet present
 71 |         else:
 72 |             taxid_counts_dict[taxid] = 1
 73 | 
 74 | # getting standard lineage for each taxid
 75 |     # writing out taxids to temp file
 76 | with open("bit-convert-kraken2.tmp", "w") as tmp_taxid_file:
 77 |     for key in taxid_counts_dict:
 78 |         tmp_taxid_file.write(str(key) + "\n")
 79 | 
 80 | # getting taxid lineages
 81 | running_bit_get_lineages_from_taxids = subprocess.run(["bit-get-lineage-from-taxids", "-i", "bit-convert-kraken2.tmp", "-o", "bit-convert-kraken2-lineages.tmp"], stdout=subprocess.DEVNULL)
 82 | 
 83 | # reading in results as table
 84 | lineage_tab = pd.read_csv("bit-convert-kraken2-lineages.tmp", sep="\t")
 85 | 
 86 | # converting taxid dict to dataframe
 87 | taxid_counts_df = pd.DataFrame.from_dict(taxid_counts_dict, orient="index").reset_index()
 88 |  # moving index to column and setting column names
 89 | taxid_counts_df.rename(columns={"index":"taxid", 0:"read_counts"}, inplace=True)
 90 | 
 91 |  # setting to integer type so can be merged with lineage tab
 92 | taxid_counts_df = taxid_counts_df.astype({"taxid":'int64'})
 93 | 
 94 | # merging
 95 | combined_tab = lineage_tab.merge(taxid_counts_df).fillna("NA")
 96 | 
 97 | # adding in unclassified row
 98 |     # pandas append deprecated and dropped as of pandas 2.0, using concat below
 99 | # combined_tab = combined_tab.append({"taxid":0, "domain":"Unclassified", "phylum":"Unclassified", "class":"Unclassified", "order":"Unclassified", "family":"Unclassified", "genus":"Unclassified", "species":"Unclassified", "read_counts":unclassified_count}, ignore_index=True)
100 | unclassified_dict = {"taxid":0, "domain":"Unclassified", "phylum":"Unclassified", "class":"Unclassified", "order":"Unclassified", "family":"Unclassified", "genus":"Unclassified", "species":"Unclassified", "read_counts":unclassified_count}
101 | unclassified_tab = pd.DataFrame(unclassified_dict, index = [0]).reset_index()
102 | # dropping index column
103 | unclassified_tab = unclassified_tab.drop("index", axis = "columns")
104 | 
105 | combined_tab = pd.concat([combined_tab, unclassified_tab], ignore_index = True)
106 | 
107 | # adding in percent column
108 | combined_tab['percent_of_reads'] = combined_tab.read_counts / combined_tab.read_counts.sum() * 100
109 | 
110 | # sorting
111 | combined_tab.sort_values(by=["taxid"], inplace=True)
112 | 
113 | # writing out
114 | combined_tab.to_csv(args.output_file, sep="\t", header=True, index=False)
115 | 
116 | # removing intermediate files
117 | os.remove("bit-convert-kraken2.tmp")
118 | os.remove("bit-convert-kraken2-lineages.tmp")
119 | 


--------------------------------------------------------------------------------
/bit/bit-lineage-to-tsv:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import argparse
 4 | import re
 5 | 
 6 | parser = argparse.ArgumentParser(description = 'This script converts lineages in this format (e.g., "root;d__Bacteria;p__Proteobacteria") into consistent tsv format \
 7 |                                                (e.g., "Bacteria\tProteobacteria\tNA\tNA\tNA\tNA\tNA"). It expects as input a 2-column tab-delimited file with column \
 8 |                                                1 holding an identifier and column 2 holding the lineage. For version info, run `bit-version`.')
 9 | 
10 | required = parser.add_argument_group('required arguments')
11 | 
12 | required.add_argument("-i", "--input-tsv", metavar = "<FILE>", help = 'input table, first column needs to be an identifier, second column the lineage', action = "store", required = True)
13 | 
14 | parser.add_argument("-o", "--output-tsv", metavar = "<FILE>", help = 'output file (default: "formatted-tax.tsv")', action = "store", default = "formatted-tax.tsv")
15 | 
16 | parser.add_argument("--make-taxid", help = "Provide this flag to make a unique taxid (string of all rank fields) for each lineage \
17 |                                             (will be added as second column of output)", action = "store_true")
18 | 
19 | args = parser.parse_args()
20 | 
21 | 
22 | # helper function
23 | def get_rank(lineage, prefix):
24 | 
25 |     if lineage.startswith(prefix):
26 | 
27 |         curr_rank = lineage.split(";")[0].replace(prefix, "", 1)
28 | 
29 |         lineage = re.sub(f"^{prefix}{curr_rank};", "", lineage)
30 | 
31 |     else:
32 | 
33 |         curr_rank = "NA"
34 | 
35 |     return(lineage, curr_rank)
36 | 
37 | 
38 | # converting
39 | with open(args.input_tsv) as in_tab:
40 | 
41 |     with open(args.output_tsv, "w") as out_tab:
42 | 
43 |         # adding header
44 |         if args.make_taxid:
45 | 
46 |             out_tab.write("seq_ID\ttaxid\tdomain\tphylum\tclass\torder\tfamily\tgenus\tspecies\n")
47 | 
48 |         else:
49 | 
50 |             out_tab.write("seq_ID\tdomain\tphylum\tclass\torder\tfamily\tgenus\tspecies\n")
51 | 
52 |         for line in open(args.input_tsv):
53 | 
54 |             line = line.strip().split("\t")
55 |             ID = line[0]
56 | 
57 |             # this is if there is no second column (no lineage)
58 |             if len(line) == 1:
59 | 
60 |                 out_line = f"{ID}\tNA\tNA\tNA\tNA\tNA\tNA\tNA"
61 | 
62 |             else:
63 | 
64 |                 lineage = line[1]
65 | 
66 |                 # removing "root" if that's at the start
67 |                 if lineage.startswith("root;"):
68 |                     lineage = re.sub("^root;", "", lineage)
69 | 
70 |                 # getting all ranks present, setting to NA if not
71 |                 lineage, t_domain = get_rank(lineage, "d__")
72 |                 lineage, t_phylum = get_rank(lineage, "p__")
73 |                 lineage, t_class = get_rank(lineage, "c__")
74 |                 lineage, t_order = get_rank(lineage, "o__")
75 |                 lineage, t_family = get_rank(lineage, "f__")
76 |                 lineage, t_genus = get_rank(lineage, "g__")
77 |                 lineage, t_species = get_rank(lineage, "s__")
78 | 
79 |                 if args.make_taxid:
80 | 
81 |                     taxid_string = "_".join([t_domain, t_phylum, t_class, t_order, t_family, t_genus, t_species]).replace(" ", "_")
82 |                     out_line = "\t".join([ID, taxid_string, t_domain, t_phylum, t_class, t_order, t_family, t_genus, t_species])
83 | 
84 |                 else:
85 | 
86 |                     out_line = "\t".join([ID, t_domain, t_phylum, t_class, t_order, t_family, t_genus, t_species])
87 | 
88 |             out_tab.write(out_line + "\n")
89 | 


--------------------------------------------------------------------------------
/bit/bit-normalize-table:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | """
  4 | Expects rows to be units (e.g. genes/KOs/etc.), and columns to be samples.
  5 | 
  6 | This script normalizes a table for sampling depth by either coverage per million (CPM) or based on the median-ratio
  7 | method as performed in DESeq2. But unlike DESeq2, we don't care here if there are floats in there.
  8 | 
  9 | I initially wrote this for normalizing metagenomic coverage data, like gene-level coverage, or summed KO coverages.
 10 | These are normalized for gene-length already because they are "coverages", but they are not yet normalized
 11 | for sampling depth – which is where this script comes in.
 12 | 
 13 | I also found myself wanting this because I wanted to do differential abundance testing of coverages
 14 | of KO terms. DESeq2 doesn't require normalizing for gene-length because it is the same unit being analyzed
 15 | across all samples – the same gene, so the same size. However, after grouping genes into their KO annotations,
 16 | (which we may need to compare across samples that don't all share the same underlying assembly or genes),
 17 | they no longer all represent the same units across all samples. It is because of this I decided to stick with
 18 | gene-level coverages (which are normalized for gene-length), and then sum those values based on KO annotations.
 19 | 
 20 | The CPM (coverage per million) normalization is just like a percent, except scaled to 1 million instead of 100.
 21 | So each row's entry (e.g. gene/KO/etc.) is the proportion out of 1 million for that column (sample),
 22 | and each column will sum to 1 million.
 23 | 
 24 | The median-ration normalization method (MR) was initially described in this paper
 25 | (http://dx.doi.org/10.1186/gb-2010-11-10-r106; e.q. 5), and this site is super-informative in general
 26 | about the DESeq2 process overall, and helped me understand the normalizaiton process better to implement it:
 27 | https://hbctraining.github.io/DGE_workshop/lessons/02_DGE_count_normalization.html. Columns will not sum to
 28 | the same amount when the median-ratio method is applied.
 29 | """
 30 | 
 31 | import os
 32 | import sys
 33 | import argparse
 34 | import pandas as pd
 35 | import numpy as np
 36 | from scipy.stats.mstats import gmean
 37 | 
 38 | parser = argparse.ArgumentParser(description = "This script normalizes a coverage table for sampling depth based on either \
 39 |                                              coverage per million (CPM) or the median-ratio method (MR) as performed \
 40 |                                              in DESeq2. See note at top of script for more info. It expects a \
 41 |                                              tab-delimited table with samples as columns and units (e.g. genes/KOs/etc.) \
 42 |                                              as rows. For version info, run `bit-version`.")
 43 | 
 44 | required = parser.add_argument_group('required arguments')
 45 | 
 46 | required.add_argument("-i", "--input-table", metavar = "<FILE>", help = "Input tab-delimited table", action = "store", required = True)
 47 | 
 48 | parser.add_argument("-n", "--normalization-method", help = 'Desired normalization method of either \
 49 |                     "CPM" as in coverage per million, or "MR" as in median-ratio as performed in DESeq2. \
 50 |                     See note at top of program for more info. (default: "CPM")', choices = ["CPM", "MR"], \
 51 |                     action = "store", default = "CPM")
 52 | 
 53 | parser.add_argument("-o", "--output-table", metavar = "<FILE>", help = 'Output filename (default: "Normalized.tsv")', action = "store", default = "Normalized.tsv")
 54 | 
 55 | if len(sys.argv)==1:
 56 |     parser.print_help(sys.stderr)
 57 |     sys.exit(0)
 58 | 
 59 | args = parser.parse_args()
 60 | 
 61 | ################################################################################
 62 | 
 63 | tab = pd.read_csv(args.input_table, sep = "\t", index_col = 0, low_memory = False)
 64 | 
 65 | 
 66 | ## removing columns if they have all zeroes in them prior to normalization to avoid problems (will put them back)
 67 | column_sums = tab.sum()
 68 | 
 69 | # getting all column names in order so can rearrange afterwards
 70 | ordered_columns = tab.columns.tolist()
 71 | 
 72 | # getting column names of those with all zeroes
 73 | zero_column_names = column_sums[column_sums == 0].index.tolist()
 74 | 
 75 | tab.drop(zero_column_names, axis = 1, inplace = True)
 76 | 
 77 | 
 78 | if args.normalization_method == "CPM":
 79 | 
 80 |     norm_tab = tab / tab.sum() * 1000000
 81 | 
 82 | else:
 83 | 
 84 |     ## calculating size factors
 85 |     # getting geometric means for each row
 86 |     with np.errstate(divide = 'ignore'):
 87 |         geomeans = gmean(tab, axis = 1)
 88 | 
 89 |     # getting ratios of gene values to geometric means
 90 |     ratios_tab = (tab.T / geomeans).T
 91 | 
 92 |     sizeFactors = ratios_tab[geomeans > 0].median().to_list()
 93 | 
 94 |     # dividing by size factors
 95 |     norm_tab = tab / sizeFactors
 96 | 
 97 | 
 98 | ## adding back on columns with all zeroes
 99 | if len(zero_column_names) > 0:
100 |     for col in zero_column_names:
101 |         norm_tab[col] = 0.0
102 | 
103 | # reordering to match input
104 | norm_tab = norm_tab[ordered_columns]
105 | 
106 | # writing out normalized table
107 | norm_tab.to_csv(args.output_table, sep = "\t")
108 | 


--------------------------------------------------------------------------------
/bit/bit-parse-fasta-by-headers:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | from Bio import SeqIO
 4 | import sys
 5 | import argparse
 6 | import os
 7 | import gzip
 8 | 
 9 | parser = argparse.ArgumentParser(description = 'This script is for parsing a fasta file by pulling out sequences with the desired headers. If you want all sequences EXCEPT the ones with the headers you are providing, add the flag "--inverse". For version info, run `bit-version`.')
10 | 
11 | required = parser.add_argument_group('required arguments')
12 | 
13 | required.add_argument("-i", "--input-fasta", metavar = "<FILE>", help = "Original fasta file", action = "store", required = True)
14 | required.add_argument("-w", "--sequence-headers", metavar = "<FILE>", help = "Single-column file with target sequence headers", action = "store", dest = "headers", required = True)
15 | parser.add_argument("-o", "--output-fasta", metavar = "<FILE>", help = 'Output fasta file (default: "wanted.fasta")', action = "store", default = "wanted.fasta")
16 | parser.add_argument("--inverse", help = "Add this flag to pull out all sequences with headers NOT in the provided header file.", action = "store_true")
17 | parser.add_argument("--gz", help = "Add this flag if the input is gzipped (does not gzip output)", action = "store_true")
18 | 
19 | if len(sys.argv)==1:
20 |     parser.print_help(sys.stderr)
21 |     sys.exit(0)
22 | 
23 | args = parser.parse_args()
24 | 
25 | if not args.gz:
26 | 
27 |     fasta_in = open(args.input_fasta, "r")
28 | 
29 | else:
30 | 
31 |     fasta_in = gzip.open(args.input_fasta, "rt")
32 | 
33 | 
34 | headers_of_int = open(args.headers, "r")
35 | 
36 | headers_of_int_set = set(line.strip() for line in headers_of_int)
37 | headers_of_int.close()
38 | 
39 | fasta_out = open(args.output_fasta, "w")
40 | 
41 | if not args.inverse:
42 | 
43 |     for seq_record in SeqIO.parse(fasta_in, "fasta"):
44 |         if seq_record.id in headers_of_int_set:
45 |             fasta_out.write(">" + seq_record.id + "\n")
46 |             fasta_out.write(str(seq_record.seq) + "\n")
47 | 
48 | else:
49 | 
50 |     for seq_record in SeqIO.parse(fasta_in, "fasta"):
51 |         if seq_record.id not in headers_of_int_set:
52 |             fasta_out.write(">" + seq_record.id + "\n")
53 |             fasta_out.write(str(seq_record.seq) + "\n")
54 | 
55 | fasta_in.close()
56 | fasta_out.close()
57 | 


--------------------------------------------------------------------------------
/bit/bit-parse-fastq-by-headers:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | import sys
  4 | import os
  5 | import argparse
  6 | import textwrap
  7 | from Bio.SeqIO.QualityIO import FastqGeneralIterator
  8 | import gzip
  9 | import subprocess
 10 | 
 11 | 
 12 | parser = argparse.ArgumentParser(description = 'This script is for parsing a single fastq file by pulling out sequences with the desired headers (paired-end not supported yet).\
 13 |                                               For version info, run `bit-version`.')
 14 | 
 15 | required = parser.add_argument_group('required arguments')
 16 | 
 17 | required.add_argument("-i", "--input-fastq", metavar = "<FILE>", help = "Starting fastq file", action = "store", required = True)
 18 | required.add_argument("-w", "--sequence-headers", metavar = "<FILE>", help = "Single-column file with target sequence headers (if the headers in the fastq file have whitespace in them, it is okay to provide just the part up to the whitespace in this input file)", action = "store", required = True)
 19 | parser.add_argument("-o", "--output-fastq", metavar = "<FILE>", help='Output fastq file name (default: "wanted.fq", ".gz" will be added if compressed)', action = "store", default = "wanted.fq")
 20 | parser.add_argument("--inverse", help = "Add this flag to pull out all sequences with headers NOT in the provided header file.", action = "store_true")
 21 | parser.add_argument("--gz", help = "Add this flag if the input is gzipped (output will be too)", action = "store_true")
 22 | 
 23 | if len(sys.argv)==1:
 24 |     parser.print_help(sys.stderr)
 25 |     sys.exit(0)
 26 | 
 27 | args = parser.parse_args()
 28 | 
 29 | ################################################################################
 30 | 
 31 | def main():
 32 | 
 33 |     check_all_inputs_exist([args.input_fastq, args.sequence_headers])
 34 | 
 35 |     check_if_output_already_exists(args.output_fastq)
 36 | 
 37 |     # reading headers into a set
 38 |     headers_of_int_set = set(line.strip() for line in open(args.sequence_headers, "r"))
 39 | 
 40 |     parse_fastq(headers_of_int_set, args.input_fastq, args.output_fastq, args.inverse, args.gz)
 41 | 
 42 |     if args.gz:
 43 | 
 44 |         subprocess.run(["pigz", args.output_fastq])
 45 | 
 46 | ################################################################################
 47 | 
 48 | 
 49 | # setting some colors
 50 | tty_colors = {
 51 |     'green' : '\033[0;32m%s\033[0m',
 52 |     'yellow' : '\033[0;33m%s\033[0m',
 53 |     'red' : '\033[0;31m%s\033[0m'
 54 | }
 55 | 
 56 | 
 57 | ### functions ###
 58 | def color_text(text, color='green'):
 59 |     if sys.stdout.isatty():
 60 |         return tty_colors[color] % text
 61 |     else:
 62 |         return text
 63 | 
 64 | 
 65 | def wprint(text):
 66 |     """ print wrapper """
 67 | 
 68 |     print(textwrap.fill(text, width=80, initial_indent="  ",
 69 |           subsequent_indent="  ", break_on_hyphens=False))
 70 | 
 71 | 
 72 | def check_all_inputs_exist(input_list):
 73 | 
 74 |     for file in input_list:
 75 |         if not os.path.exists(file):
 76 |             print("")
 77 |             wprint(color_text("It seems the specified input file '" + str(file) + "' can't be found.", "yellow"))
 78 |             print("\nExiting for now.\n")
 79 |             sys.exit(1)
 80 | 
 81 | 
 82 | def check_if_output_already_exists(planned_output):
 83 | 
 84 |     # making sure outputs don't already exist, exiting if they do
 85 | 
 86 |         if os.path.exists(planned_output):
 87 | 
 88 |             print("")
 89 |             wprint(color_text("It seems the expected output (or intermediate) file '" + str(planned_output) + "' already exists.", "yellow"))
 90 |             print("")
 91 |             wprint("We don't want to overwrite something accidentally, so rename or remove that first if wanting to proceed.")
 92 |             print("\nExiting for now.\n")
 93 |             sys.exit(1)
 94 | 
 95 | 
 96 | def parse_fastq(headers_of_int_set, input_fastq, output_fastq, inverse, gz):
 97 | 
 98 | 
 99 |     if gz:
100 | 
101 |         fastq_in = gzip.open(input_fastq, "rt")
102 | 
103 |     else:
104 | 
105 |         fastq_in = open(input_fastq, "rt")
106 | 
107 | 
108 |     if not args.inverse:
109 | 
110 |         with open(output_fastq, "w") as output_file:
111 | 
112 |             for header, seq, qual in FastqGeneralIterator(fastq_in):
113 | 
114 |                 if header in headers_of_int_set or header.split(" ")[0] in headers_of_int_set:
115 | 
116 |                     output_file.write("@%s\n%s\n+\n%s\n" % (header, seq, qual))
117 | 
118 |     else:
119 | 
120 |         with open(output_fastq, "w") as output_file:
121 | 
122 |             for header, seq, qual in FastqGeneralIterator(fastq_in):
123 | 
124 |                 if header not in headers_of_int_set and header.split(" ")[0] not in headers_of_int_set:
125 | 
126 |                     output_file.write("@%s\n%s\n+\n%s\n" % (header, seq, qual))
127 | 
128 |     fastq_in.close()
129 | 
130 | 
131 | if __name__ == "__main__":
132 |     main()
133 | 


--------------------------------------------------------------------------------
/bit/bit-prot-acc-to-taxid:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import sys
 4 | import argparse
 5 | import os
 6 | 
 7 | class colors:
 8 |   GREEN = '\033[0;32m'
 9 |   NC = '\033[0m'
10 | 
11 | parser = argparse.ArgumentParser(description = 'This script takes NCBI protein accessions and returns a two-column \
12 |                                               tab-delimited file with protein accessions and taxids. It requires the \
13 |                                               "prot.accession2taxid" database (unzipped) that can be downloaded from here: \
14 |                                               ftp://ftp.ncbi.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.gz\
15 |                                               For version info, run `bit-version`.')
16 | 
17 | required = parser.add_argument_group('required arguments')
18 | 
19 | required.add_argument("-r", "--ref-map", metavar = "<FILE>", help = "reference prot_acc_to_taxid_map database", action = "store", dest = "input_ref", required = True)
20 | required.add_argument("-w", "--wanted_prot_accessions", metavar = "<FILE>", help = "Single-column file with protein accessions", action = "store", dest = "prot_accs", required = True)
21 | parser.add_argument("-o", "--output-file", metavar = "<FILE>", help = 'Output file of prot_acc and taxID (default: "wanted-prot-accs-and-taxids.tsv")', action = "store", dest = "file_out", default = "wanted-prot-accs-and-taxids.tsv")
22 | 
23 | if len(sys.argv)==1:
24 |     parser.print_help(sys.stderr)
25 |     sys.exit(0)
26 | 
27 | args = parser.parse_args()
28 | 
29 | wanted_accs = open(args.prot_accs, "r")
30 | 
31 | wanted_accs_set = set(line.strip() for line in wanted_accs)
32 | 
33 | output_file = open(args.file_out, "w")
34 | 
35 | num_found = 0
36 | 
37 | accs_found = []
38 | 
39 | output_file.write("prot_accession\ttaxid\n")
40 | 
41 | num = 0
42 | 
43 | print("\nNow beginning trek through reference mapping file.\n")
44 | 
45 | with open(args.input_ref) as refs:
46 | 
47 |     for line in refs:
48 |         line = line.split("\t")
49 |         num += 1
50 | 
51 |         if num % 1000000 == 0:
52 |             mega_num = num / 1000000
53 |             sys.stdout.write("\r  On line " + colors.GREEN + str(mega_num) + colors.NC + " million of prot_acc_to_taxid_map...")
54 |             sys.stdout.flush()
55 | 
56 |         if line[1] in wanted_accs_set:
57 |             output_file.write(line[1] + "\t" + line[2] + "\n")
58 |             num_found += 1
59 |             accs_found.append(line[1])
60 | 
61 | print("\n")
62 | 
63 | wanted_accs_list = list(wanted_accs_set)
64 | 
65 | print('  Adding in "NA"s for those protein accessions not found...\n')
66 | 
67 | for acc in wanted_accs_list:
68 |     if acc not in accs_found:
69 |         output_file.write(acc + "\tNA\n")
70 | 
71 | print(colors.GREEN + "  Done!" + colors.NC + "\n")
72 | print("  You were looking for " + str(len(wanted_accs_list)) + " protein accessions.")
73 | print("  " + str(num_found) + ' were found. The rest were given taxids of "NA\".')
74 | 
75 | output_file.close()
76 | wanted_accs.close()
77 | 


--------------------------------------------------------------------------------
/bit/bit-remove-wraps:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | RED='\033[0;31m'
 4 | GREEN='\033[0;32m'
 5 | NC='\033[0m'
 6 | 
 7 | if [ "$#" == 0 ] || [ $1 == "-h" ]; then
 8 |     printf "\n  This script removes line wraps from a fasta file. For version\n"
 9 |     printf "  info, run \`bit-version\`.\n\n"
10 |     printf "    Usage:\n\t bit-remove-wraps input.fasta > new.fasta\n\n"
11 |     exit
12 | fi
13 | 
14 | if [ -f $1 ]; then
15 |     awk '!/^>/ { printf "%s", $0; n="\n" } /^>/ { print n $0; n = "" } END { printf "%s", n }' "$1"
16 |     echo -e "     ${GREEN}Annoying line wraps removed! Cheers!${NC}" 1>&2
17 | 
18 | else
19 |     echo -e "     ${RED}Input file not found :/${NC}" 1>&2
20 |     exit 1
21 | fi
22 | 


--------------------------------------------------------------------------------
/bit/bit-rename-fasta-headers:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | from Bio import SeqIO
 4 | import sys
 5 | import argparse
 6 | 
 7 | parser = argparse.ArgumentParser(
 8 |     description=(
 9 |         "This script facilitates renaming headers in a fasta file. "
10 |         "By default, each sequence is renamed to <wanted_text>_<n> for n=1,2,… "
11 |         "If you provide --prefix and/or --suffix, those strings will be "
12 |         "prepended/appended to each original header. "
13 |         "For version info, run `bit-version`."
14 |     )
15 | )
16 | 
17 | required = parser.add_argument_group('required arguments')
18 | 
19 | required.add_argument("-i", "--input-fasta", metavar = "<FILE>", help = "starting fasta file", action = "store", required = True)
20 | parser.add_argument("-w", "--wanted-text", metavar = "<STR>", help = 'base name to give seqs when renaming to "<wanted_text>_<n>" (default: "Seq")', action = "store", default = "Seq")
21 | parser.add_argument("-o", "--output-fasta", metavar = "<FILE>", help = 'output fasta file (default: "renamed.fasta").', default = "renamed.fasta")
22 | 
23 | parser.add_argument("--prefix", metavar = "<STR>", help = "prepend this text to the original header (include separator if wanted)", default = "")
24 | parser.add_argument("--suffix", metavar = "<STR>", help = "append this text to the original header (include separator if wanted)", default = "")
25 | 
26 | 
27 | if len(sys.argv)==1:
28 |     parser.print_help(sys.stderr)
29 |     sys.exit(0)
30 | 
31 | args = parser.parse_args()
32 | 
33 | do_numbering = (args.prefix == "" and args.suffix == "")
34 | 
35 | try:
36 |     input_fasta = open(args.input_fasta, "r")
37 | except FileNotFoundError:
38 |     print(f"Error: Input file '{args.input_fasta}' not found.")
39 |     sys.exit(1)
40 | 
41 | out_fasta = open(args.output_fasta, "w")
42 | 
43 | if do_numbering:
44 |     counter = 0
45 |     for seq_record in SeqIO.parse(input_fasta, "fasta"):
46 |         counter += 1
47 |         out_fasta.write(f">{args.wanted_text}_{counter}\n")
48 |         out_fasta.write(str(seq_record.seq) + "\n")
49 | 
50 | else:
51 | 
52 |     for seq_record in SeqIO.parse(input_fasta, "fasta"):
53 |         # using description so it preserves the full header
54 |         original_header = seq_record.description
55 | 
56 |         new_header = f"{args.prefix}{original_header}{args.suffix}"
57 |         out_fasta.write(f">{new_header}\n{seq_record.seq}\n")
58 | 
59 | input_fasta.close()
60 | out_fasta.close()
61 | 


--------------------------------------------------------------------------------
/bit/bit-reorder-fasta:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | from Bio import SeqIO
 4 | import sys
 5 | import argparse
 6 | import os
 7 | 
 8 | parser = argparse.ArgumentParser('This script takes a multifasta file and reorders the sequences according to the headers provided. For version info, run `bit-version`.')
 9 | 
10 | required = parser.add_argument_group('required arguments')
11 | 
12 | required.add_argument("-i", "--input-fasta", metavar = "<FILE>", help = "Original fasta file", action = "store", required = True)
13 | required.add_argument("-w", "--wanted-sequence-order", metavar = "<FILE>", help = "Single-column file with headers in desired order", action = "store", dest = "ordered_headers", required = True)
14 | parser.add_argument("-o", "--output-fasta", help = 'Reordered output fasta (default: "reordered.fasta")', default = "reordered.fasta")
15 | 
16 | 
17 | if len(sys.argv)==1:
18 |     parser.print_help(sys.stderr)
19 |     sys.exit(0)
20 | 
21 | args = parser.parse_args()
22 | 
23 | ordered_seqs = open(args.ordered_headers, "r")
24 | 
25 | ordered_list = list(line.strip() for line in ordered_seqs)
26 | 
27 | fasta_dict = SeqIO.index(args.input_fasta, "fasta")
28 | 
29 | fasta_out = open(args.output_fasta, "wb")
30 | 
31 | for header in ordered_list:
32 |     fasta_out.write(fasta_dict.get_raw(header))
33 | 
34 | ordered_seqs.close()
35 | fasta_out.close()
36 | 


--------------------------------------------------------------------------------
/bit/bit-slim-down-go-terms:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | ## learned most goatools/python things from this great tutorial: GO Tutorial in Python - Solutions.ipynb, which comes from here: http://gohandbook.org/doku.php ; https://nbviewer.jupyter.org/urls/dessimozlab.github.io/go-handbook/GO%20Tutorial%20in%20Python%20-%20Solutions.ipynb
 4 | 
 5 | import os
 6 | import argparse
 7 | import pandas as pd
 8 | import sys
 9 | import subprocess
10 | 
11 | parser = argparse.ArgumentParser(description = 'This script wraps the goatools `map_to_slim.py` program (github.com/tanghaibao/Goatools#map-go-terms-to-goslim-terms). \
12 |                                               See there for more details, and if you use it in your work, be sure to properly cite them :) \
13 |                                               https://www.nature.com/articles/s41598-018-28948-z. It is included here to streamline integration with \
14 |                                               with the GO databases stored with `bit` and programs like `bit-summarize-go-annotations`. Stored databases \
15 |                                               can be updated with `bit-update-go-dbs`. For version info, run `bit-version`.')
16 | 
17 | required = parser.add_argument_group('required arguments')
18 | 
19 | required.add_argument("-a", "--association-file", metavar = "<FILE>",
20 |                       help = "Input annotations file. 2-column, tab-delimited, where the first column holds gene IDs, and the second column holds GO terms (can be multiple delimited with a semi-colon).",
21 |                       action = "store", dest = "input_ass_file", required = True)
22 | parser.add_argument("-g", "--initial-GO-obo-file", metavar = "<FILE>",
23 |                     help='Initial GO obo file holding relationships of all terms used to perform the annotation (e.g. from: geneontology.org/docs/download-ontology/). By default \
24 |                                                          this program will use "go-basic.obo" that is stored with `bit`. Or a different obo-formatted file can be specified here.',
25 |                     action = "store", dest = "initial_obo", default = "go_basic")
26 | parser.add_argument("-s", "--slimmed-GO-obo-file", metavar = "<FILE>",
27 |                     help = 'Slimmed GO obo file holding relationships to collapse GO terms (e.g. from: geneontology.org/docs/download-ontology/#subsets;). By default will \
28 |                                                          use "goslim_metagenomics.obo" that is stored with `bit`. Or a different obo-formatted file can be specified here.',
29 |                     action = "store", dest = "slimmed_obo", default = "goslim_metagenomics")
30 | parser.add_argument("-m", "--mode", help = 'Set if the slimmer should return only direct ancestors, or all ancestors. Default setting is to return all.',
31 |                     choices=["all", "direct"], action = "store", dest = "mode", default = "all")
32 | 
33 | parser.add_argument("-o", "--output-file", metavar = "<FILE>", help = 'Name for output slimmed annotation file. (default: "GO-slimmed.tsv").', action = "store", dest = "output_tab", default = "GO-slimmed.tsv")
34 | 
35 | 
36 | if len(sys.argv)==1:
37 |   parser.print_help(sys.stderr)
38 |   sys.exit(0)
39 | 
40 | args = parser.parse_args()
41 | 
42 | ### checking and setting up obo file locations
43 | go_data_dir = os.environ["GO_DB_DIR"]
44 | 
45 | ## downloading default GO databases if they are not present already
46 | checking_db_dir = subprocess.run(["helper-bit-setup-GO-dbs"])
47 | 
48 | if args.initial_obo == "go_basic":
49 |     initial_obo = go_data_dir + "go-basic.obo"
50 | 
51 | else:
52 |     initial_obo = args.initial_obo
53 | 
54 | if args.slimmed_obo == "goslim_metagenomics":
55 |     slim_obo = go_data_dir + "goslim_metagenomics.obo"
56 | 
57 | else:
58 |     slim_obo = args.slimmed_obo
59 | 
60 | ### building and running call to map_to_slim.py
61 | with open(args.output_tab, "w") as output:
62 |     map_to_slim = subprocess.run(["map_to_slim.py", "--association_file", args.input_ass_file, "--slim_out", args.mode, initial_obo, slim_obo], stdout=output)
63 |     map_to_slim
64 | 
65 | 


--------------------------------------------------------------------------------
/bit/bit-split-multifasta:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | from Bio import SeqIO
 4 | from pathlib import Path
 5 | import sys
 6 | import argparse
 7 | import os
 8 | 
 9 | parser = argparse.ArgumentParser(description = 'This script will split a multifasta into individual fasta files, each file named with the header of the sequence within it, written to a new subdirectory. (It expects standard characters in the headers only, e.g. no spaces or special characters). For version info, run `bit-version`.')
10 | 
11 | required = parser.add_argument_group('required arguments')
12 | 
13 | required.add_argument("-i", "--input-fasta", metavar = "<FILE>", help = "Starting fasta file", action = "store", required = True)
14 | parser.add_argument("-d", "--subdirectory-name", help = 'Name of new subdirectory holding split sequences (default: "sub")', action = "store", dest = "subdirectory", default = "sub")
15 | 
16 | if len(sys.argv)==1:
17 |     parser.print_help(sys.stderr)
18 |     sys.exit(0)
19 | 
20 | args = parser.parse_args()
21 | 
22 | out_dir = Path(str(args.subdirectory))
23 | 
24 | Path.mkdir(out_dir, parents=True, exist_ok=True)
25 | 
26 | in_fasta = open(args.input_fasta, "r")
27 | 
28 | for seq_record in SeqIO.parse(in_fasta, "fasta"):
29 |     curr_header = str(seq_record.id)
30 | 
31 |     curr_out = open(str(args.subdirectory) + "/" + curr_header + ".fa", "w")
32 |     curr_out.write(">" + str(seq_record.id) + "\n" + str(seq_record.seq) + "\n")
33 |     curr_out.close()
34 | 
35 | in_fasta.close()
36 | 


--------------------------------------------------------------------------------
/bit/bit-summarize-assembly:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | import pandas as pd
  4 | import pyfastx
  5 | import sys
  6 | import argparse
  7 | import os
  8 | 
  9 | 
 10 | parser = argparse.ArgumentParser(description = 'This script outputs general summary stats for an assembly provided \
 11 |                                               in fasta format. If given an output file, writes out a tsv, otherwise \
 12 |                                               prints to the screen. "Ambiguous characters" reports total counts of \
 13 |                                               of any letter that is not "A", "T", "C", or "G". For version info, run \
 14 |                                               `bit-version`.')
 15 | 
 16 | required = parser.add_argument_group('required arguments')
 17 | 
 18 | required.add_argument("input_assembly", metavar = "<FILE(s)>", type = str, nargs = "+", help = "Input assembly file(s).")
 19 | 
 20 | parser.add_argument("-o", "--output-tsv", metavar = "<FILE>", help = 'Name of output tsv file (if none provided, prints to screen)', action = "store", default = False)
 21 | parser.add_argument("-t", "--transpose-output-tsv", help = 'Set this flag if we want to have the output table have genomes as rows rather than columns.', action = "store_true")
 22 | 
 23 | 
 24 | if len(sys.argv)==1:
 25 |     parser.print_help(sys.stderr)
 26 |     sys.exit(0)
 27 | 
 28 | args = parser.parse_args()
 29 | 
 30 | ## setting up master dataframe
 31 | df_colnames = []
 32 | for assembly in args.input_assembly:
 33 | 
 34 |     assembly_base = os.path.basename(assembly)
 35 | 
 36 |     df_colnames.append(assembly_base.rsplit(".", 1)[0])
 37 | 
 38 | # checking for a situation where inputs may have the same basename, due to being from different directories
 39 | # if so, setting a flag and reporting them as the full input paths instead of just basenames
 40 | use_paths_instead_of_basenames = False
 41 | 
 42 | for assembly in df_colnames:
 43 | 
 44 |     num_occurences = 0
 45 | 
 46 |     for assembly_2 in df_colnames:
 47 | 
 48 |         if assembly_2 == assembly:
 49 | 
 50 |             num_occurences += 1
 51 | 
 52 |     if num_occurences > 1:
 53 | 
 54 |         use_paths_instead_of_basenames = True
 55 | 
 56 | if use_paths_instead_of_basenames:
 57 | 
 58 |     df_colnames = []
 59 | 
 60 |     for assembly in args.input_assembly:
 61 | 
 62 |         df_colnames.append(assembly.rsplit(".", 1)[0])
 63 | 
 64 | 
 65 | ## creating output table foundation
 66 | df_index = ["Assembly", "Total contigs", "Total length", "Ambiguous characters",
 67 |             "GC content", "Maximum contig length", "Minimum contig length", "N50",
 68 |             "N75", "N90", "L50", "L75", "L90", "Num. contigs >= 100",
 69 |             "Num. contigs >= 500", "Num. contigs >= 1000", "Num. contigs >= 5000",
 70 |             "Num. contigs >= 10000", "Num. contigs >= 50000", "Num. contigs >= 100000"]
 71 | 
 72 | df = pd.DataFrame(columns = df_colnames, index = df_index)
 73 | 
 74 | for assembly in args.input_assembly:
 75 | 
 76 |     if use_paths_instead_of_basenames:
 77 | 
 78 |         assembly_name = assembly.rsplit(".", 1)[0]
 79 | 
 80 |     else:
 81 |         assembly_base = os.path.basename(assembly)
 82 | 
 83 |         assembly_name = assembly_base.rsplit(".", 1)[0]
 84 | 
 85 |     try:
 86 |         df.at["Assembly", str(assembly_name)] = assembly_name
 87 |     except AttributeError:
 88 |         print("  An attribute exception was thrown by pandas. Maybe the inputs don't have unique names?")
 89 |         print("  As written, this cuts off the extension based on last period to generate names.")
 90 |         sys.exit(1)
 91 | 
 92 |     # putting in a catch if file is empty (which can happen if an assembly produced no contigs)
 93 |         # this will leave it in the table, but with NAs (written out as "NA")
 94 |     if os.stat(assembly).st_size == 0:
 95 |         continue
 96 | 
 97 |     fasta = pyfastx.Fasta(assembly)
 98 | 
 99 |     df.at["Total contigs", str(assembly_name)] = len(fasta)
100 |     df.at["Total length", str(assembly_name)] = fasta.size
101 | 
102 |     num_ambiguous_chars = 0
103 |     for key in fasta.composition:
104 |         if key not in ["A","T","G","C"]:
105 |             num_ambiguous_chars += fasta.composition[key]
106 | 
107 |     df.at["Ambiguous characters", str(assembly_name)] = num_ambiguous_chars
108 |     df.at["GC content", str(assembly_name)] = round(fasta.gc_content, 2)
109 |     df.at["Maximum contig length", str(assembly_name)] = len(fasta.longest)
110 |     df.at["Minimum contig length", str(assembly_name)] = len(fasta.shortest)
111 | 
112 |     info_at_50 = fasta.nl(50)
113 |     info_at_75 = fasta.nl(75)
114 |     info_at_90 = fasta.nl(90)
115 |     df.at["N50", str(assembly_name)] = info_at_50[0]
116 |     df.at["N75", str(assembly_name)] = info_at_75[0]
117 |     df.at["N90", str(assembly_name)] = info_at_90[0]
118 |     df.at["L50", str(assembly_name)] = info_at_50[1]
119 |     df.at["L75", str(assembly_name)] = info_at_75[1]
120 |     df.at["L90", str(assembly_name)] = info_at_90[1]
121 | 
122 |     df.at["Num. contigs >= 100", str(assembly_name)] = fasta.count(100)
123 |     df.at["Num. contigs >= 500", str(assembly_name)] = fasta.count(500)
124 |     df.at["Num. contigs >= 1000", str(assembly_name)] = fasta.count(1000)
125 |     df.at["Num. contigs >= 5000", str(assembly_name)] = fasta.count(5000)
126 |     df.at["Num. contigs >= 10000", str(assembly_name)] = fasta.count(10000)
127 |     df.at["Num. contigs >= 50000", str(assembly_name)] = fasta.count(50000)
128 |     df.at["Num. contigs >= 100000", str(assembly_name)] = fasta.count(100000)
129 | 
130 |     # removing intermediate index file
131 |     os.remove(assembly + ".fxi")
132 | 
133 | 
134 | if args.output_tsv:
135 |     # transposing if wanted:
136 |     if args.transpose_output_tsv:
137 |         df = df.T
138 |         df.to_csv(args.output_tsv, sep="\t", index=False, na_rep = "NA")
139 | 
140 |     else:
141 |         df.to_csv(args.output_tsv, sep="\t", header=False, na_rep = "NA")
142 | 
143 | else:
144 |     print("")
145 |     print(df.to_string(header=False))
146 |     print("")
147 | 


--------------------------------------------------------------------------------
/bit/bit-summarize-column:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | import pandas as pd
  4 | import numpy as np
  5 | import sys
  6 | import argparse
  7 | import os
  8 | 
  9 | 
 10 | parser = argparse.ArgumentParser(description = 'This script outputs general summary stats for a numeric column. It can take stdin or \
 11 |                                               a file as input. It will run on the first (or only column) if not specified. Otherwise \
 12 |                                               you can indicate which column to summarize by column position or name. For version info, \
 13 |                                               run `bit-version`.')
 14 | 
 15 | required = parser.add_argument_group('required arguments')
 16 | 
 17 | required.add_argument("-i", "--input-file", metavar = "<FILE>", type = argparse.FileType('r'), default = '-', help = "Input file or stdin if none provided")
 18 | 
 19 | parser.add_argument("-c", "--column", metavar = "<TEXT>", help = "Specify the target column to summarize. Can be a number specifying the column index (1-based, like unix cut/awk), \
 20 |                                                      or can be a column name if also including the `--header` flag. (default: 1)", action = "store", default = 1)
 21 | 
 22 | parser.add_argument("-d", "--delimiter", metavar = "<STR>", help = "Specify the delimiter (default = '\\t')", action = "store", default = "\t")
 23 | 
 24 | parser.add_argument("--header", help = "Add this flag if your input has a header with column names", action = "store_true")
 25 | 
 26 | args = parser.parse_args()
 27 | 
 28 | ## help menu access ##
 29 | # this handles if no standard in was provided and no -i input file was provided
 30 | if sys.stdin.isatty():
 31 | 
 32 |     if args.input_file.name == "<stdin>":
 33 |         parser.print_help(sys.stderr)
 34 |         sys.exit(0)
 35 | 
 36 | ## reading in input
 37 | if args.header:
 38 |     input_header = 0
 39 | else:
 40 |     input_header = None
 41 | 
 42 | input_df = pd.read_csv(args.input_file, sep = args.delimiter, header = input_header)
 43 | 
 44 | ## getting target column
 45 | try:
 46 |     args.column = int(args.column)
 47 | except:
 48 |     pass
 49 | 
 50 | if isinstance(args.column, int):
 51 |     column = args.column - 1
 52 |     target_array = input_df.iloc[: , column].to_numpy()
 53 | 
 54 |     # removing first entry if it is a string, and therefore likely a header
 55 |     if isinstance(target_array.flat[0], str):
 56 | 
 57 |         target_array = np.delete(target_array, 0)
 58 | 
 59 |         # and converting all to integers
 60 |         target_array = target_array.astype(int)
 61 | 
 62 | elif isinstance(args.column, str):
 63 | 
 64 |     # checking header was set to true if the user specified a column by name
 65 |     if not args.header:
 66 |         print("\n    If trying to specify which column by header name, you need to explicity add the `--header` flag also.\n")
 67 |         parser.print_help(sys.stderr)
 68 |         sys.exit(1)
 69 | 
 70 |     target_array = input_df[args.column].to_numpy()
 71 | 
 72 | # getting wanted values
 73 | input_n = target_array.size
 74 | input_min = np.round(np.min(target_array), decimals = 2)
 75 | input_max = np.round(np.max(target_array), decimals = 2)
 76 | input_mean = np.round(np.mean(target_array), decimals = 2)
 77 | input_median = np.round(np.median(target_array), decimals = 2)
 78 | input_stdev = np.round(np.std(target_array), decimals = 2)
 79 | percentile_1 = np.round(np.percentile(target_array, 1), decimals = 2)
 80 | percentile_5 = np.round(np.percentile(target_array, 5), decimals = 2)
 81 | percentile_10 = np.round(np.percentile(target_array, 10), decimals = 2)
 82 | percentile_25 = np.round(np.percentile(target_array, 25), decimals = 2)
 83 | percentile_50 = np.round(np.percentile(target_array, 50), decimals = 2)
 84 | percentile_75 = np.round(np.percentile(target_array, 75), decimals = 2)
 85 | percentile_90 = np.round(np.percentile(target_array, 90), decimals = 2)
 86 | percentile_95 = np.round(np.percentile(target_array, 95), decimals = 2)
 87 | percentile_99 = np.round(np.percentile(target_array, 99), decimals = 2)
 88 | 
 89 | 
 90 | # reporting
 91 | print(f"\n  Column '{args.column}' summary\n")
 92 | 
 93 | print(f"    {'N:':<15} {input_n}")
 94 | print(f"    {'Min:':<15} {input_min}")
 95 | print(f"    {'Max:':<15} {input_max}")
 96 | print(f"    {'Mean:':<15} {input_mean}")
 97 | print(f"    {'Median:':<15} {input_median}")
 98 | print(f"    {'StDev:':<15} {input_stdev}\n")
 99 | print(f"    Percentiles:\n")
100 | print(f"        {'1st:':<11} {percentile_1}")
101 | print(f"        {'5th:':<11} {percentile_5}")
102 | print(f"        {'10th:':<11} {percentile_10}")
103 | print(f"        {'25th:':<11} {percentile_25}")
104 | print(f"        {'50th:':<11} {percentile_50}")
105 | print(f"        {'75th:':<11} {percentile_75}")
106 | print(f"        {'90th:':<11} {percentile_90}")
107 | print(f"        {'95th:':<11} {percentile_95}")
108 | print(f"        {'99th:':<11} {percentile_99}")
109 | print("")
110 | 


--------------------------------------------------------------------------------
/bit/bit-update-go-dbs:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -e
 4 | 
 5 | if [ ! -z $1 ]; then
 6 |     printf "\n  This script updates the GO obo files \"go-basic.obo\" \n"
 7 |     printf "  and \"goslim_metagenomics.obo\" from the GO reference site\n"
 8 |     printf "  (http://geneontology.org/docs/download-ontology/) to support\n"
 9 |     printf "  programs like \`bit-summarize-go-annotations\`. For version info,\n"
10 |     printf "  run \`bit-version\`.\n\n"
11 |     printf "    Usage:\n\t bit-update-go-dbs\n\n"
12 |     exit
13 | fi
14 | 
15 | # setting colors to use
16 | GREEN='\033[0;32m'
17 | NC='\033[0m'
18 | 
19 | printf "\n"
20 | 
21 | curl -L --retry 10 -o ${GO_DB_DIR}/go-basic.obo http://purl.obolibrary.org/obo/go/go-basic.obo
22 | curl -L --retry 10 -o ${GO_DB_DIR}/goslim_metagenomics.obo http://current.geneontology.org/ontology/subsets/goslim_metagenomics.obo
23 | 
24 | printf "\n\t\t${GREEN}The GO basic and metagenomics slim obo files have been updated!${NC}\n\n"
25 | 


--------------------------------------------------------------------------------
/bit/bit-update-ncbi-taxonomy:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -e
 4 | 
 5 | if [ $1 ]; then
 6 |     printf "\n  This script updates the NCBI taxonomy database taxonkit uses.\n"
 7 |     printf "  For version info, run \`bit-version\`.\n\n"
 8 |     printf "    Usage:\n\t bit-update-ncbi-taxonomy\n\n"
 9 |     exit
10 | fi
11 | 
12 | # setting colors to use
13 | GREEN='\033[0;32m'
14 | RED='\033[0;31m'
15 | NC='\033[0m'
16 | 
17 | printf "\n"
18 | 
19 | curl --retry 10 -o ${TAXONKIT_DB}/taxdump.tar.gz ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
20 | 
21 | tar -xzf ${TAXONKIT_DB}/taxdump.tar.gz -C ${TAXONKIT_DB}
22 | 
23 | rm ${TAXONKIT_DB}/taxdump.tar.gz
24 | 
25 | printf "\n\t\t${GREEN}The NCBI taxonomy database info has been updated!${NC}\n\n"
26 | 


--------------------------------------------------------------------------------
/bit/bit-version:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | version='1.10.9'
 4 | 
 5 | GREEN='\033[0;32m'
 6 | NC='\033[0m'
 7 | 
 8 | printf "\n\t\tBioinformatics Tools ${GREEN}v${version}${NC}\n"
 9 | printf "\t\tgithub.com/AstrobioMike/bit\n\n"
10 | 
11 | printf "    If you happen to find this toolset useful in your work, please be sure to\n"
12 | printf "    cite it :)\n\n"
13 | 
14 | printf "  Lee M. bit: a multipurpose collection of bioinformatics tools. F1000Research 2022, 11:122\n"
15 | printf "  https://doi.org/10.12688/f1000research.79530.1\n\n"
16 | 
17 | today=$(date +'%A')
18 | 
19 | printf "                                                   ${GREEN}Happy $today :)${NC}\n\n"
20 | 


--------------------------------------------------------------------------------
/bit/helper-bit-check-or-setup-GTDB-files.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | """
  4 | This is a helper program of bit, that I initially wrote for GToTree (https://github.com/AstrobioMike/GToTree/wiki).
  5 | It is for setting up reference files for the glorious Genome Taxonomy Database (gtdb.ecogenomic.org/).
  6 | 
  7 | For examples, please visit the GToTree wiki here: https://github.com/AstrobioMike/GToTree/wiki/example-usage
  8 | """
  9 | 
 10 | import sys
 11 | import os
 12 | import urllib.request
 13 | import pandas as pd
 14 | import textwrap
 15 | import argparse
 16 | 
 17 | parser = argparse.ArgumentParser(description = "This is a helper program to facilitate setting up the reference files for the \
 18 |                                               glorious Genome Taxonomy Database (gtdb.ecogenomic.org). It's really meant for internal \
 19 |                                               use only by other bit programs.")
 20 | 
 21 | args = parser.parse_args()
 22 | 
 23 | ################################################################################
 24 | 
 25 | def main():
 26 | 
 27 |     ## checking env variable is set and writable
 28 |     check_location_var_is_set_and_writable("GTDB_DIR")
 29 | 
 30 |     ## setting up ref GTDB files if needed
 31 |     check_and_or_get_gtdb_files(os.environ["GTDB_DIR"])
 32 | 
 33 | ################################################################################
 34 | 
 35 | 
 36 | # setting some colors
 37 | tty_colors = {
 38 |     'green' : '\033[0;32m%s\033[0m',
 39 |     'yellow' : '\033[0;33m%s\033[0m',
 40 |     'red' : '\033[0;31m%s\033[0m'
 41 | }
 42 | 
 43 | 
 44 | ### functions ###
 45 | def color_text(text, color='green'):
 46 |     if sys.stdout.isatty():
 47 |         return tty_colors[color] % text
 48 |     else:
 49 |         return text
 50 | 
 51 | 
 52 | def wprint(text):
 53 |     print(textwrap.fill(text, width=80, initial_indent="  ",
 54 |           subsequent_indent="  ", break_on_hyphens=False))
 55 | 
 56 | 
 57 | def check_location_var_is_set_and_writable(variable):
 58 | 
 59 |     # making sure there is an env variable
 60 |     try:
 61 |         path = os.environ[variable]
 62 | 
 63 |         if path == "":
 64 |             raise
 65 | 
 66 |     except:
 67 |         print()
 68 |         wprint(color_text("The environment variable '" + str(variable) + "' does not seem to be set :(", "red"))
 69 |         print()
 70 |         wprint("Try to set it with `bit-data-locations set`, then try again.")
 71 |         print("\nExiting for now.\n")
 72 |         sys.exit(1)
 73 | 
 74 |     # making sure path is writable for the user
 75 |     path_writable = os.access(path, os.W_OK)
 76 | 
 77 |     if not path_writable:
 78 |         print()
 79 |         wprint(color_text("The environment variable '" + str(variable) + "' does not seem to be writable :(", "red"))
 80 |         print()
 81 |         wprint("Try to set it somewhere else with `bit-data-locations set`, then try again.")
 82 |         print("\nExiting for now.\n")
 83 |         sys.exit(1)
 84 | 
 85 |     return()
 86 | 
 87 | 
 88 | def gen_gtdb_tab(location):
 89 |     """ downloads and parses the GTDB info tables """
 90 | 
 91 |     # getting archaea
 92 |     arc_tar_gz = urllib.request.urlopen("https://data.gtdb.ecogenomic.org/releases/latest/ar53_metadata.tar.gz")
 93 |     arc_tab = pd.read_csv(arc_tar_gz, sep="\t", compression="gzip", on_bad_lines = 'skip', header=0, low_memory=False)
 94 |     arc_tab.rename(columns={arc_tab.columns[0]:"accession"}, inplace=True)
 95 |     arc_tab.dropna(inplace=True, how="all")
 96 | 
 97 |     # getting bacteria
 98 |     bac_tar_gz = urllib.request.urlopen("https://data.gtdb.ecogenomic.org/releases/latest/bac120_metadata.tar.gz")
 99 |     bac_tab = pd.read_csv(bac_tar_gz, sep="\t", compression="gzip", on_bad_lines = 'skip', header=0, low_memory=False)
100 |     bac_tab.rename(columns={bac_tab.columns[0]:"accession"}, inplace=True)
101 |     bac_tab.dropna(inplace=True, how="all")
102 | 
103 |     # combining
104 |     gtdb_tab = pd.concat([arc_tab, bac_tab])
105 | 
106 |     # splitting gtdb taxonomy column into 7 and dropping the single column
107 |     domain, phylum, rclass, order, family, genus, species = [], [], [], [], [], [], []
108 | 
109 |     for index, row in gtdb_tab.iterrows():
110 |         curr_acc = row["accession"]
111 |         tax_list = row["gtdb_taxonomy"].split(";")
112 | 
113 |         if len(tax_list) != 7:
114 |             wprint(color_text("GTDB entry " + curr_acc + " doesn't seem to have 7-column lineage info. Something is likely wrong :(", "yellow"))
115 |             print("")
116 |             wprint("If this continues to happen, please file an issue at github.com/AstrobioMike/bit/issues")
117 |             print("")
118 |             wprint("Aborting for now.")
119 |             print("")
120 |             sys.exit(0)
121 | 
122 |         else:
123 |             domain.append(tax_list[0][3:])
124 |             phylum.append(tax_list[1][3:])
125 |             rclass.append(tax_list[2][3:])
126 |             order.append(tax_list[3][3:])
127 |             family.append(tax_list[4][3:])
128 |             genus.append(tax_list[5][3:])
129 |             species.append(tax_list[6][3:])
130 | 
131 |     gtdb_tab.insert(1, "species", species)
132 |     gtdb_tab.insert(1, "genus", genus)
133 |     gtdb_tab.insert(1, "family", family)
134 |     gtdb_tab.insert(1, "order", order)
135 |     gtdb_tab.insert(1, "class", rclass)
136 |     gtdb_tab.insert(1, "phylum", phylum)
137 |     gtdb_tab.insert(1, "domain", domain)
138 | 
139 |     # writing out
140 |     gtdb_tab.to_csv(location + "GTDB-arc-and-bac-metadata.tsv", index=False, sep="\t")
141 | 
142 |     gtdb_version_info = urllib.request.urlretrieve("https://data.gtdb.ecogenomic.org/releases/latest/VERSION", location + "GTDB-version-info.txt")
143 | 
144 | 
145 | def check_and_or_get_gtdb_files(GTDB_DIR):
146 |     """ checks for and sets up ref GTDB files if needed """
147 | 
148 |     if os.path.exists(GTDB_DIR + "GTDB-arc-and-bac-metadata.tsv") and os.path.exists(GTDB_DIR + "GTDB-version-info.txt"):
149 | 
150 |         sys.exit(0)
151 | 
152 |     # generating when table doesn't exist yet
153 |     else:
154 |         print("")
155 |         wprint(color_text("Downloading and parsing archaeal and bacterial metadata tables from GTDB (only needs to be done once, or when a new version is available)...", "yellow"))
156 |         print("")
157 | 
158 |         gen_gtdb_tab(GTDB_DIR)
159 | 
160 | 
161 | if __name__ == "__main__":
162 |     main()
163 | 


--------------------------------------------------------------------------------
/bit/helper-bit-combine-bracken.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | import os
  4 | import argparse
  5 | import sys
  6 | 
  7 | parser = argparse.ArgumentParser(description = 'This script is for combining bracken output tables. It was modified\
  8 |                                               from the `combine_bracken_outputs.py` script provided by Jennifer\
  9 |                                               Lu (jlu26@jhmi.edu) that comes with bracken for use with the\
 10 |                                               `bit-combine-bracken-and-add-lineage` script. For version info,\
 11 |                                               run `bit-version`.')
 12 | 
 13 | 
 14 | required = parser.add_argument_group('required arguments')
 15 | 
 16 | required.add_argument("-i", "--input-files", metavar = "<FILE(s)>", nargs = "+", type = str, help = "space-delimited list of bracken output files", action = "store", required = True)
 17 | parser.add_argument("-n", "--sample-names", metavar = "<NAME(s)>", help = 'Sample names provided as a comma-delimited list (by default will use basename of input files)', action = "store", default = '')
 18 | parser.add_argument("-o", "--output-file", metavar = "<FILE>", help='Output file of combined tables (default: "combined-bracken.tsv")', action = "store", default = "combined-bracken.tsv")
 19 | 
 20 | if len(sys.argv)==1:
 21 |     parser.print_help(sys.stderr)
 22 |     sys.exit(0)
 23 | 
 24 | args = parser.parse_args()
 25 | 
 26 | # setting up variables
 27 | sample_counts = {}
 28 | total_counts = {}
 29 | all_samples = []
 30 | 
 31 | # setting sample names and intializing counts
 32 | if len(args.sample_names) == 0:
 33 |     for file in args.input_files:
 34 |         curr_sample = os.path.basename(file)
 35 |         total_counts[curr_sample] = 0
 36 |         all_samples.append(curr_sample)
 37 | 
 38 | else:
 39 |     for curr_sample in args.sample_names.split(","):
 40 |         total_counts[curr_sample] = 0
 41 |         all_samples.append(curr_sample)
 42 | 
 43 | 
 44 | # working on each file
 45 | # initialize level variable
 46 | level = ''
 47 | # initializiing iterator for grabbing sample names
 48 | i = 0
 49 | 
 50 | for file in args.input_files:
 51 | 
 52 |     # storing current sample name
 53 |     curr_name = all_samples[i]
 54 | 
 55 |     # incrementing iterator
 56 |     i += 1
 57 | 
 58 |     with open(file) as f:
 59 |         # skipping header
 60 |         next(f)
 61 |         for line in f:
 62 |             [name, taxid, taxlvl, kreads, areads, estreads, frac] = line.strip().split("\t")
 63 |             estreads = int(estreads)
 64 | 
 65 |             # error checks
 66 |             if name not in sample_counts:
 67 |                 sample_counts[name] = {}
 68 |                 sample_counts[name][taxid] = {}
 69 |             elif taxid != list(sample_counts[name].keys())[0]:
 70 |                 sys.exit("Taxonomy IDs not matching for species %s: (%s\t%s)" % (name, taxid, list(sample_counts[name].keys())[0]))
 71 |             if len(level) == 0:
 72 |                 level = taxlvl
 73 |             elif level != taxlvl:
 74 |                 sys.exit("Taxonomy level not matching between samples :(")
 75 | 
 76 |             # summing counts for current sample
 77 |             total_counts[curr_name] += estreads
 78 |             # adding read counts for that taxa for this sample to the dict holding all samples
 79 |             sample_counts[name][taxid][curr_name] = estreads
 80 | 
 81 | 
 82 | # opening output file
 83 | output_file = open(args.output_file, "w")
 84 | 
 85 | # writing header
 86 | output_file.write("name\ttax_id\ttax_level")
 87 | for name in all_samples:
 88 |     output_file.write("\t%s_num\t%s_frac" % (name, name))
 89 | output_file.write("\n")
 90 | 
 91 | # writing out each sample
 92 | for name in sample_counts:
 93 |     taxid = list(sample_counts[name].keys())[0]
 94 |     output_file.write("%s\t%s\t%s" % (name, taxid, level)) # seems like "level" variable is trusting the last thing was the same for all as it was for the last file's last line, probably true, but then not sure why we check. might return to this
 95 | 
 96 |     #Calculate and print information per sample
 97 |     for sample in all_samples:
 98 |         if sample in sample_counts[name][taxid]:
 99 |             num = sample_counts[name][taxid][sample]
100 |             perc = float(num)/float(total_counts[sample])
101 |             output_file.write("\t%i\t%0.5f" % (num, perc))
102 | 
103 |         # if sample doesn't have counts for this taxa, adding zeroes
104 |         else:
105 |             output_file.write("\t0\t0.00000")
106 | 
107 |     output_file.write("\n")
108 | 
109 | output_file.close()
110 | 


--------------------------------------------------------------------------------
/bit/helper-bit-dl-ncbi-assemblies-parallel.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | ### Helper script used with `bit-dl-ncbi-assemblies` when run in parallel; for version info, run or see `bit-version` ###
 4 | 
 5 | # setting colors to use
 6 | GREEN='\033[0;32m'
 7 | RED='\033[0;31m'
 8 | NC='\033[0m'
 9 | 
10 | my_ext=$2
11 | ext=$3
12 | format=$4
13 | http_flag=$5
14 | 
15 | assembly=$(echo "$1" | cut -f 1)
16 | downloaded_accession=$(echo "$1" | cut -f 2)
17 | 
18 | # storing and building links
19 | base_link=$(echo "$1" | cut -f 9)
20 | 
21 | 
22 | # checking link was actually present (sometimes, very rarely, it is not there)
23 | # if not there, attempting to build ourselves
24 | if [ $base_link == "na" ] || [ -z $base_link ]; then
25 | 
26 |     if [ $http_flag == "false" ]; then
27 |         p1=$(printf "ftp://ftp.ncbi.nlm.nih.gov/genomes/all")
28 |     else
29 |         p1=$(printf "https://ftp.ncbi.nlm.nih.gov/genomes/all")
30 |     fi
31 | 
32 |     # checking if GCF or GCA
33 |     if [[ $assembly == "GCF"* ]]; then
34 |         p2="GCF"
35 |     else
36 |         p2="GCA"
37 |     fi
38 | 
39 |     p3=$(echo $assembly | cut -f 2 -d "_" | cut -c 1-3)
40 |     p4=$(echo $assembly | cut -f 2 -d "_" | cut -c 4-6)
41 |     p5=$(echo $assembly | cut -f 2 -d "_" | cut -c 7-9)
42 | 
43 |     ass_name=$(echo "$1" | cut -f 3)
44 |     end_path=$(paste -d "_" <(echo "$assembly") <(echo "$ass_name"))
45 | 
46 |     base_link=$(paste -d "/" <(echo "$p1") <(echo "$p2") <(echo "$p3") <(echo "$p4") <(echo "$p5") <(echo "$end_path"))
47 | 
48 | else
49 | 
50 |     end_path=$(basename $base_link)
51 | 
52 | fi
53 | 
54 | curl --silent --retry 10 -o ${assembly}${my_ext} "${base_link}/${end_path}${ext}"
55 | 
56 | # grabbing this to check if " XML " is in there
57 |     # when this was first written, trying to download a link that wasn't there would fail
58 |     # now it can download an xml-formated file saying the link wasn't found at NCBI
59 |     # so this let's us check for that, and report and remove it if that's the case
60 | file_command_output=$(file ${assembly}${my_ext})
61 | 
62 | if [ -s ${assembly}${my_ext} ] && [[ ${file_command_output} != *" XML "* ]] && [[ ${file_command_output} != *" XHTML "* ]]; then
63 | 
64 |     printf "\r\t  Successfully downloaded: $assembly"
65 | 
66 | else
67 | 
68 |     printf "\n     ${RED}******************************* ${NC}NOTICE ${RED}*******************************${NC}  \n"
69 |     printf "\t    $assembly's $format file didn't download successfully.\n"
70 |     printf "\t    That file type may not exist for this accession.\n\n"
71 |     printf "\t    Written to \"NCBI-accessions-not-downloaded.txt\".\n"
72 |     printf "     ${RED}********************************************************************** ${NC}\n\n"
73 | 
74 |     echo ${assembly} >> NCBI-accessions-not-downloaded.txt
75 | 
76 |     rm -rf ${assembly}${my_ext}
77 | 
78 | fi


--------------------------------------------------------------------------------
/bit/helper-bit-get-ncbi-assembly-tables:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | """
  4 | This is a helper program of bit taken from my GToTree package (https://github.com/AstrobioMike/GToTree/wiki)
  5 | to download the NCBI assembly summary tables if they are not present, or are more than 4 weeks old.
  6 | """
  7 | 
  8 | import sys
  9 | import os
 10 | import urllib.request
 11 | import argparse
 12 | import shutil
 13 | import textwrap
 14 | from datetime import date, timedelta
 15 | import filecmp
 16 | import tarfile
 17 | import gzip
 18 | 
 19 | parser = argparse.ArgumentParser(description="This is a helper program to download and setup the NCBI assembly summary tables if they are \
 20 |                                               not present, or are older than 4 weeks.", \
 21 |                                  epilog="Ex. usage: helper-bit-get-ncbi-assembly-tables\n")
 22 | 
 23 | parser.add_argument("-P", "--use-http", help='Use http instead of ftp', action = "store_true")
 24 | parser.add_argument("-f", "--force-update", help='Force an update regardless of last date retrieved', action = "store_true")
 25 | 
 26 | 
 27 | args = parser.parse_args()
 28 | 
 29 | 
 30 | ################################################################################
 31 | 
 32 | def main():
 33 | 
 34 |     NCBI_assembly_data_dir = check_location_var_is_set()
 35 | 
 36 |     data_present = check_if_data_present_and_less_than_4_weeks_old(NCBI_assembly_data_dir)
 37 | 
 38 |     if data_present and not args.force_update:
 39 |         exit()
 40 | 
 41 |     else:
 42 | 
 43 |         get_NCBI_assembly_summary_data(NCBI_assembly_data_dir)
 44 | 
 45 | ################################################################################
 46 | 
 47 | 
 48 | # setting some colors
 49 | tty_colors = {
 50 |     'green' : '\033[0;32m%s\033[0m',
 51 |     'yellow' : '\033[0;33m%s\033[0m',
 52 |     'red' : '\033[0;31m%s\033[0m'
 53 | }
 54 | 
 55 | 
 56 | ### functions ###
 57 | def color_text(text, color='green'):
 58 |     if sys.stdout.isatty():
 59 |         return tty_colors[color] % text
 60 |     else:
 61 |         return text
 62 | 
 63 | 
 64 | def wprint(text):
 65 |     print(textwrap.fill(text, width=80, initial_indent="                      ",
 66 |           subsequent_indent="  ", break_on_hyphens=False))
 67 | 
 68 | 
 69 | def check_location_var_is_set():
 70 | 
 71 |     # making sure there is a KO_data_dir env variable
 72 |     try:
 73 |         NCBI_data_dir = os.environ['NCBI_assembly_data_dir']
 74 |     except:
 75 |         wprint(color_text("The environment variable 'NCBI_assembly_data_dir'  does not seem to be set :(", "yellow"))
 76 |         wprint("This shouldn't happen, check on things with `bit-data-locations check`.")
 77 |         print("")
 78 |         sys.exit(0)
 79 | 
 80 |     return(NCBI_data_dir)
 81 | 
 82 | 
 83 | def check_if_data_present_and_less_than_4_weeks_old(location):
 84 | 
 85 |     # seeing if present already and if it was downloaded less than 4 weeks ago
 86 |     # if this function returns True, then we don't do anything
 87 |     # if it returns False, then we need to download things
 88 |     table_path = os.path.join(str(location), "ncbi-assembly-info.tsv")
 89 |     date_retrieved_path = os.path.join(str(location), "date-retrieved.txt")
 90 | 
 91 |     # if either file is missing, we are going to download, we also package the date-retrieved file empty with conda to retain directory, so checking it's not empty as well
 92 |     if not os.path.isfile(table_path) or not os.path.isfile(date_retrieved_path) or not os.path.getsize(date_retrieved_path) > 0:
 93 | 
 94 |         if os.path.exists(table_path):
 95 |             os.remove(table_path)
 96 |         if os.path.isdir(date_retrieved_path):
 97 |             shutil.rmtree(date_retrieved_path)
 98 | 
 99 |         return(False)
100 | 
101 |     # if both files are present (and not empty), we are checking if it was downloaded more than 4 weeks ago
102 |     # and will download if it was
103 |     if os.path.isfile(table_path) and os.path.isfile(date_retrieved_path):
104 | 
105 |         # getting current date
106 |         curr_date = date.today()
107 | 
108 |         # reading date it was downloaded
109 |         with open(date_retrieved_path, 'r') as file:
110 |             stored_date = file.read().strip()
111 | 
112 |         # setting to date object
113 |         stored_date_list = stored_date.split(",")
114 |         stored_date = date(int(stored_date_list[0]), int(stored_date_list[1]), int(stored_date_list[2]))
115 | 
116 |         # getting difference
117 |         diff = curr_date - stored_date
118 | 
119 |         # checking if difference is greater than 28 days
120 |         if diff.days > 28:
121 | 
122 |             return(False)
123 | 
124 |         else:
125 | 
126 |             return(True)
127 | 
128 |     else:
129 | 
130 |         return(True)
131 | 
132 | 
133 | def get_NCBI_assembly_summary_data(location):
134 | 
135 |     """ downloads the needed ncbi assembly summary tables and combines them """
136 | 
137 |     # setting links
138 |     if args.use_http:
139 | 
140 |         genbank_link = "https://ftp.ncbi.nlm.nih.gov/genomes/genbank/assembly_summary_genbank.txt"
141 |         refseq_link = "https://ftp.ncbi.nlm.nih.gov/genomes/refseq/assembly_summary_refseq.txt"
142 | 
143 |     else:
144 | 
145 |         genbank_link = "ftp://ftp.ncbi.nlm.nih.gov/genomes/genbank/assembly_summary_genbank.txt"
146 |         refseq_link = "ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/assembly_summary_refseq.txt"
147 | 
148 |     table_path = os.path.join(str(location), "ncbi-assembly-info.tsv")
149 |     refseq_temp_path = os.path.join(str(location), "refseq-assembly-info.tmp")
150 | 
151 |     print(color_text("    Downloading NCBI assembly summaries (only done once, or updated after 4 weeks)...\n", "yellow"))
152 | 
153 |     urllib.request.urlretrieve(genbank_link, table_path)
154 |     urllib.request.urlretrieve(refseq_link, refseq_temp_path)
155 | 
156 |     # combining
157 |     with open (table_path, "a") as final_table:
158 |         with open(refseq_temp_path, "r") as refseq:
159 |             final_table.write(refseq.read())
160 | 
161 |     # removing temp
162 |     if os.path.exists(refseq_temp_path):
163 |         os.remove(refseq_temp_path)
164 | 
165 |     # storing date retrieved
166 |     date_retrieved = str(date.today()).replace("-", ",")
167 |     date_retrieved.replace("-", ",")
168 | 
169 |     date_retrieved_path = os.path.join(str(location), "date-retrieved.txt")
170 | 
171 |     with open(date_retrieved_path, "w") as outfile:
172 |         outfile.write(date_retrieved + "\n")
173 | 
174 | ################################################################################
175 | 
176 | if __name__ == "__main__":
177 |     main()


--------------------------------------------------------------------------------
/bit/helper-bit-get-ncbi-tax-data:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | """
  4 | This is a helper program of mine taken from GToTree (https://github.com/AstrobioMike/GToTree/wiki)
  5 | to download NCBI tax data for using TaxonKit (https://bioinf.shenwei.me/taxonkit/) with
  6 | bit-get-lineage-from-taxids.
  7 | """
  8 | 
  9 | import sys
 10 | import os
 11 | import urllib.request
 12 | import argparse
 13 | import shutil
 14 | import textwrap
 15 | import filecmp
 16 | import tarfile
 17 | import gzip
 18 | 
 19 | parser = argparse.ArgumentParser(description = "This is a helper program to setup NCBI tax data for programs that use TaxonKit (bioinf.shenwei.me/taxonkit/) \
 20 |                                               to retrieve taxonomy info.", \
 21 |                                  epilog = "Ex. usage: helper-bit-get-ncbi-tax-data\n")
 22 | 
 23 | args = parser.parse_args()
 24 | 
 25 | 
 26 | ################################################################################
 27 | 
 28 | def main():
 29 | 
 30 |     NCBI_data_dir = check_location_var_is_set()
 31 | 
 32 |     data_present = check_if_data_present(NCBI_data_dir)
 33 | 
 34 |     if data_present:
 35 |         exit()
 36 | 
 37 |     else:
 38 | 
 39 |         print("")
 40 |         print(color_text("    Downloading required NCBI taxonomy data (only needs to be done once)...\n", "yellow"))
 41 |         get_NCBI_tax_data(NCBI_data_dir)
 42 | 
 43 | 
 44 | ################################################################################
 45 | 
 46 | 
 47 | # setting some colors
 48 | tty_colors = {
 49 |     'green' : '\033[0;32m%s\033[0m',
 50 |     'yellow' : '\033[0;33m%s\033[0m',
 51 |     'red' : '\033[0;31m%s\033[0m'
 52 | }
 53 | 
 54 | 
 55 | ### functions ###
 56 | def color_text(text, color='green'):
 57 |     if sys.stdout.isatty():
 58 |         return tty_colors[color] % text
 59 |     else:
 60 |         return text
 61 | 
 62 | 
 63 | def wprint(text):
 64 |     print(textwrap.fill(text, width=80, initial_indent="  ",
 65 |           subsequent_indent="  ", break_on_hyphens=False))
 66 | 
 67 | 
 68 | def check_location_var_is_set():
 69 | 
 70 |     # making sure there is a KO_data_dir env variable
 71 |     try:
 72 |         NCBI_data_dir = os.environ['TAXONKIT_DB']
 73 |     except:
 74 |         wprint(color_text("The environment variable 'TAXONKIT_DB' does not seem to be set :(", "yellow"))
 75 |         wprint("This should have been handled automatically if things were installed with conda.")
 76 |         wprint("If you can't sort this out, please feel free to post an issue here:")
 77 |         print("        github.com/AstrobioMike/bit/issues\n\n")
 78 |         sys.exit(0)
 79 | 
 80 |     return(NCBI_data_dir)
 81 | 
 82 | 
 83 | def check_if_data_present(location):
 84 | 
 85 |     # seeing if present already
 86 |     # if this function returns True, then data is present
 87 |     # if it returns False, then we need to download things
 88 |     names_path = os.path.join(str(location) + "/names.dmp")
 89 |     nodes_path = os.path.join(str(location) + "/nodes.dmp")
 90 | 
 91 | 
 92 |     if not os.path.isfile(names_path) or not os.path.isfile(nodes_path):
 93 | 
 94 |         if os.path.exists(names_path):
 95 |             os.remove(names_path)
 96 |         if os.path.isdir(nodes_path):
 97 |             shutil.rmtree(nodes_path)
 98 | 
 99 |         return(False)
100 | 
101 |     else:
102 | 
103 |         return(True)
104 | 
105 | 
106 | def get_NCBI_tax_data(location):
107 |     """ downloads the needed ncbi tax data """
108 | 
109 |     taxdump_path = os.path.join(str(location) + "taxdump.tar.gz")
110 | 
111 |     urllib.request.urlretrieve("http://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz", taxdump_path)
112 | 
113 |     # unpacking
114 |     with tarfile.open(taxdump_path) as tarball:
115 |         tarball.extractall(location)
116 | 
117 |     # removing tarball
118 |     os.remove(taxdump_path)
119 | 
120 | 
121 | ################################################################################
122 | 
123 | if __name__ == "__main__":
124 |     main()
125 | 


--------------------------------------------------------------------------------
/bit/helper-bit-parse-assembly-summary-file.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import sys
 4 | import argparse
 5 | import os
 6 | 
 7 | parser = argparse.ArgumentParser(description = 'This script is for parsing NCBI\'s assembly summary file down\
 8 |                                               to the provided accessions. It is used by the `bit-dl-ncbi-assemblies`\
 9 |                                               script. For version info, run `bit-version`.')
10 | 
11 | required = parser.add_argument_group('required arguments')
12 | 
13 | required.add_argument("-a", "--assembly-summary", metavar = "<FILE>", help = "NCBI's assembly summary file", action = "store", dest = "all_assemblies", required = True)
14 | required.add_argument("-w", "--wanted-accessions", metavar = "<FILE>", help = "Single-column file with wanted accessions", action = "store", dest = "wanted_accs", required = True)
15 | parser.add_argument("-o", "--output-file", help = 'Output file of wanted summary info only (default: "wanted.tsv")', action = "store", default = "wanted.tsv")
16 | 
17 | if len(sys.argv)==1:
18 |     parser.print_help(sys.stderr)
19 |     sys.exit(0)
20 | 
21 | args = parser.parse_args()
22 | 
23 | wanted_dict = {}
24 | 
25 | with open(args.wanted_accs, "r") as wanted_accs:
26 | 
27 |     for line in wanted_accs:
28 |         root_acc = line.strip().split(".")[0]
29 |         wanted_dict[str(root_acc)] = line.strip()
30 | 
31 | out_file = open(args.output_file, "w")
32 | 
33 | with open(args.all_assemblies) as assemblies:
34 | 
35 |     for line in assemblies:
36 |         line = line.split("\t")
37 | 
38 |         if line[0].split(".")[0] in wanted_dict:
39 |             dl_acc = str(line[0])
40 | 
41 |             if not dl_acc:
42 |                 dl_acc = "NA"
43 | 
44 |             ass_name = str(line[15])
45 |             if not ass_name:
46 |                 ass_name = "NA"
47 | 
48 |             taxid = str(line[5])
49 |             if not taxid:
50 |                 taxid = "NA"
51 | 
52 |             org_name = str(line[7])
53 |             if not org_name:
54 |                 org_name = "NA"
55 | 
56 |             infra_name = str(line[8])
57 |             if not infra_name:
58 |                 infra_name = "NA"
59 | 
60 |             version_status = str(line[10])
61 |             if not version_status:
62 |                 version_status = "NA"
63 | 
64 |             ass_level = str(line[11])
65 |             if not ass_level:
66 |                 ass_level = "NA"
67 | 
68 |             ftp_path = str(line[19])
69 |             if not ftp_path:
70 |                 ftp_path = "NA"
71 | 
72 |             out_file.write(str(wanted_dict[str(line[0].split(".")[0])]) + "\t" + str(dl_acc) + "\t"  + str(ass_name) + "\t" + str(taxid) + "\t" + str(org_name) + "\t" + str(infra_name) + "\t" + str(version_status) + "\t" + str(ass_level) + "\t" + str(ftp_path) + "\n")
73 | 


--------------------------------------------------------------------------------
/bit/helper-bit-setup-GO-dbs:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | set -e
 3 | 
 4 | ## This is a helper script to setup the GO databases if they aren't already present
 5 | 
 6 | YELLOW='\033[0;33m'
 7 | NC='\033[0m'
 8 | 
 9 | # checking env var is set
10 | 
11 | if [ -z ${GO_DB_DIR} ]; then
12 | 
13 |     printf "${YELLOW}  The environment variable 'GO_DB_DIR' does not seem to be set :(${NC}\n"
14 |     printf "  This should have been handled automatically if things were installed with conda.\n"
15 |     printf "  If you can't sort this out, please feel free to post an issue here:"
16 |     printf "        github.com/AstrobioMike/bit/issues\n\n"
17 | 
18 |     exit
19 | 
20 | fi
21 | 
22 | if [ ! -s ${GO_DB_DIR}/go-basic.obo ] || [ ! -s ${GO_DB_DIR}/goslim_metagenomics.obo ]; then
23 | 
24 |     rm -rf ${GO_DB_DIR}/go-basic.obo ${GO_DB_DIR}/goslim_metagenomics.obo ${GO_DB_DIR}/conda-placeholder
25 | 
26 |     printf "\n    ${YELLOW}Downloading required GO data (only needs to be done once)...${NC}\n"
27 | 
28 |     curl -L --silent --retry 10 -o ${GO_DB_DIR}/go-basic.obo http://purl.obolibrary.org/obo/go/go-basic.obo
29 |     curl -L --silent --retry 10 -o ${GO_DB_DIR}/goslim_metagenomics.obo http://current.geneontology.org/ontology/subsets/goslim_metagenomics.obo
30 | 
31 | fi
32 | 


--------------------------------------------------------------------------------
/bit/helper-bit-update-tax-table-for-seqscreen-go-tax-summary.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | ### Helper script for `bit-cov-summarize-go-annots-with-domains`; for version info, run or see `bit-version` ###
4 | ### generates a table grouping all taxids from Euks, Bacteria, Archaea, and viruses
5 | 
6 | # getting domain info for all taxids in ncbi files and storing in taxonkit data dir
7 | cut -f 1 ${TAXONKIT_DB}/nodes.dmp | taxonkit lineage | taxonkit reformat -r NA | cut -f 1,3 | tr ";" "\t" | cut -f 1,2 | grep -v "NA" > ${TAXONKIT_DB}/taxids-and-domains.tsv
8 | 


--------------------------------------------------------------------------------
/images/bit-cov-analyzer.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AstrobioMike/bit/c050fc3a649225c6f7a9a39e6e28f68831243b8f/images/bit-cov-analyzer.pdf


--------------------------------------------------------------------------------
/images/bit-cov-analyzer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AstrobioMike/bit/c050fc3a649225c6f7a9a39e6e28f68831243b8f/images/bit-cov-analyzer.png


--------------------------------------------------------------------------------
/images/bit-metagenomics-overview.afdesign:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AstrobioMike/bit/c050fc3a649225c6f7a9a39e6e28f68831243b8f/images/bit-metagenomics-overview.afdesign


--------------------------------------------------------------------------------
/images/bit-metagenomics-overview.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AstrobioMike/bit/c050fc3a649225c6f7a9a39e6e28f68831243b8f/images/bit-metagenomics-overview.pdf


--------------------------------------------------------------------------------
/images/bit-metagenomics-overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AstrobioMike/bit/c050fc3a649225c6f7a9a39e6e28f68831243b8f/images/bit-metagenomics-overview.png


--------------------------------------------------------------------------------
/test-data/ez-screen-assembly.fasta:
--------------------------------------------------------------------------------
 1 | >partial-NC_003131.1
 2 | CCTTAATTTCCGCGAGGATGACCTCGCTATTCCAGACATTCTCTGCCAGGCGCATGTCGATGTAGTCCAT
 3 | AAACGGTTTCAGCTTAACCATTTTGTGGCGAGTCTTTCTGGCTGGCGGTTCAGGGTATTTGAGGTAGCGT
 4 | CTGACAGTTCGTTCAGAGCAACCCACCTGAGTCGCAATATCGATAATGTACGCCCCCTGCTGGCGCATTT
 5 | GCTTTATCATGTAAAAGTCCTCTCTGCTCAGCATGTTGATGTCCTTTCTGGTGTGAGAACCTCAAGGAAA
 6 | CAACATGTTGGGTGGAGCGGACAATACTAATGGTGAATTACCGTCTTATATCACTGGCGCTAACACCGTG
 7 | AAGGGCTTCATGTTAATCATAAGCGCGTGTACCGGCTTTATCACCTCAGTGGCCTGGGCGTAAAACGCAG
 8 | AAGGCGTCGGAAAGGGCTGGCAACAGAACGTCTGCCGCTGCTCCGTCCGGCGGCGCCCAATCTGACCTGG
 9 | TCGATGGATTTCGTCATGGACGCATTGGCCACCGGTCGCAGGATAAAGTGCCTTACCTGCGTGGACGACT
10 | ACACGAAGGAATGCCTGACGGTCACTGTTGCCTTTGGGATTTCAGGCGTGCAGGTCACGCGTATTCTGGA
11 | CAGCATTGCGCTGTTTCGCGGCTATCCGGCGACGATAAGAACTGATCAGGGCCCGGAATTTACCTGCCGC
12 | GCGCTCGATCAATGGGCCTTTGAGCATGGCGTGGAACTGCGACTTATCCAGCCCGGCAAGCCGACACAGA
13 | ACGGATTTATTGAGAGTTTTAACGGACGCTTTCGCGATGAATGCCTGAATGAGCACTGGTTCAGTGACGT
14 | CAGTCATGCCAGGAAAACCATCAGTGAATGGCGTCAGGATTATAATGAGTGCCGCCCGCACTCTACGCTG
15 | AATTATCAGACGCCGTCTGAATTTGCGGCGGCCTGGAGAAAGGGTAATTCTGATAGTGAAGGATCCGACA
16 | TTACTAAGTGAGCGTTGTATCTAATCCTGGGGGCAGGTCATTCCGTATAATAAGGCAACAACCAAAAATC
17 | TACTCAACTAAATGACCGTGGTGGTGAGATTAGTGATGAGGTTTGTAGCCGTTCAGCCCCCTGCACCAGC
18 | ATCTCAAGCTGAGTATATAGTGAGTTATTATCCAGGCTGTTCAATGGTTGTCGATTCCATAACACTGGGT
19 | GCCCCCCAACCTCGTCCCAGGATAAGATGGGTTTTAATATATCTTGACTGAATATATTATGGCTAAGTAA
20 | GGTTTCCTTTTCATCATTATTGTCAAGAGAAGGTAGGGTAAACATTAATATTTGCCCGACAGGATGCTCT
21 | GTTATATGGCAGGCGAATTCCCCAACTTTGACACCGATAACCGGTTCAATAGTATCTGGAATAGACAACG
22 | AAAGTTGTTGAAATAATTGAGTGATAGCTTGTTCAAATGAATACATTATGATCTCATAATAGTTAGATAA
23 | AATATCAACTTAACCAAAGCACTCTCGGCAGACCATCAATTTTAGCCTATAATTTTTAGTTTTTGTTTTG
24 | TCTAATATAACAACAAAAACAGCAGCGATTTTTTATATAGCCATCGGCTATTTTCCCACTAAGATAACCT
25 | TGTTTTAATAGCCAAGGTAATAAATAGTCATGAAAATATCATCATTTATTTCTACATCACTGCCCCTGCC
26 | GACATCTGTGTCAGGATCTAGCAGCGTAGGAGAAATGTCTGGGCGCTCAGTCTCACAGCAAACAAGTGAT
27 | CAATATGCAAACAATCTGGCCGGGCGCACTGAAAGCCCTCAGGGTTCCAGCTTAGCCAGCCGTATCATTG
28 | AGAGGTTATCATCAGTGGCCCACTCTGTGATTGGGTTTATCCAACGCATGTTCTCGGAGGGGAGCCATAA
29 | ACCGGTGGTGACACCAGCACCCACACCTGCACAAATGCCAAGTCCTACGTCTTTCAGTGACAGTATCAAG
30 | CAACTTGCTGCTGAGACGCTGCCAAAATACATGCAGCAGTTGAATAGCTTGGATGCAGAGATGCTGCAGA
31 | AAAATCATGATCAGTTCGCTACGGGCAGCGGCCCTCTTCGTGGCAGTATCACTCAATGCCAAGGGCTGAT
32 | GCAGTTTTGTGGTGGGGAATTGCAAGCTGAGGCCAGTGCCATCTTAAACACGCCTGTTTGTGGTATTCCC
33 | TTCTCGCAGTGGGGAACTATTGGTGGGGCGGCCAGCGCGTACGTCGCCAGTGGCGTTGATCTAACGCAGG
34 | CAGCAAATGAGATCAAAGGGCTGGCGCAACAGATGCAGAAATTACTGTCATTGATGTGATATGGATAAAA
35 | ACAAGGGGATAGTGTTTCCCCCTTTTTCTATCAATATTGCGAATATCTTCGTCCCTGATCTTTCAGGGGC
36 | GAATCGTTTTTTAGCATGCTCATTGTTAGAATTTCTGACTTATCTCTCTTCTGTATTACTACTCATGCTC
37 | TGGAAAATCCTGAACATCTATATCTATGGATTGATGCAGCACTCGAGAAATCAAAATATCATTGCTAAGC
38 | GTTATATAGTATATACCGTGCTTTTTATACTGAAAACGGCGAATATCAGAGCAAATCCAGTTACACTCAG
39 | CCCCTAACTCTGGATTTTTAGCTAATAGCTCGAATACCTTTGCCAAGTTCTCATGGTATAACTTAGCCTG
40 | AGTCACACCGAAATGCCGGATAGTATAACTGGCAATATTATAAATATCCTCATCAGCTAGTTCAGACAGT
41 | TTATACACTAGTATCTTTCACCGCAGCAGAAAAAATCTCATCCATTAAACGATGGCTCACAGGTACATTT
42 | GTTCCTGCAAGCACCATATCGCGTACATGTTGAACACGCTGTTCACGTGCTTCCATTAGCCTTAAGGCAT
43 | CACGAAGCACTTCTGATATATTGCCATAACGACCAGACTGAATCATTTCCCCCACAAAACCTGTCAAATG
44 | CTCTCCAAGTGTTACGCTGGTTACGTGAGCCATATCCCCTCCGTTATGTATTACTGAGTAATACAATTAT
45 | 


--------------------------------------------------------------------------------
/test-data/ez-screen-targets.fasta:
--------------------------------------------------------------------------------
 1 | >yopE
 2 | ATGAAAATATCATCATTTATTTCTACATCACTGCCCCTGCCGACATCTGTGTCAGGATCTAGCAGCGTAG
 3 | GAGAAATGTCTGGGCGCTCAGTCTCACAGCAAACAAGTGATCAATATGCAAACAATCTGGCCGGGCGCAC
 4 | TGAAAGCCCTCAGGGTTCCAGCTTAGCCAGCCGTATCATTGAGAGGTTATCATCAGTGGCCCACTCTGTG
 5 | ATTGGGTTTATCCAACGCATGTTCTCGGAGGGGAGCCATAAACCGGTGGTGACACCAGCACCCACACCTG
 6 | CACAAATGCCAAGTCCTACGTCTTTCAGTGACAGTATCAAGCAACTTGCTGCTGAGACGCTGCCAAAATA
 7 | CATGCAGCAGTTGAATAGCTTGGATGCAGAGATGCTGCAGAAAAATCATGATCAGTTCGCTACGGGCAGC
 8 | GGCCCTCTTCGTGGCAGTATCACTCAATGCCAAGGGCTGATGCAGTTTTGTGGTGGGGAATTGCAAGCTG
 9 | AGGCCAGTGCCATCTTAAACACGCCTGTTTGTGGTATTCCCTTCTCGCAGTGGGGAACTATTGGTGGGGC
10 | GGCCAGCGCGTACGTCGCCAGTGGCGTTGATCTAACGCAGGCAGCAAATGAGATCAAAGGGCTGGCGCAA
11 | CAGATGCAGAAATTACTGTCATTGATGTGA
12 | >yopK
13 | ATGTTTATTAAAGATACTTATAACATGCGTGCTTTATGTACCGCTCTTGAACAGTCGGCTCCTGATACAA
14 | TAATAAATACATCTAAAGAAGAAAATAACAGTTACTACTGCGCTACTGCTCATTTACTGAGAACGGATGT
15 | TTGTTCATTGGTCAATAGAGTAGGGATTGAACCACTTAAAAGTGGATCAATATTATCTACTTTAGAAGAG
16 | TTATGGCAGGCTGTTGGTATAGTATATCGCTTATACGAATGGCAACATGTCAGCGATATTGACACCAATT
17 | TTAAGAAACTACCCAATAATTCTGATTTTGGTCTTGTGTTTTCTGTATTAGATTGTGATATAGAGTATGT
18 | GTTCATAGGGAAAAAAGACAGTGAAGGGAATATAGAATTTTATGATCCGAAAAACTCTCTACTTATAGAG
19 | AATGATGACATAAAAAAATATTTATATGATGAAGATTTTCATCGTTTTTGTATTATGCTGATCATCTCTA
20 | AATCTGAGTTGGAGGAATTGAGTCGCGAATCCTGCGATCAAGAATGTATTATGGGATGA
21 | 


--------------------------------------------------------------------------------
/test-data/kraken-example-out.tsv:
--------------------------------------------------------------------------------
 1 | U	A00159:145:H75T2DMXX:1:1101:7735:13792	unclassified (taxid 0)	16	0:0
 2 | U	A00159:145:H75T2DMXX:1:1101:11216:13557	unclassified (taxid 0)	30	0:0
 3 | U	A00159:145:H75T2DMXX:1:1101:22688:14074	unclassified (taxid 0)	26	0:0
 4 | U	A00159:145:H75T2DMXX:1:1101:1325:14559	unclassified (taxid 0)	31	0:0
 5 | U	A00159:145:H75T2DMXX:1:1101:23719:15013	unclassified (taxid 0)	30	0:0
 6 | C	A00159:145:H75T2DMXX:1:1102:11388:8312	Ochrobactrum (taxid 528)	194	0:12 1224:16 28211:7 528:15
 7 | U	A00159:145:H75T2DMXX:1:1102:15465:8390	unclassified (taxid 0)	27	0:0
 8 | U	A00159:145:H75T2DMXX:1:1102:6343:7560	unclassified (taxid 0)	271	0:237
 9 | U	A00159:145:H75T2DMXX:1:1102:30101:11600	unclassified (taxid 0)	26	0:0
10 | U	A00159:145:H75T2DMXX:1:1101:19678:2221	unclassified (taxid 0)	279	0:245
11 | 


--------------------------------------------------------------------------------
/workflows/genome-summarize-wf/README.md:
--------------------------------------------------------------------------------
 1 | # [bit](https://github.com/AstrobioMike/bit) genome-summarize workflow
 2 | This is a [snakemake](https://snakemake.github.io/) workflow for generating and combining genome assembly stats, quality estimates, and taxonomy info. Inputs are fasta files of genome assemblies. For all workflows available with _bit_, see [here](https://github.com/AstrobioMike/bit?tab=readme-ov-file#workflows).
 3 | 
 4 | ---
 5 | 
 6 | * [**Overview**](#overview)
 7 | * [**Usage**](#usage)
 8 |   * [Retrieving the workflow](#retrieving-the-workflow)
 9 |   * [Modifying the config.yaml](#modifying-the-configyaml)
10 |   * [Running the workflow](#running-the-workflow)
11 | * [**Version info**](#version-info)
12 | 
13 | ---
14 | 
15 | ## Overview
16 | 
17 | This workflow will summarize input genome assemblies, estimate quality, and assign taxonomy via the following programs:
18 | 
19 |   - [bit](https://github.com/AstrobioMike/bit#bioinformatics-tools-bit) for generating assembly summary stats
20 |   - [checkm2](https://github.com/chklovski/CheckM2#checkm2) for estimating quality of bacteria/archaea
21 |   - [GTDB-tk](https://github.com/Ecogenomics/GTDBTk#gtdb-tk) for assigning taxonomy of bacteria/archaea
22 |   - [eukcc](https://github.com/Finn-Lab/EukCC#eukcc) for estimating quality of eukarya
23 |   - [CAT](https://github.com/dutilh/CAT#cat-and-bat) with the NCBI nr database for assigning taxonomy of eukarya
24 | 
25 | It ultimately produce an output table like this:
26 | 
27 | ```bash
28 | Assembly        Total contigs  Total length  Ambiguous characters  GC content  Maximum contig length  Minimum contig length  N50        L50  Est. Completeness (%)  Est. Redundancy (%)  Domain    Phylum          Class                Order             Family             Genus           Species
29 | input_genome_A  1              5,276,633     0                     61.17       5,276,633              5,276,633              5,276,633  1    99.99                  0.96                 Bacteria  Proteobacteria  Gammaproteobacteria  Pseudomonadales   Pseudomonadaceae   Pseudomonas_E   Pseudomonas_E fulva
30 | input_MAG_B     4              2,702,105     0                     33.15       2,601,030              30,881                 2,601,030  1    90.22                  2.78                 Bacteria  Firmicutes      Bacilli              Staphylococcales  Staphylococcaceae  Staphylococcus  Staphylococcus saprophyticus
31 | ```
32 | 
33 | All required databases will be setup by the workflow if they don't exist already whenever they are used for the first time.
34 | 
35 | ---
36 | 
37 | ## Usage
38 | _bit_ should be installed via conda as described [here](https://github.com/AstrobioMike/bit?tab=readme-ov-file#conda-install).
39 | 
40 | ### Retrieving the worklfow
41 | 
42 | ```bash
43 | bit-get-workflow genome-summarize
44 | ```
45 | 
46 | ### Modifying the config.yaml
47 | Before running it, you first need to set some variables in the config.yaml file (there are notes in there). 
48 | 
49 | The workflow cannot currently run on a mix of input bacteria/archaea genomes and eukaryotic genomes, it can only run on bacteria/archaea by themselves, or eukarya by themselves (as set by a parameter in the config.yaml file).
50 | 
51 | In the config.yaml file, you mostly just need to point to where the input fasta files are, specify what their extensions are, where the reference databases are stored (or where you want them to go if this is the first time running the workflow), and then you can run the snakemake workflow as exemplified below. 
52 | 
53 | ### Running the workflow
54 | After variables are set in the config.yaml, here's an example of how it could be run (note that it should still be run inside the _bit_ conda environment):
55 |  
56 | ```bash
57 | snakemake --use-conda --conda-prefix ${CONDA_PREFIX}/envs -j 4 -p
58 | ```
59 | 
60 | - `--use-conda` – this specifies to use the conda environments included in the workflow
61 | - `--conda-prefix` – this allows us to point to where the needed conda environments should be stored. Including this means if we use the workflow on a different dataset somewhere else in the future, it will re-use the same conda environments rather than make new ones. The value listed here, `${CONDA_PREFIX}/envs`, is the default location for conda environments (the variable `${CONDA_PREFIX}` will be expanded to the appropriate location on whichever system it is run on).
62 | - `-j` – this lets us set how many jobs Snakemake should run concurrently (keep in mind that many of the thread and cpu parameters set in the config.yaml file will be multiplied by this)
63 | - `-p` – specifies to print out each command being run to the screen
64 | 
65 | See `snakemake -h` for more options and details.
66 | 
67 | ---
68 | 
69 | ## Version info
70 | Note that the workflows are version independently of the _bit_ package. When you pull one with `bit-get-workflow`, the directory name will have the version, and it is also listed at the top of the Snakefile.
71 | 
72 | All versions of programs used can be found in their corresponding conda yaml file in the envs/ directory. 
73 | 


--------------------------------------------------------------------------------
/workflows/genome-summarize-wf/config.yaml:
--------------------------------------------------------------------------------
  1 | ###################################################################################################
  2 | ## Config file for the "bit" genome summarize workflow.                                          ##
  3 | ## bit: https://github.com/AstrobioMike/bit                                                      ##
  4 | ##                                                                                               ##
  5 | ## If you use this workflow in a publication, please consider citing :)                          ##
  6 | ##   Lee M. bit: a multipurpose collection of bioinformatics tools. F1000Research 2022, 11:122.  ##
  7 | ##   https://doi.org/10.12688/f1000research.79530.1                                              ##
  8 | ###################################################################################################
  9 | 
 10 | 
 11 | # this is just a prefix for the final output table
 12 | output_prefix:
 13 |     "Output"
 14 | 
 15 | # set to True, with no quotes, if genomes are from eukarya, 
 16 | # set to False, with no quotes if genomes bacteria/archaea
 17 | is_euk:
 18 |     False
 19 | 
 20 | # path to where the genomes are located
 21 | genomes_dir:
 22 |     "../genomes"
 23 | 
 24 | # extension the fasta files have (must not be gzipped as currently written; include the period preceding, e.g., ".fasta", ".fna", ".fa")
 25 |     # gzip-compressed not accepted currently
 26 | assembly_extension:
 27 |     ".fasta"
 28 | 
 29 | ## reference database locations
 30 | # these should be full paths to the directories that will hold the databases (more info below)
 31 | CHECKM2_DATA_PATH:
 32 |     "/checkm2-ref-dir"
 33 | 
 34 | GTDB_DATA_PATH:
 35 |     "/GTDB-tk-ref-dir"
 36 | 
 37 | DIR_HOLDING_CAT_DIR:
 38 |     "/dir-holding-CAT-ref-dir"
 39 |     # actual directory name of CAT DB is below
 40 | 
 41 | DIR_HOLDING_eukcc_DIR:
 42 |     "/dir-holding-eukcc-db-dir"
 43 |     # actual directory name of eukcc db is below
 44 | 
 45 | ## number of threads or cpus (depending on how the program labeled them) to use per snakemake job (set with the -j parameter to the snakemake call)
 46 |     # passed to eukcc, CAT, checkm2, gtdb-tk
 47 | threads:
 48 |     20
 49 | 
 50 | ## number of cpus used by pplacer by gtdb-tk
 51 | # pplacer can have issues with memory with multiple cpus; see e.g. https://ecogenomics.github.io/GTDBTk/faq.html#gtdb-tk-reaches-the-memory-limit-pplacer-crashes
 52 | gtdb_tk_pplacer_cpus:
 53 |     4
 54 | 
 55 | logs_dir:
 56 |     "logs/"
 57 | 
 58 | checkm2_output_dir:
 59 |     "checkm2-output/"
 60 | 
 61 | gtdbtk_output_dir:
 62 |     "gtdb-tk-output/"
 63 | 
 64 | ## keep all files?
 65 | # set this to "YES" (all caps needed) if wanting to keep all produced files by all programs, anything else here
 66 | # means all be deleted upon completion except for the primary output summary table
 67 | keep_all_files:
 68 |     "no"
 69 | 
 70 | ################################################################################################################
 71 | ##### Resource specifications that may need to be changed (mostly only necessary if using a job scheduler) #####
 72 | ####### Could leave these as-is to start, but they are here to be increased if a job fails due to memory #######
 73 | ################################################################################################################
 74 | 
 75 | ### these are all passed in the "resources" directive of their respective rules in the Snakefile, going to
 76 |     # the "mem_mb" argument (so should be provided in terms of megabytes)
 77 | 
 78 | # passed to rule gtdbtk_classify
 79 | gtdbtk_memory_resources:
 80 |     100000
 81 | 
 82 | # passed to rule run_checkm2
 83 | checkm2_memory_resources:
 84 |     50000
 85 | 
 86 | # passed to the run_CAT rule
 87 | CAT_memory_resources:
 88 |     40000
 89 | 
 90 | # passed to the run_eukcc rule
 91 | eukcc_memory_resources:
 92 |     50000
 93 | 
 94 | #######################################################
 95 | ################# REFERENCE DATABASES #################
 96 | #######################################################
 97 | # The workflow will check the locations specified above for the corresponding refernence databases, 
 98 | # and install them if they are not already there. It looks for the below "TRIGGER" filenames (they 
 99 | # all end with "*_DB_SETUP") in the directory of each database, which it creates when
100 | # it sets them up initially. 
101 | # If we want to point to DBs that already exist on our setup, that were not prepared by this workflow,
102 | # we need to add these (empty) "TRIGGER" files to their respective directories. The
103 | # workflow just checks the file is there to know it doesn't need to setup the DB. This might tricky
104 | # to figure out, and easiest would be to let the workflow do it so all DB versions match the program
105 | # versions for sure. 
106 | 
107 | # there are some database filenames coded below that are noted as things that
108 | # we likely shouldn't change, so leave those unless you are sure you want to change them
109 | 
110 | ### checkm2 ###
111 | # likely shouldn't change
112 | CHECKM2_DB_FILENAME:
113 |     "uniref100.KO.1.dmnd"
114 | 
115 | # likely shouldn't change
116 | CHECKM2_TRIGGER_FILE:
117 |     "CHECKM2_DB_SETUP"
118 | 
119 | ### gtdb-tk ###
120 | # likely shouldn't change
121 | GTDB_TRIGGER_FILE:
122 |     "GTDBTK_DB_SETUP"
123 | 
124 | ### CAT ###
125 | # likely shouldn't change all of the below
126 | CAT_DIR:
127 |     "CAT_prepare_20210107"
128 | CAT_DB:
129 |     "2021-01-07_CAT_database"
130 | CAT_TAX:
131 |     "2021-01-07_taxonomy"
132 | CAT_DL_FILE:
133 |     "CAT_prepare_20210107.tar.gz"
134 | CAT_DL_LINK:
135 |     "https://tbb.bio.uu.nl/bastiaan/CAT_prepare/CAT_prepare_20210107.tar.gz"
136 | CAT_COMPRESSED_NR_FAA:
137 |     "2021-01-07.nr.gz"
138 | CAT_TRIGGER_FILE:
139 |     "CAT_DB_SETUP"
140 | 
141 | ### eukcc ###
142 | # likely shouldn't change all below
143 | eukcc_db_dir:
144 |     "eukcc2_db_ver_1.1"
145 | eukcc_DL_FILE:
146 |     "eukcc2_db_ver_1.1.tar.gz"
147 | eukcc_DL_LINK:
148 |     "http://ftp.ebi.ac.uk/pub/databases/metagenomics/eukcc/eukcc2_db_ver_1.1.tar.gz"
149 | eukcc_TRIGGER_FILE:
150 |     "eukcc_DB_SETUP"
151 | 
152 | ## example usage command ##
153 | # snakemake --use-conda --conda-prefix ${CONDA_PREFIX}/envs -j 4 -p
154 | 
155 | # `--use-conda` – this specifies to use the conda environments included in the workflow
156 | # `--conda-prefix` – this allows us to point to where the needed conda environments should be stored. Including this means if we use the workflow on a different dataset somewhere else in the future, it will re-use the same conda environments rather than make new ones. The value listed here, `${CONDA_PREFIX}/envs`, is the default location for conda environments (the variable `${CONDA_PREFIX}` will be expanded to the appropriate location on whichever system it is run on).
157 | # `-j` – this lets us set how many jobs Snakemake should run concurrently (keep in mind that many of the thread and cpu parameters set in the config.yaml file will be multiplied by this)
158 | # `-p` – specifies to print out each command being run to the screen
159 | 
160 | # See `snakemake -h` for more options and details.
161 | 


--------------------------------------------------------------------------------
/workflows/genome-summarize-wf/envs/bit.yaml:
--------------------------------------------------------------------------------
1 | name: bit
2 | channels:
3 |     - conda-forge
4 |     - bioconda
5 |     - defaults
6 |     - astrobiomike
7 | dependencies:
8 |     - bit=1.8.65
9 | 


--------------------------------------------------------------------------------
/workflows/genome-summarize-wf/envs/cat.yaml:
--------------------------------------------------------------------------------
1 | name: cat
2 | channels:
3 |     - conda-forge
4 |     - bioconda
5 |     - defaults
6 | dependencies:
7 |     - cat=5.2.2
8 | 


--------------------------------------------------------------------------------
/workflows/genome-summarize-wf/envs/checkm2.yaml:
--------------------------------------------------------------------------------
1 | name: checkm2
2 | channels:
3 |     - conda-forge
4 |     - bioconda
5 |     - defaults
6 | dependencies:
7 |     - checkm2=1.0.1
8 | 


--------------------------------------------------------------------------------
/workflows/genome-summarize-wf/envs/eukcc.yaml:
--------------------------------------------------------------------------------
1 | name: eukcc
2 | channels:
3 |     - conda-forge
4 |     - bioconda
5 |     - defaults
6 | dependencies:
7 |     - eukcc=2.1.0
8 | 


--------------------------------------------------------------------------------
/workflows/genome-summarize-wf/envs/gtdb-tk.yaml:
--------------------------------------------------------------------------------
1 | name: gtdb-tk
2 | channels:
3 |     - conda-forge
4 |     - bioconda
5 |     - defaults
6 | dependencies:
7 |     - gtdbtk=2.4.0
8 |     - numpy=1.23.1
9 | 


--------------------------------------------------------------------------------
/workflows/genome-summarize-wf/scripts/combine-euk-outputs.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import pandas as pd
 4 | import re
 5 | import argparse
 6 | import sys
 7 | 
 8 | ## contact: Michael D. Lee (Mike.Lee@nasa.gov)
 9 | 
10 | parser = argparse.ArgumentParser(description='This script combines the outputs in our GeneLab genome standard processing.')
11 | 
12 | required = parser.add_argument_group('required arguments')
13 | 
14 | required.add_argument("-s", "--input-summary-tsv", help="Input assembly summary stats file", action="store", required=True)
15 | required.add_argument("-c", "--input-eukcc-tsv", help="Input eukcc summaries", action="store", required=True)
16 | required.add_argument("-t", "--input-tax-tsv", help="Input CAT taxonomies", action="store", required=True)
17 | 
18 | parser.add_argument("-o", "--output-tsv", help='Output table filename (default: "Genomes-summaries.tsv")', action="store", default="Genome-summaries.tsv")
19 | 
20 | args = parser.parse_args()
21 | 
22 | if len(sys.argv)==1:
23 |   parser.print_help(sys.stderr)
24 |   sys.exit(0)
25 | 
26 | # reading in summary stats
27 | stats_df = pd.read_csv(args.input_summary_tsv, sep="\t", index_col=0)
28 | 
29 | # slimming down to those we want
30 | wanted_summary_stats = ["Total contigs", "Total length", "Ambiguous characters", "GC content", "Maximum contig length", "Minimum contig length", "N50", "L50"]
31 | wanted_stats_df = stats_df.loc[wanted_summary_stats, ]
32 | 
33 | # transposing
34 | trans_df = wanted_stats_df.T
35 | 
36 | ## for the life of me i can't figure out how to do this the easy way, but formatting the numbers
37 | trans_df["Total contigs"] = trans_df["Total contigs"].map('{:,.0f}'.format)
38 | trans_df["Total length"] = trans_df["Total length"].map('{:,.0f}'.format)
39 | trans_df["Ambiguous characters"] = trans_df["Ambiguous characters"].map('{:,.0f}'.format)
40 | trans_df["Maximum contig length"] = trans_df["Maximum contig length"].map('{:,.0f}'.format)
41 | trans_df["Minimum contig length"] = trans_df["Minimum contig length"].map('{:,.0f}'.format)
42 | trans_df["N50"] = trans_df["N50"].map('{:,.0f}'.format)
43 | trans_df["L50"] = trans_df["L50"].map('{:,.0f}'.format)
44 | 
45 | # reading in checkm results
46 | eukcc_df = pd.read_csv(args.input_eukcc_tsv, sep="\t", index_col=0)
47 | 
48 | # slimming down to those we want
49 | wanted_eukcc_cols = ["Est. Comp.", "Est. Redund."]
50 | eukcc_df = eukcc_df.loc[:, wanted_eukcc_cols]
51 | 
52 | # renaming columns
53 | eukcc_df.columns = ["Est. Completeness (%)", "Est. Redundancy (%)"]
54 | eukcc_df.index.names = ["Assembly"]
55 | 
56 | # merging those two
57 | combined_df = trans_df.merge(eukcc_df, left_index=True, right_index=True)
58 | 
59 | # creating a dictionary to hold lineage info from CAT
60 | tax_dict = {}
61 | 
62 | ranks = ["Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"]
63 | 
64 | # iterating through that input file
65 | with open(args.input_tax_tsv, "r") as tax:
66 |     for line in tax:
67 |         if line.strip().startswith("Assembly"):
68 |             continue
69 | 
70 |         line = line.strip().split("\t")
71 |         ID = line[0]
72 |         tax_list = line[1:8]
73 | 
74 |         curr_dict = dict(zip(iter(ranks), iter(tax_list)))
75 | 
76 |         tax_dict[ID] = curr_dict
77 | 
78 | # creating dataframe from our tax dictionary
79 | tax_df = pd.DataFrame.from_dict(tax_dict, orient="index")
80 | tax_df.index.names = ["Assembly"]
81 | 
82 | # merging with summary stats table
83 | final_df = combined_df.merge(tax_df, left_index=True, right_index=True)
84 | final_df.index.names = ["Assembly"]
85 | 
86 | # changing empties to "Not Assigned"
87 | final_df.replace({"": "Not Assigned"}, inplace=True)
88 | 
89 | # writing out
90 | with open(args.output_tsv, "w") as out:
91 |     out.write(final_df.to_csv(index=True, sep="\t"))
92 | 


--------------------------------------------------------------------------------
/workflows/genome-summarize-wf/scripts/combine-outputs.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import pandas as pd
 4 | import re
 5 | import argparse
 6 | import sys
 7 | 
 8 | ## contact: Michael D. Lee (Mike.Lee@nasa.gov)
 9 | 
10 | parser = argparse.ArgumentParser(description = 'This script combines the taxonomic classification, quality estimates, and summary stats into one table.')
11 | 
12 | required = parser.add_argument_group('required arguments')
13 | 
14 | required.add_argument("-s", "--input-summary-tsv", help = "Input assembly summary stats file from bit", action = "store", required = True)
15 | required.add_argument("-c", "--input-checkm2-tsv", help = "Input summary from checkm2", action = "store", required = True)
16 | required.add_argument("-t", "--input-tax-tsv", help = "Input slimmed results from GTDB-tk", action = "store", required = True)
17 | 
18 | parser.add_argument("-o", "--output-tsv", help = 'Output table filename (default: "Genomes-summaries.tsv")', action = "store", default = "Genome-summaries.tsv")
19 | 
20 | args = parser.parse_args()
21 | 
22 | if len(sys.argv)==1:
23 |     parser.print_help(sys.stderr)
24 |     sys.exit(0)
25 | 
26 | # reading in summary stats
27 | stats_df = pd.read_csv(args.input_summary_tsv, sep = "\t", index_col = 0)
28 | 
29 | # slimming down to those we want
30 | wanted_summary_stats = ["Total contigs", "Total length", "Ambiguous characters", "GC content", "Maximum contig length", "Minimum contig length", "N50", "L50"]
31 | wanted_stats_df = stats_df.loc[wanted_summary_stats, ]
32 | 
33 | # transposing
34 | trans_df = wanted_stats_df.T
35 | 
36 | ## for the life of me i can't figure out how to do this the easy way right now, but formatting the numbers here
37 | trans_df["Total contigs"] = trans_df["Total contigs"].map('{:,.0f}'.format)
38 | trans_df["Total length"] = trans_df["Total length"].map('{:,.0f}'.format)
39 | trans_df["Ambiguous characters"] = trans_df["Ambiguous characters"].map('{:,.0f}'.format)
40 | trans_df["Maximum contig length"] = trans_df["Maximum contig length"].map('{:,.0f}'.format)
41 | trans_df["Minimum contig length"] = trans_df["Minimum contig length"].map('{:,.0f}'.format)
42 | trans_df["N50"] = trans_df["N50"].map('{:,.0f}'.format)
43 | trans_df["L50"] = trans_df["L50"].map('{:,.0f}'.format)
44 | 
45 | # reading in checkm2 results
46 | checkm2_df = pd.read_csv(args.input_checkm2_tsv, sep = "\t", index_col = 0)
47 | 
48 | # slimming down to those we want
49 | wanted_checkm_cols = ["Completeness", "Contamination"]
50 | checkm2_df = checkm2_df.loc[:, wanted_checkm_cols]
51 | 
52 | # renaming columns
53 | checkm2_df.columns = ["Est. Completeness (%)", "Est. Redundancy (%)"]
54 | checkm2_df.index.names = ["Assembly"]
55 | 
56 | # merging those two
57 | combined_df = trans_df.merge(checkm2_df, left_index = True, right_index = True)
58 | 
59 | # creating a dictionary to hold lineage info from gtdb-tk
60 | tax_dict = {}
61 | 
62 | ranks = ["Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"]
63 | 
64 | # iterating through that input file
65 | with open(args.input_tax_tsv, "r") as tax:
66 |     for line in tax:
67 |         if line.strip().startswith("user_genome"):
68 |             continue
69 | 
70 |         line = line.strip().split("\t")
71 |         ID = line[0]
72 |         tax_str = line[1].replace(";", "")
73 | 
74 |         tax_list = re.split(".?__", tax_str)[1:8]
75 | 
76 |         # handling if nothing was at all classified
77 |         if not tax_list:
78 |             
79 |             tax_list = ["Not Assigned"] * 7
80 | 
81 |         curr_dict = dict(zip(iter(ranks), iter(tax_list)))
82 | 
83 |         tax_dict[ID] = curr_dict
84 | 
85 | # creating dataframe from our tax dictionary
86 | tax_df = pd.DataFrame.from_dict(tax_dict, orient = "index")
87 | tax_df.index.names = ["Assembly"]
88 | 
89 | # merging with summary stats table
90 | final_df = combined_df.merge(tax_df, left_index = True, right_index = True)
91 | final_df.index.names = ["Assembly"]
92 | 
93 | # changing empties to "Not Assigned"
94 | final_df.replace({"": "Not Assigned"}, inplace = True)
95 | 
96 | # writing out
97 | with open(args.output_tsv, "w") as out:
98 |     out.write(final_df.to_csv(index = True, sep = "\t"))
99 | 


--------------------------------------------------------------------------------
/workflows/genome-summarize-wf/scripts/slurm-status.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | import subprocess
 3 | import sys
 4 | 
 5 | jobid = sys.argv[1]
 6 | 
 7 | # if wanting to use, this should be added to the snakemake call from the root workflow dir: `--cluster-status scripts/slurm-status.py`
 8 | 
 9 | output = str(subprocess.check_output("sacct -j %s --format State --noheader | head -1 | awk '{print $1}'" % jobid, shell=True).strip())
10 | 
11 | running_status=["PENDING", "CONFIGURING", "COMPLETING", "RUNNING", "SUSPENDED"]
12 | if "COMPLETED" in output:
13 |     print("success")
14 | elif any(r in output for r in running_status):
15 |     print("running")
16 | else:
17 |     print("failed")
18 | 


--------------------------------------------------------------------------------
/workflows/metagenomics-wf/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | # Metagenomics workflow change log
 2 | 
 3 | ## 1.0.2
 4 | - pinned specific version of diamond (2.0.6) to the CAT environment
 5 | 
 6 | ## 1.0.1
 7 | - can optionally skip binning and MAG recovery and characterization with new option in config.yaml, "perform_binning_and_MAG_recovery"
 8 | 
 9 | ## 1.0.0
10 | - initial workflow release
11 | 


--------------------------------------------------------------------------------
/workflows/metagenomics-wf/config/multiqc.config:
--------------------------------------------------------------------------------
1 | extra_fn_clean_exts:
2 |   - "_raw"
3 |   - "_HRremoved_raw"
4 |   - "_filtered"
5 | 
6 | show_analysis_paths: False
7 | show_analysis_time: False
8 | 


--------------------------------------------------------------------------------
/workflows/metagenomics-wf/envs/bit.yaml:
--------------------------------------------------------------------------------
 1 | name: bit
 2 | channels:
 3 |     - conda-forge
 4 |     - bioconda
 5 |     - defaults
 6 |     - astrobiomike
 7 | dependencies:
 8 |     - bit=1.8.65
 9 |     - numpy=1.26.4
10 | 


--------------------------------------------------------------------------------
/workflows/metagenomics-wf/envs/cat.yaml:
--------------------------------------------------------------------------------
1 | name: cat
2 | channels:
3 |     - conda-forge
4 |     - bioconda
5 |     - defaults
6 | dependencies:
7 |     - cat=5.2.2
8 |     - diamond=2.0.6
9 | 


--------------------------------------------------------------------------------
/workflows/metagenomics-wf/envs/checkm2.yaml:
--------------------------------------------------------------------------------
1 | name: checkm2
2 | channels:
3 |     - conda-forge
4 |     - bioconda
5 |     - defaults
6 | dependencies:
7 |     - checkm2=1.0.1
8 | 


--------------------------------------------------------------------------------
/workflows/metagenomics-wf/envs/gtdb-tk.yaml:
--------------------------------------------------------------------------------
1 | name: gtdb-tk
2 | channels:
3 |     - conda-forge
4 |     - bioconda
5 |     - defaults
6 | dependencies:
7 |     - gtdbtk=2.4.0
8 |     - numpy=1.23.1
9 | 


--------------------------------------------------------------------------------
/workflows/metagenomics-wf/envs/keggdecoder.yaml:
--------------------------------------------------------------------------------
1 | name: keggdecoder
2 | channels:
3 |     - conda-forge
4 | dependencies:
5 |     - python=3.6
6 |     - pip
7 |     - pip:
8 |         - KEGGDecoder==1.2.2
9 | 


--------------------------------------------------------------------------------
/workflows/metagenomics-wf/envs/kofamscan.yaml:
--------------------------------------------------------------------------------
 1 | name: kofamscan
 2 | channels:
 3 |     - conda-forge
 4 |     - bioconda
 5 |     - defaults
 6 |     - astrobiomike
 7 | dependencies:
 8 |     - kofamscan=1.3.0
 9 |     - hmmer=3.3.0
10 |     - bit=1.8.65
11 | 


--------------------------------------------------------------------------------
/workflows/metagenomics-wf/envs/mapping.yaml:
--------------------------------------------------------------------------------
 1 | name: mapping
 2 | channels:
 3 |     - conda-forge
 4 |     - bioconda
 5 |     - defaults
 6 | dependencies:
 7 |     - bowtie2=2.3.5.1
 8 |     - tbb=2020.2
 9 |     - bbmap=38.86
10 |     - samtools=1.9
11 | 


--------------------------------------------------------------------------------
/workflows/metagenomics-wf/envs/megahit.yaml:
--------------------------------------------------------------------------------
 1 | name: megahit
 2 | channels:
 3 |     - conda-forge
 4 |     - bioconda
 5 |     - defaults
 6 |     - astrobiomike
 7 | dependencies:
 8 |     - megahit=1.2.9
 9 |     - bit=1.8.65
10 | 


--------------------------------------------------------------------------------
/workflows/metagenomics-wf/envs/metabat.yaml:
--------------------------------------------------------------------------------
1 | name: metabat
2 | channels:
3 |     - conda-forge
4 |     - bioconda
5 |     - defaults
6 | dependencies:
7 |     - metabat2=2.15
8 | 


--------------------------------------------------------------------------------
/workflows/metagenomics-wf/envs/prodigal.yaml:
--------------------------------------------------------------------------------
 1 | name: prodigal
 2 | channels:
 3 |     - conda-forge
 4 |     - bioconda
 5 |     - defaults
 6 |     - astrobiomike
 7 | dependencies:
 8 |     - prodigal=2.6.3
 9 |     - bit=1.8.65
10 | 


--------------------------------------------------------------------------------
/workflows/metagenomics-wf/envs/qc.yaml:
--------------------------------------------------------------------------------
 1 | name: qc
 2 | channels:
 3 |     - conda-forge
 4 |     - bioconda
 5 |     - defaults
 6 | dependencies:
 7 |     - fastqc=0.11.9
 8 |     - multiqc=1.11
 9 |     - bbmap=38.86
10 |     - zip=3.0
11 |     - python=3.8
12 | 


--------------------------------------------------------------------------------
/workflows/metagenomics-wf/scripts/combine-benchmarks.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | set -e
 3 | 
 4 | ls benchmarks/ > benchmark-filenames.tmp
 5 | 
 6 | head -n 1 benchmarks/$( head -n 1 benchmark-filenames.tmp ) > benchmark-header.tmp
 7 | 
 8 | paste <( printf "process" ) benchmark-header.tmp > building-tab.tmp
 9 | 
10 | for file in $(cat benchmark-filenames.tmp)
11 | do
12 | 
13 |     cat <( paste <( echo ${file} | sed 's/-benchmarks.tsv//' ) <( tail -n +2 benchmarks/${file} ) ) >> building-tab.tmp
14 | 
15 | done
16 | 
17 | mv building-tab.tmp benchmarks/ALL-benchmarks.tsv
18 | rm -rf benchmark-filenames.tmp benchmark-header.tmp
19 | 


--------------------------------------------------------------------------------
/workflows/metagenomics-wf/scripts/download-gtdbtk-refs.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | set -e
 3 | 
 4 | ### This is modified from the "download-db.sh" script that came with the conda install of gtdbtk v2.4.0.
 5 |     # The primary download site, link commented out below, had been consistently taking over a week to download for me and others.
 6 |     # So I added this to the workflow for now to pull from the mirror site (new DB_URL below).
 7 | 
 8 | # Configuration
 9 | N_FILES_IN_TAR=241860
10 | # DB_URL="https://data.gtdb.ecogenomic.org/releases/release220/220.0/auxillary_files/gtdbtk_package/full_package/gtdbtk_r220_data.tar.gz"
11 | DB_URL="https://data.ace.uq.edu.au/public/gtdb/data/releases/release220/220.0/auxillary_files/gtdbtk_package/full_package/gtdbtk_r220_data.tar.gz"
12 | TARGET_TAR_NAME="gtdbtk_r220_data.tar.gz"
13 | 
14 | # Script variables (no need to configure)
15 | TARGET_DIR=${1:-$GTDBTK_DATA_PATH}
16 | TARGET_TAR="${TARGET_DIR}/${TARGET_TAR_NAME}"
17 | 
18 | # Check if this is overriding an existing version
19 | mkdir -p "$TARGET_DIR"
20 | n_folders=$(find "$TARGET_DIR" -maxdepth 1 -type d | wc -l)
21 | if [ "$n_folders" -gt 1 ]; then
22 |   echo "[ERROR] - The GTDB-Tk database directory must be empty, please empty it: $TARGET_DIR"
23 |   exit 1
24 | fi
25 | 
26 | # Start the download process
27 | # Note: When this URL is updated, ensure that the "--total" flag of TQDM below is also updated
28 | echo "[INFO] - Downloading the GTDB-Tk database to: ${TARGET_DIR}"
29 | wget $DB_URL -O "$TARGET_TAR"
30 | 
31 | # Uncompress and pipe output to TQDM
32 | echo "[INFO] - Extracting archive..."
33 | tar xvzf "$TARGET_TAR" -C "${TARGET_DIR}" --strip 1 | tqdm --unit=file --total=$N_FILES_IN_TAR --smoothing=0.1 >/dev/null
34 | 
35 | # Remove the file after successful extraction
36 | rm "$TARGET_TAR"
37 | echo "[INFO] - The GTDB-Tk database has been successfully downloaded and extracted."
38 | 
39 | # Set the environment variable
40 | if conda env config vars set GTDBTK_DATA_PATH="$TARGET_DIR"; then
41 |   echo "[INFO] - Added GTDBTK_DATA_PATH ($TARGET_DIR) to the GTDB-Tk conda environment."
42 | else
43 |   echo "[INFO] - Conda not found in PATH, please be sure to set the GTDBTK_DATA_PATH envrionment variable"
44 |   echo "export GTDBTK_DATA_PATH=$TARGET_DIR before running GTDB-Tk. "
45 | fi
46 | 
47 | exit 0
48 | 


--------------------------------------------------------------------------------
/workflows/metagenomics-wf/scripts/format-contig-tax-classifications.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | awk -F $'\t' ' BEGIN { OFS = FS } { if ( $2 == "classification" ) { print $1,$4,$6,$7,$8,$9,$10,$11,$12 } \
4 |     else if ( $2 == "no taxid assigned" ) { print $1,"NA","NA","NA","NA","NA","NA","NA","NA" } \
5 |     else { n=split($4,lineage,";"); print $1,lineage[n],$6,$7,$8,$9,$10,$11,$12 } } ' ${1} \
6 |     | sed 's/no support/NA/g' | sed 's/superkingdom/domain/' | sed 's/^# contig/contig_ID/' | sed 's/lineage/taxid/' > ${2}
7 | 


--------------------------------------------------------------------------------
/workflows/metagenomics-wf/scripts/format-gene-tax-classifications.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 |  awk -F $'\t' ' BEGIN { OFS = FS } { if ( $3 == "lineage" ) { print $1,$3,$5,$6,$7,$8,$9,$10,$11 } \
4 |      else if ( $2 == "ORF has no hit to database" || $2 ~ /^no taxid found/ ) { print $1,"NA","NA","NA","NA","NA","NA","NA","NA" } \
5 |      else { n=split($3,lineage,";"); print $1,lineage[n],$5,$6,$7,$8,$9,$10,$11 } } ' ${1} \
6 |      | sed 's/no support/NA/g' | sed 's/superkingdom/domain/' | sed 's/# ORF/gene_ID/' | sed 's/lineage/taxid/' > ${2}
7 | 


--------------------------------------------------------------------------------
/workflows/metagenomics-wf/scripts/generate-assembly-based-overview-table.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | sample_IDs_file=${1}
 4 | assemblies_dir=${2}
 5 | genes_dir=${3}
 6 | mapping_dir=${4}
 7 | bins_dir=${5}
 8 | MAGs_dir=${6}
 9 | output=${7}
10 | 
11 | # starting output file
12 | printf "Sample_ID\tassembly_produced\tgene_calls_identified\tread_mapping_successful\tbins_recovered\tMAGs_recovered\n" > ${output}
13 | 
14 | # looping through all input files and generating columns for final table
15 | for sample in $(cat ${sample_IDs_file})
16 | do
17 | 
18 |     # checking assembly
19 |     if [ ! -s ${assemblies_dir}/${sample}-assembly.fasta ]; then
20 |         printf "No\n" >> assembly-status.tmp
21 | 
22 |         # removing empty output fasta
23 |         rm -rf ${assemblies_dir}/${sample}-assembly.fasta
24 | 
25 |     else
26 |         printf "Yes\n" >> assembly-status.tmp
27 |     fi
28 | 
29 |     # checking gene calls
30 |     if [ ! -s ${genes_dir}/${sample}-genes.faa ]; then
31 |         printf "No\n" >> genes-status.tmp
32 | 
33 |         # removing empty output files
34 |         rm -rf ${genes_dir}/${sample}-genes.faa ${genes_dir}/${sample}-genes.fasta ${genes_dir}/${sample}-genes.gff
35 | 
36 |     else
37 |         printf "Yes\n" >> genes-status.tmp
38 |     fi
39 | 
40 |     # checking read-mapping outputs
41 |     if [ ! -s ${mapping_dir}/${sample}.bam ]; then
42 |         printf "No\n" >> mapping-status.tmp
43 | 
44 |         # removing empty output files
45 |         rm -rf ${mapping_dir}/${sample}.bam ${mapping_dir}/${sample}-metabat-assembly-depth.tsv
46 | 
47 |     else
48 |         printf "Yes\n" >> mapping-status.tmp
49 |     fi
50 | 
51 |     # getting number of bins recovered if any produced
52 |     if compgen -G "${bins_dir}*.fasta" > /dev/null; then
53 |         num_bins=$(ls ${bins_dir}*.fasta | grep -c "${sample}-bin.[0-9]*.fasta")
54 |         printf "${num_bins}\n" >> bins-status.tmp
55 |     else
56 |         printf "0\n" >> bins-status.tmp
57 |     fi
58 | 
59 |     # getting number of MAGs recovered
60 |     if compgen -G "${MAGs_dir}*.fasta" >/dev/null; then
61 |         num_MAGs=$(ls ${MAGs_dir}*.fasta | grep -c "${sample}-MAG-[0-9]*.fasta")
62 |         printf "${num_MAGs}\n" >> MAGs-status.tmp
63 |     else
64 |         printf "0\n" >> MAGs-status.tmp
65 |     fi
66 | 
67 | done
68 | 
69 | # combining, adding to output file and removing intermediates
70 | cat <( paste ${sample_IDs_file} assembly-status.tmp \
71 |              genes-status.tmp mapping-status.tmp \
72 |              bins-status.tmp MAGs-status.tmp ) >> ${output}
73 | 
74 | rm assembly-status.tmp genes-status.tmp mapping-status.tmp bins-status.tmp MAGs-status.tmp
75 | 


--------------------------------------------------------------------------------
/workflows/metagenomics-wf/scripts/parse-MAG-annots.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import argparse
 4 | 
 5 | parser = argparse.ArgumentParser(description='This script does whatever it needs to do.')
 6 | 
 7 | required = parser.add_argument_group('required arguments')
 8 | 
 9 | required.add_argument("-i", "--input-tsv", help='no help for you, come back, 2 years!', action="store", required=True)
10 | required.add_argument("-w", "--wanted-things", help="what'd i tell you?", action="store", required=True)
11 | required.add_argument("-M", "--MAG-ID", action="store", required=True)
12 | 
13 | parser.add_argument("-o", "--output_tsv", help='Default: "out.tsv"', action="store", dest="output_tsv", default="out.tsv")
14 | 
15 | args = parser.parse_args()
16 | 
17 | targets_set = set(line.strip() for line in open(args.wanted_things))
18 | 
19 | out_tab = open(args.output_tsv, "a")
20 | 
21 | for line in open(args.input_tsv):
22 |     line = line.strip().split("\t")
23 |     if line[2] != "NA":
24 | 
25 |         # dropping last coding seq # field so matches contig ID
26 |         if line[0].rsplit('_', 1)[0] in targets_set:
27 | 
28 |             out_tab.write(str(args.MAG_ID) + "\t" + line[2] + "\n")
29 | 
30 | out_tab.close()
31 | 


--------------------------------------------------------------------------------
/workflows/metagenomics-wf/scripts/slurm-status.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | import subprocess
 3 | import sys
 4 | 
 5 | jobid = sys.argv[1]
 6 | 
 7 | # if wanting to use, this should be added to the snakemake call from the root workflow dir: `--cluster-status scripts/slurm-status.py`
 8 | 
 9 | output = str(subprocess.check_output("sacct -j %s --format State --noheader | head -1 | awk '{print $1}'" % jobid, shell=True).strip())
10 | 
11 | running_status=["PENDING", "CONFIGURING", "COMPLETING", "RUNNING", "SUSPENDED"]
12 | if "COMPLETED" in output:
13 |     print("success")
14 | elif any(r in output for r in running_status):
15 |     print("running")
16 | else:
17 |     print("failed")
18 | 


--------------------------------------------------------------------------------
/workflows/metagenomics-wf/scripts/swap-MAG-IDs.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import argparse
 4 | import pandas as pd
 5 | 
 6 | parser = argparse.ArgumentParser(description='This script swaps the MAG IDs back to what they were prior to running KEGGDecoder.')
 7 | 
 8 | required = parser.add_argument_group('required arguments')
 9 | 
10 | required.add_argument("-i", "--input-tsv", help='Output table from KEGGDecoder', action="store", required=True)
11 | required.add_argument("-m", "--map-tsv", help='Tab-delimited map with 1st column holding original name, and 2nd column holding modified name', action="store", required=True)
12 | 
13 | parser.add_argument("-o", "--output-tsv", help='Output table with adjusted MAG IDs (Default: "out.tsv")', action="store", default="out.tsv")
14 | 
15 | args = parser.parse_args()
16 | 
17 | # reading in mapping file into dictionary
18 | map_dict = {}
19 | with open(args.map_tsv) as mapping:
20 |     for line in mapping:
21 |         line = line.strip().split("\t")
22 |         map_dict[line[1]] = line[0]
23 | 
24 | # reading in output table from KEGGDecoder
25 | in_tab = pd.read_csv(args.input_tsv, sep = "\t", index_col = 0)
26 | 
27 | # renaming back to what they were before modifying to be compliant with KEGGDecoder
28 | mod_tab = in_tab.rename(index = map_dict)
29 | 
30 | # writing out modified file
31 | mod_tab.to_csv(args.output_tsv, sep = "\t")
32 | 


--------------------------------------------------------------------------------
/workflows/sra-download-wf/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | # SRA-download workflow change log
 2 | 
 3 | ## 1.1.0
 4 | - workflow can now also handle sra objects that hold single-end data
 5 | 
 6 | ## 1.0.1
 7 | - updates to `scripts/combine-sra-accessions.sh`
 8 |   - more efficient now by not cat'ing if there is only one SRR for a sample
 9 |   - default is to remove original files now, and `-k` needs to be added in order to keep them
10 | 
11 | ## 1.0.0
12 | - initial workflow release
13 | 


--------------------------------------------------------------------------------
/workflows/sra-download-wf/README.md:
--------------------------------------------------------------------------------
 1 | # [bit](https://github.com/AstrobioMike/bit) sra-download workflow
 2 | This is a [snakemake](https://snakemake.github.io/) workflow for downloading reads from [NCBI's SRA](https://www.ncbi.nlm.nih.gov/sra) in fastq format. For all workflows available with _bit_, see [here](https://github.com/AstrobioMike/bit?tab=readme-ov-file#workflows).
 3 | 
 4 | ---
 5 | 
 6 | * [**Overview**](#overview)
 7 | * [**Usage**](#usage)
 8 |   * [Retrieving the workflow](#retrieving-the-workflow)
 9 |   * [Creating the input file and modifying the config.yaml](#creating-the-input-file-and-modifying-the-configyaml)
10 |   * [Running the workflow](#running-the-workflow)
11 |   * [Combining SRRs if needed](#combining-srrs-if-needed)
12 | * [**Version info**](#version-info)
13 | 
14 | ---
15 | 
16 | ## Overview
17 | 
18 | This workflow will download reads from SRA based on input run accessions (i.e., the accessions starting with ERR..., SRR, or DRR) using prefetch and fasterq-dump.
19 | 
20 | ---
21 | 
22 | ## Usage
23 | _bit_ should be installed via conda as described [here](https://github.com/AstrobioMike/bit?tab=readme-ov-file#conda-install).
24 | 
25 | ### Retrieving the worklfow
26 | 
27 | ```bash
28 | bit-get-workflow sra-download
29 | ```
30 | 
31 | ### Creating the input file and modifying the config.yaml
32 | Before running it, you first need to make a file holding the target run accessions, one per line in a single-column.
33 | 
34 | The path to that file needs to be set for the "target_sra_accessions_file" variable in the config.yaml.
35 | 
36 | ### Running the workflow
37 | After the target run accessions file has been created and set in the config.yaml, here's an example of how it could be run (note that it should still be run inside the _bit_ conda environment):
38 |  
39 | ```bash
40 | snakemake --use-conda --conda-prefix ${CONDA_PREFIX}/envs -j 4 -p
41 | ```
42 | 
43 | - `--use-conda` – this specifies to use the conda environments included in the workflow
44 | - `--conda-prefix` – this allows us to point to where the needed conda environments should be stored. Including this means if we use the workflow on a different dataset somewhere else in the future, it will re-use the same conda environments rather than make new ones. The value listed here, `${CONDA_PREFIX}/envs`, is the default location for conda environments (the variable `${CONDA_PREFIX}` will be expanded to the appropriate location on whichever system it is run on).
45 | - `-j` – this lets us set how many jobs Snakemake should run concurrently (keep in mind that many of the thread and cpu parameters set in the config.yaml file will be multiplied by this)
46 | - `-p` – specifies to print out each command being run to the screen
47 | 
48 | See `snakemake -h` for more options and details.
49 | 
50 | ### Combining SRRs if needed
51 | 
52 | Sometimes multiple "runs" belong to the same sample, but we still need to download the runs independently from SRA. A helper script is included with the workflow to facilitate combining those multiple read files into one forward and one reverse for a given sample. We first need to prepare a tab-delimited mapping file with 2 columns that lists:
53 | 1. The ultimate sample name we want to have
54 | 2. The SRR accessions that belong with each sample name
55 | 
56 | Here is an example:
57 | 
58 | ```bash
59 | cat map.tsv
60 | ```
61 | 
62 | ```bash
63 | Sample-1    SRR123456
64 | Sample-1    SRR123457
65 | Sample-2    SRR123458
66 | Sample-3    SRR123459
67 | Sample-3    SRR123460
68 | ```
69 | 
70 | For example, SRR123456 and SRR123457 read files would be combined (via `cat`) into one forward and one reverse read file called "Sample-1_R1.fastq.gz" and "Sample-1_R2.fastq.gz". Since Sample-2 only has one input, those files would just be renamed.
71 | 
72 | The helper script takes two positional arguments: the first being the tsv mapping file; and the second being the path to the directory holding all the starting fastq files.
73 | 
74 | Example usage:
75 | ```bash
76 | bash scripts/combine-sra-accessions.sh -i map.tsv -d fastq-files/
77 | ```
78 | 
79 | Note that by default the original files will be removed after they are combined or renamed. If you want to keep them, provide the `-k` flag also. See `bash scripts/combine-sra-accessions.sh -h` for more info. This helper script is only suitable for paired-end data.
80 | 
81 | ---
82 | 
83 | ## Version info
84 | Note that the workflows are versioned independently of the _bit_ package. When you pull one with `bit-get-workflow`, the directory name will have the version, and it is also listed at the top of the Snakefile.
85 | 
86 | All versions of programs used can be found in their corresponding conda yaml file in the envs/ directory. 
87 | 


--------------------------------------------------------------------------------
/workflows/sra-download-wf/Snakefile:
--------------------------------------------------------------------------------
  1 | ###################################################################################################
  2 | ## Snakefile for the "bit" SRA download workflow                                                 ##
  3 | ## Version 1.1.0                                                                                 ##
  4 | ## bit: https://github.com/AstrobioMike/bit                                                      ##
  5 | ##                                                                                               ##
  6 | ## If you use this workflow in a publication, please consider citing :)                          ##
  7 | ##   Lee M. bit: a multipurpose collection of bioinformatics tools. F1000Research 2022, 11:122.  ##
  8 | ##   https://doi.org/10.12688/f1000research.79530.1                                              ##
  9 | ###################################################################################################
 10 | 
 11 | import os
 12 | import pandas as pd
 13 | 
 14 | configfile: "config.yaml"
 15 | 
 16 | 
 17 | ########################################
 18 | ############# General Info #############
 19 | ########################################
 20 | 
 21 | """
 22 | See the corresponding 'config.yaml' file for general use information.
 23 | Variables that may need to be adjusted should usually be changed there, not here.
 24 | """
 25 | 
 26 | 
 27 | ########################################
 28 | ######## Some colors and helpers #######
 29 | ########################################
 30 | 
 31 | tty_colors = {
 32 |     'green' : '\033[0;32m%s\033[0m',
 33 |     'yellow' : '\033[0;33m%s\033[0m',
 34 |     'red' : '\033[0;31m%s\033[0m'
 35 | }
 36 | 
 37 | def color_text(text, color='green'):
 38 |     if sys.stdout.isatty():
 39 |         return(tty_colors[color] % text)
 40 |     else:
 41 |         return(text)
 42 | 
 43 | 
 44 | ################################################
 45 | #### Reading target SRA accesions into list ####
 46 | ################################################
 47 | target_sra_accessions_list = [line.strip() for line in open(config["target_sra_accessions_file"])]
 48 | 
 49 | ## when i want to try integrating combinging runs that belong to the same sample given an input table, revisit what i did here:
 50 |     # https://github.com/AstrobioMike/NASA-Exo-N-project/blob/main/metagenomics/workflow/Snakefile
 51 | 
 52 | ################################################
 53 | ############## Pre-flight checks ###############
 54 | ################################################
 55 | 
 56 | # making sure there are all unique names
 57 | if len(set(target_sra_accessions_list)) != len(target_sra_accessions_list):
 58 | 
 59 |     print(color_text(f"\n    Not all sample IDs in the '{config['target_sra_accessions_file']}' file are unique :(\n", "yellow"))
 60 |     print("    Exiting for now.\n")
 61 |     exit(1)
 62 | 
 63 | # making sure they all start with an expected prefix
 64 | expected_prefixes = ["SRR", "ERR", "DRR"]
 65 | for acc in target_sra_accessions_list:
 66 |     if not any([acc.startswith(prefix) for prefix in expected_prefixes]):
 67 | 
 68 |         print(color_text(f"\n    At least one of the sample IDs in the '{config['target_sra_accessions_file']}' file (e.g., '{acc}') does not start with an expected prefix :(\n", "yellow"))
 69 |         print(f"    Acceptable SRA prefixes are: {', '.join(expected_prefixes)}\n")
 70 |         print("    Exiting for now.\n")
 71 |         exit(1)
 72 | 
 73 | 
 74 | ########################################
 75 | ######## Setting up directories ########
 76 | ########################################
 77 | 
 78 | triggers_dir = "logs/triggers"
 79 | dirs_to_create = ["fastq-files", "logs",
 80 |                   "benchmarks", triggers_dir]
 81 | 
 82 | if config["keep_sra_files"] == "TRUE":
 83 |     dirs_to_create.append("sra-files")
 84 | 
 85 | for dir in dirs_to_create:
 86 |     try:
 87 |         os.mkdir(dir)
 88 |     except:
 89 |         pass
 90 | 
 91 | 
 92 | ########################################
 93 | ############# Rules start ##############
 94 | ########################################
 95 | 
 96 | 
 97 | rule all:
 98 |     input:
 99 |         expand(f"{triggers_dir}/{{acc}}/all.done", acc = target_sra_accessions_list)
100 |     shell:
101 |         """
102 |         bash scripts/combine-benchmarks.sh
103 |         """
104 | 
105 | 
106 | rule prefetch:
107 |     """
108 |     This rule runs prefetch on all target SRA accessions.
109 |     """
110 |     conda:
111 |         "envs/sra-dl.yaml"
112 |     params:
113 |         max_size = config["prefetch_max_size"]
114 |     output:
115 |         "{acc}-tmp/{acc}/{acc}.sra"
116 |     benchmark:
117 |         "benchmarks/{acc}-prefetch-benchmarks.tsv"
118 |     log:
119 |         "logs/prefetch-{acc}.log"
120 |     shell:
121 |         """
122 |         prefetch --max-size {params.max_size} --progress -O {wildcards.acc}-tmp {wildcards.acc} > {log} 2>&1
123 |         """
124 | 
125 | 
126 | rule fasterq_dump:
127 |     """
128 |     This rule runs fasterq-dump on all target SRA accessions.
129 |     """
130 |     conda:
131 |         "envs/sra-dl.yaml"
132 |     input:
133 |         "{acc}-tmp/{acc}/{acc}.sra"
134 |     output:
135 |         touch(f"{triggers_dir}/{{acc}}/fq-dump.done")
136 |     params:
137 |         num_threads = config["num_threads"]
138 |     benchmark:
139 |         "benchmarks/{acc}-fasterq-dump-benchmarks.tsv"
140 |     log:
141 |         "logs/fasterq-dump-{acc}.log"
142 |     shell:
143 |         """
144 |         fasterq-dump --progress -O {wildcards.acc}-tmp/ --seq-defline '@$ac.$si/$ri $sn' --qual-defline '+' --threads {params.num_threads} {input} > {log} 2>&1
145 | 
146 |         # renaming the files to have R1/R2 in their names if they are paired end
147 |         if [ -f {wildcards.acc}-tmp/{wildcards.acc}_1.fastq ]; then
148 |             mv {wildcards.acc}-tmp/{wildcards.acc}_1.fastq {wildcards.acc}-tmp/{wildcards.acc}_R1.fastq.fastq
149 |             mv {wildcards.acc}-tmp/{wildcards.acc}_2.fastq {wildcards.acc}-tmp/{wildcards.acc}_R2.fastq.fastq
150 |         fi
151 |         """
152 | 
153 | 
154 | rule gzip_fastq_files:
155 |     """
156 |     This rule gzips the fastq files.
157 |     """
158 |     conda:
159 |         "envs/sra-dl.yaml"
160 |     input:
161 |         f"{triggers_dir}/{{acc}}/fq-dump.done"
162 |     output:
163 |         touch(f"{triggers_dir}/{{acc}}/all.done")
164 |     params:
165 |         num_threads = config["num_threads"],
166 |         initial_sra_dir = "{acc}-tmp/",
167 |         keep_sra_files = config["keep_sra_files"]
168 |     shell:
169 |         """
170 |         pigz -p {params.num_threads} {wildcards.acc}-tmp/{wildcards.acc}*.fastq
171 | 
172 |         # moving files to the final directory
173 |         mv {wildcards.acc}-tmp/{wildcards.acc}*.fastq.gz fastq-files/
174 | 
175 |         # removing initial SRA directory unless specified otherwise in config.yaml
176 |         if [ "{params.keep_sra_files}" == "TRUE" ]; then
177 |             mv $(find {wildcards.acc}-tmp -name "*.sra") sra-files/
178 |         fi
179 | 
180 |         rm -rf {params.initial_sra_dir}
181 |         """
182 | 


--------------------------------------------------------------------------------
/workflows/sra-download-wf/config.yaml:
--------------------------------------------------------------------------------
 1 | ###################################################################################################
 2 | ## Config file for the "bit" SRA download workflow                                               ##
 3 | ## bit: https://github.com/AstrobioMike/bit                                                      ##
 4 | ##                                                                                               ##
 5 | ## If you use this workflow in a publication, please consider citing :)                          ##
 6 | ##   Lee M. bit: a multipurpose collection of bioinformatics tools. F1000Research 2022, 11:122.  ##
 7 | ##   https://doi.org/10.12688/f1000research.79530.1                                              ##
 8 | ###################################################################################################
 9 | 
10 | ############################################################
11 | ##################### VARIABLES TO SET #####################
12 | ############################################################
13 | 
14 | ## single-column file with target sra accessions (these should start with SRR, ERR, or DRR)
15 | target_sra_accessions_file:
16 |     "target-sra-accs.txt"
17 | 
18 | 
19 | ######################################################################
20 | ###### These only need to be altered if we want to change them #######
21 | ######################################################################
22 | 
23 | ## for more info on prefetch and fasterq-dump options, see: https://github.com/ncbi/sra-tools/wiki/08.-prefetch-and-fasterq-dump
24 | 
25 | ## number of threads to use PER snakemake job (which is set with the -j parameter passed to snakemake call)
26 |     # passed to fasterq-dump and pigz (many may be running concurrently)
27 | num_threads:
28 |     8
29 | 
30 | ## prefetch --max-size argument
31 | prefetch_max_size:
32 |     "500G"
33 | 
34 | ## keep sra objects after download (TRUE for yes, anything else is treated as no)
35 | keep_sra_files:
36 |     "FALSE"
37 | 
38 | ## example usage command ##
39 | # snakemake --use-conda --conda-prefix ${CONDA_PREFIX}/envs -j 2 -p
40 | 
41 | # `--use-conda` – this specifies to use the conda environments included in the workflow
42 | # `--conda-prefix` – this allows us to point to where the needed conda environments should be stored. Including this means if we use the workflow on a different dataset somewhere else in the future, it will re-use the same conda environments rather than make new ones. The value listed here, `${CONDA_PREFIX}/envs`, is the default location for conda environments (the variable `${CONDA_PREFIX}` will be expanded to the appropriate location on whichever system it is run on).
43 | # `-j` – this lets us set how many jobs Snakemake should run concurrently (keep in mind that many of the thread and cpu parameters set in the config.yaml file will be multiplied by this)
44 | # `-p` – specifies to print out each command being run to the screen
45 | 
46 | # See `snakemake -h` for more options and details.
47 | 


--------------------------------------------------------------------------------
/workflows/sra-download-wf/envs/sra-dl.yaml:
--------------------------------------------------------------------------------
1 | name: sra-dl
2 | channels:
3 |     - conda-forge
4 |     - bioconda
5 |     - defaults
6 | dependencies:
7 |     - sra-tools=3.1.0
8 |     - pigz=2.8
9 | 


--------------------------------------------------------------------------------
/workflows/sra-download-wf/scripts/combine-benchmarks.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | set -e
 3 | 
 4 | ls benchmarks/ > benchmark-filenames.tmp
 5 | 
 6 | head -n 1 benchmarks/$( head -n 1 benchmark-filenames.tmp ) > benchmark-header.tmp
 7 | 
 8 | paste <( printf "process" ) benchmark-header.tmp > building-tab.tmp
 9 | 
10 | for file in $(cat benchmark-filenames.tmp)
11 | do
12 | 
13 |     cat <( paste <( echo ${file} | sed 's/-benchmarks.tsv//' ) <( tail -n +2 benchmarks/${file} ) ) >> building-tab.tmp
14 | 
15 | done
16 | 
17 | mv building-tab.tmp benchmarks/ALL-benchmarks.tsv
18 | rm -rf benchmark-filenames.tmp benchmark-header.tmp
19 | 


--------------------------------------------------------------------------------
/workflows/sra-download-wf/scripts/combine-sra-accessions.sh:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env bash
  2 | 
  3 | RED='\033[0;31m'
  4 | YELLOW='\033[0;33m'
  5 | GREEN='\033[0;32m'
  6 | NC='\033[0m'
  7 | 
  8 | # a function for providing help
  9 | print_help() {
 10 | 
 11 | 
 12 |     printf "\n                              ${YELLOW}HELP MENU${NC}"
 13 |     printf "\n  ${YELLOW}********************************************************************${NC}\n"
 14 | 
 15 |     printf "\n  Sometimes multiple SRA accessions comprise 1 sample. This script\n"
 16 |     printf "  is a helper to combine multiple SRA fastq files together.\n\n"
 17 |     printf "  It expects as input a tsv, with no header, where the first column\n"
 18 |     printf "  is the sample name, and the second column is the SRA accession, e.g.:\n\n"
 19 | 
 20 |     printf "              Sample-1\tSRR123456\n"
 21 |     printf "              Sample-1\tSRR123457\n"
 22 |     printf "              Sample-2\tSRR123458\n"
 23 |     printf "              Sample-3\tSRR123459\n"
 24 |     printf "              Sample-3\tSRR123460\n\n"
 25 | 
 26 |     printf "  It takes two positional arguments, the first being the tsv mapping file,\n"
 27 |     printf "  and the second being the path to the directory holding all the fastq files.\n\n"
 28 | 
 29 |     printf "    Ex. Usage:\n\t bash scripts/combine-sra-accessions.sh -i map.tsv -d fastq-files/ \n\n"
 30 | 
 31 |     printf "  Note that this is a simple bash script, it is for paired-end sample-sets only,\n"
 32 |     printf "  and there is not much checked to catch human error on the input table.\n\n"
 33 | 
 34 |     printf "  By default it will remove the initial fastq files. Provide the '-k' flag if you want to\n"
 35 |     printf "  keep them.\n"
 36 | 
 37 |     printf "\n  ${YELLOW}********************************************************************${NC}\n\n"
 38 | 
 39 |     exit
 40 | 
 41 | }
 42 | 
 43 | if [ "$#" == 0 ] || [ $1 == "-h" ]; then
 44 |     print_help
 45 | fi
 46 | 
 47 | 
 48 | ########################################
 49 | ### Setting up and parsing arguments ###
 50 | ########################################
 51 | remove_original_fastqs="true"
 52 | 
 53 | while getopts ":i:d:k" args; do
 54 |     case "${args}"
 55 |     in
 56 |         i) map_file=$OPTARG;;
 57 |         d) fastq_dir=$OPTARG;;
 58 |         k) remove_original_fastqs="false";;
 59 |         \?) printf "\n  ${RED}Invalid argument: -${OPTARG}${NC}\n" 1>&2
 60 |             print_help
 61 |             ;;
 62 |         :)
 63 |             echo "Invalid option: $OPTARG requires an argument" 1>&2
 64 |             print_help
 65 |             ;;
 66 |     esac
 67 | done
 68 | 
 69 | ##################################################
 70 | ## Making sure required arguments were provided ##
 71 | ##################################################
 72 | if [ ! -n "${map_file}" ] || [ ! -n "${fastq_dir}" ]; then
 73 | 
 74 |     printf "\n    ${RED}ERROR${NC}: The required arguments were not provided. See help below.\n"
 75 |     print_help
 76 | 
 77 | fi
 78 | 
 79 | 
 80 | ########################################
 81 | ########### Pre-flight checks ##########
 82 | ########################################
 83 | 
 84 | 
 85 | # check that the first positional argument is a file
 86 | if [ ! -f ${map_file} ]; then
 87 | 
 88 |     printf "\n    ${RED}ERROR${NC}: The file '${map_file}' does not exist.\n"
 89 |     print_help
 90 | 
 91 | fi
 92 | 
 93 | # check that the second positional argument is a directory
 94 | if [ ! -d ${fastq_dir} ]; then
 95 | 
 96 |     printf "\n    ${RED}ERROR${NC}: The directory '${fastq_dir}' does not exist.\n"
 97 |     print_help
 98 | 
 99 | fi
100 | 
101 | # checking input table has 2 columns
102 | if [ $(head -n 1 ${map_file} | awk '{print NF}') -ne 2 ]; then
103 | 
104 |     printf "\n    ${RED}ERROR${NC}: The input table must have 2 columns. See help below.\n"
105 |     print_help
106 | 
107 | fi
108 | 
109 | 
110 | ########################################
111 | ########## Getting to work #############
112 | ########################################
113 | 
114 | starting_dir=$(pwd)
115 | path_to_map=$(realpath ${map_file})
116 | 
117 | # moving into directory to make it easier to run cat
118 | cd ${fastq_dir}
119 | 
120 | printf "\n"
121 | 
122 | for sample in $(cut -f 1 ${path_to_map} | sort -u)
123 | do
124 | 
125 |     printf "  Currently working on: ${sample} ...\r"
126 | 
127 |     target_R1s=$(grep ${sample} ${path_to_map} | cut -f 2 | sed 's/$/_R1.fastq.gz/' | tr '\n' ' ' | sed 's/ $//')
128 |     target_R2s=$(grep ${sample} ${path_to_map} | cut -f 2 | sed 's/$/_R2.fastq.gz/' | tr '\n' ' ' | sed 's/ $//')
129 | 
130 | 
131 |     if [ ${remove_original_fastqs} == "true" ]; then
132 | 
133 |         # checking if there are multiple, if so we cat them; if just one, we just mv/rename it
134 |         if printf "${target_R1s}" | grep -q " "; then
135 | 
136 |             cat ${target_R1s} > ${sample}_R1.fastq.gz
137 |             cat ${target_R2s} > ${sample}_R2.fastq.gz
138 | 
139 |             rm ${target_R1s}
140 |             rm ${target_R2s}
141 | 
142 |         else
143 | 
144 |             mv ${target_R1s} ${sample}_R1.fastq.gz
145 |             mv ${target_R2s} ${sample}_R2.fastq.gz
146 | 
147 |         fi
148 | 
149 |     else
150 | 
151 |         cat ${target_R1s} > ${sample}_R1.fastq.gz
152 |         cat ${target_R2s} > ${sample}_R2.fastq.gz
153 | 
154 |     fi
155 | 
156 | done
157 | 
158 | # moving back to initial dir
159 | cd ${starting_dir}
160 | 
161 | printf "\n\n                     ${GREEN}DONE!${NC}\n\n"
162 | printf "  ${YELLOW}The combined fastq files are in the directory: ${fastq_dir}${NC}\n\n"
163 | 
164 | if [ ${remove_original_fastqs} == "true" ]; then
165 | 
166 |     printf "  ${YELLOW}The original fastq files were removed because the '-k' flag was not provided.${NC}\n\n"
167 | 
168 | else
169 | 
170 |     printf "  ${YELLOW}Note that the original fastq files were left because the '-k' flag was provided.${NC}\n\n"
171 | 
172 | fi
173 | 


--------------------------------------------------------------------------------