├── example ├── NJST258_1__CP006923.gbk.gz └── gbk2tbl_modifiers.txt ├── shell ├── unlinkFiles.sh ├── compile_ariba_reports.sh ├── gff2fasta.sh ├── shortenSPAdesContigNames.sh ├── cat_fasta.sh ├── renameFiles.sh ├── extractSingleChrFromVCF.sh ├── mkSymbolicLinks.sh ├── catContigStats.sh ├── rename_PE_readsets.sh ├── saveSPAdesOutputs.sh ├── blastShowRepeats.sh ├── extractSPAdesAssemblyStats.sh ├── catAssemblyStats.sh ├── catGzippedFASTQsPerDirectory.sh ├── linkFiles.sh ├── catGzippedFASTQsPerSample.sh ├── download_ena_pe_reads.sh └── download_reads_from_sra.sh ├── .gitignore ├── tabulateMUMmerCoordinates.py ├── extractSeqFromMultiFASTA.py ├── add_sample_name_FASTA.py ├── parse_ENA_sampleInfo_XML.py ├── rename_fasta_seqs.py ├── filename_generator.py ├── exclude_pseudo_seqs.py ├── seqlen.py ├── extractNuclRegionFromFASTA.py ├── gfa_stats.py ├── filterSPAdesContigs.py ├── gc.py ├── run_CutAdapt.py ├── mergeGenomicRegions.R ├── screen_genes_blast.py ├── linkPEreadsets.py ├── gbk2tsv.py ├── gbk2tbl.py ├── parse_biosample.py ├── README.md ├── extractSeqFromGBK.py ├── other_licence ├── Apache Licence-2.0.txt └── GPL-2.0.txt ├── downloadSeqFromNCBI.py └── LICENSE /example/NJST258_1__CP006923.gbk.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wanyuac/BINF_toolkit/HEAD/example/NJST258_1__CP006923.gbk.gz -------------------------------------------------------------------------------- /example/gbk2tbl_modifiers.txt: -------------------------------------------------------------------------------- 1 | [organism=Klebsiella pneumoniae] [strain=NJST258_1] [topology=circular] [moltype=DNA] [tech=wgs] [gcode=11] [country=USA] [isolation-source=urine] [collection-date=2010] -------------------------------------------------------------------------------- /shell/unlinkFiles.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Iterates the unlink command on 
multiple soft links. 3 | # Example: 4 | # unlinkFiles.sh reads/*.fastq.gz 5 | # unlinkFiles.sh $(cat link_list.txt) 6 | # Author: Yu Wan (14-15 April 2017) published at https://github.com/wanyuac/BINF_toolkit 7 | # License: Apache-2.0 8 | 9 | links=( $@ ) 10 | for i in ${links[@]}; do 11 | unlink $i 12 | done 13 | -------------------------------------------------------------------------------- /shell/compile_ariba_reports.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Run this script in conda environment 'ariba' 3 | # Yu Wan (1/7/2021) 4 | 5 | cd $1 6 | 7 | while read -r g 8 | do 9 | f="ariba_out/${g}/report.tsv" 10 | if [ -f "$f" ] 11 | then 12 | cp $f report/in.report.${g}.tsv 13 | else 14 | echo "Isolate $g did not have any report generated." 15 | fi 16 | done < "$2" 17 | 18 | ariba summary --no_tree --verbose summary report/in.report.*.tsv 19 | -------------------------------------------------------------------------------- /shell/gff2fasta.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script extract sequence regions from GFF3 files which contain complete assembled sequences. 3 | # In this kind of file, the sequences must be put at the end of each file. The sequence domain is 4 | # separated from the annotation domain by the delimiter "###FASTA". 5 | # Usage: bash gff2fasta.sh [input GFF file(s)] 6 | # Examples: 7 | # bash gff2fasta.sh *.gff 8 | # bash gff2fasta.sh strain1.gff strain2.gff ... 
9 | # Licence: GNU GPL 2.1 10 | # Author: Yu Wan (wanyuac@gmail.com) 11 | # Development history: 21/7/2016 12 | 13 | ext='fna' # the file extension 14 | 15 | for f in "$@"; do # loop through each argument 16 | base=`basename $f .gff` # remove the path as well as the file extension 17 | k=`grep -n '##FASTA' $f | cut -f1 -d ':'` 18 | tail -n +$((k + 1)) $f > ${base}.$ext # print lines starting with the kth 19 | done 20 | -------------------------------------------------------------------------------- /shell/shortenSPAdesContigNames.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (C) 2020 Yu Wan 3 | # Licensed under the GNU General Public Licence version 3 (GPLv3) . 4 | # Release: 28 Aug 2020 5 | 6 | display_usage(){ 7 | echo " 8 | Shorten contig names from SPAdes by substituting white spaces for '_length_' in sequence headers, 9 | so the latter part of sequence description will be ignored by Prokka and some other software. 10 | This is particularly useful for Prokka annotation because a long contig name consumes all space 11 | between the contig name and length in the output GenBank file, causing a problem to SniEff and so on. 12 | 13 | Command line: bash shortenSPAdesContigNames.sh [input FASTA file] > [new FASTA file] 14 | " 15 | } 16 | 17 | if [ -z $1 ]; then 18 | display_usage 19 | exit 20 | else 21 | sed 's/_length_/ /g' $1 22 | fi 23 | -------------------------------------------------------------------------------- /shell/cat_fasta.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script concatenates reference sequences of the same bacterial strain into a single multi-FASTA file. 3 | # The name of every FASTA file must follow the format: [strain name]__[accession number].fasta 4 | # For example, AH0650_Sm1__LFJS01000001.fasta. 5 | # Example command line: 6 | # bash concat_fasta.sh 'fasta/*__*.fasta' # Quotes are necessary! 
7 | # bash concat_fasta.sh 'fasta/strain__*.fasta' 8 | # bash concat_fasta.sh 'fasta/strain__*.faa' 9 | # Licence: GNU GPL 2.1 10 | # Author: Yu Wan (wanyuac@gmail.com) 11 | # Development history: 9 Aug 2016, 12 Sep 2016 12 | 13 | f=$1 14 | ext=${f##*.} # get the file name extension 15 | 16 | strains=$(ls -1 ${1} | xargs -I '{}' basename {} ".${ext}" | grep -oP '.+(?=__)' | sort -u) 17 | echo "$(echo $strains | tr " " "\n" | wc -l) strains are to be processed." 18 | 19 | path=$(dirname "$1") 20 | for s in ${strains}; do 21 | cat ${path}/${s}__*.${ext} > ${s}.${ext} 22 | done 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | *.lnk 59 | org.py 60 | -------------------------------------------------------------------------------- /shell/renameFiles.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (C) 2020 Yu Wan 3 | # Licensed under the GNU General Public Licence version 3 (GPLv3) . 4 | # Publication: 3/3/2020 5 | 6 | # Guidance ######################### 7 | display_usage() { 8 | echo " 9 | Usage: 10 | Rename or move files according to a two-column TSV file: [old filename]\t[new filename] 11 | Example command: renameFiles.sh names.tsv 12 | " 13 | } 14 | 15 | if [ -z $1 ]; then 16 | display_usage 17 | exit 18 | fi 19 | 20 | # Implementation ######################### 21 | while read line; do 22 | # Split the delimited string into an arrary of two elements. 23 | # Do not use IFS=$"\t" as it does not work correctly. 24 | # # https://unix.stackexchange.com/questions/410710/splitting-a-line-into-array-in-bash-with-tab-as-delimiter 25 | 26 | IFS=$'\t' read -r -a names <<< "$line" 27 | echo -e "Change or move: ${names[0]} --> ${names[1]}." 28 | mv ${names[0]} ${names[1]} 29 | done < "$1" # expect a file name as an input 30 | -------------------------------------------------------------------------------- /shell/extractSingleChrFromVCF.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Copyright (C) 2021 Yu Wan 4 | # Licensed under the GNU General Public Licence version 3 (GPLv3) . 
5 | # Publication: 28/4/2021; latest update: 28/4/2021 6 | 7 | display_useage() { 8 | echo " 9 | Extract variants in a specific chromosome from a VCF file. 10 | Command: extractSingleChrFromVCF.sh [input VCF] [output VCF] [target chromosome name] 11 | " 12 | } 13 | 14 | if [ -z $1 ]; then 15 | display_usage 16 | exit 17 | fi 18 | 19 | vcf_in=$1 20 | vcf_out=$2 21 | chr=$3 22 | outdir=$(dirname $vcf_out) 23 | tmpfile=$outdir/tmp.vcf 24 | 25 | n=$(grep -n "##contig= $vcf_out 27 | 28 | # The following two lines avoid printing a duplicated "##contig= $tmpfile 31 | 32 | grep '#CHROM' $tmpfile >> $vcf_out 33 | grep "$chr" $tmpfile >> $vcf_out 34 | 35 | rm -f $tmpfile 36 | -------------------------------------------------------------------------------- /tabulateMUMmerCoordinates.py: -------------------------------------------------------------------------------- 1 | """ 2 | Convert MUMmer's output into a CSV file. 3 | Yu Wan (20 Apr 2017) 4 | Example: python tabulateMUMerCoordinates.py input.coords > output.coords 5 | Reference: David Edwards' script filterCoords.py in the RedDog (https://github.com/katholt/RedDog) suite. 
6 | Licence: Apache-2.0 7 | Python version: 3.5.2 (but compatible to Python 2) 8 | """ 9 | 10 | import sys 11 | 12 | def main(): 13 | count = 0 14 | with open(sys.argv[1], "rU") as f: 15 | print("Start,End,Identity") 16 | for line in f: 17 | if count <= 5: # skip the first five lines, including the self-self comparison (100% identity) 18 | count += 1 19 | else: 20 | data = line.split("|") 21 | identity = float(data[3]) # drops all white spaces 22 | coords = data[0].split() # removes all white spaces as well 23 | start = coords[0] 24 | end = coords[1] 25 | print(",".join([start, end, str(identity)])) 26 | 27 | if __name__ == "__main__": 28 | main() -------------------------------------------------------------------------------- /shell/mkSymbolicLinks.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Makes symbolic links of any kind of files under another directory in accordance with a list of sample names as inputs from 3 | # either a single-column text file or stdin. 4 | # Format of the command line: 5 | # bash mkSymbolicLins.sh [suffix] [source directory] [output directory] sample_names.txt 6 | # Notice: 1. source and output directories must not be the same; 2. no forward slash ("/") should be attached to directory names. 7 | # Examples: 8 | # sh mkSymbolicLinks.sh '_snps.vcf' ~/data ~/links strain_names.txt 9 | # sh mkSymbolicLinks.sh '_snps.vcf' . ~/links strain_names.txt 10 | # cat strain_names.txt | sh mkVCFLinks.sh '_snps.vcf' ~/data ~/links 11 | # In all examples, symbolic links [strain name]__snps.vcf will be created under the directory ~/links. 12 | # Limitation: every pair of original file and its symobolic link shares the same filename suffix. Hence users must separate them 13 | # with different directories. 
14 | # Author: Yu Wan (20, 22 Apr 2017) 15 | # Licence: Apache-2.0 16 | 17 | while IFS= read -r id; do 18 | ln -s ${2}/${id}${1} ${3}/${id}${1} 19 | done < "${4:-/dev/stdin}" # takes $4 if defined otherwise takes the stdin 20 | -------------------------------------------------------------------------------- /shell/catContigStats.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script concatenates assembly statistics from our lab's previous assembly pipeline. 3 | # Copyright (C) 2017 Yu Wan 4 | # Licensed under the GNU General Public License (GPL) version 3 5 | # Creation: 31/10/2016; the latest version: 28/11/2017 6 | 7 | display_usage(){ 8 | echo " 9 | Concatenates *_contigStats.txt files that are generated using our lab's contigMetrics.py. 10 | Usage: bash catContigStats.sh ./assemblies/ 11 | Outputs: 12 | contigStats_files.txt 13 | contigStats_combined.csv 14 | " 15 | } 16 | 17 | HEADER="contigFile,numContigs,totalBases,N50,smallest,lowerQ,median,upperQ,largest" 18 | FILE_LIST="contigStats_files.txt" 19 | STATS="contigStats_combined.csv" 20 | 21 | # Check argument ########## 22 | if [ -z $1 ]; then 23 | echo "Error: a subject directory must be provided." 24 | display_usage 25 | exit 26 | fi 27 | 28 | # Find all contigStats.txt files ########## 29 | find $1 -name *_contigStats.txt -type f > $FILE_LIST 30 | echo "There are `cat ${FILE_LIST} | wc -l` genomes." 31 | 32 | # Print contig statistics into a CSV file ########## 33 | echo $HEADER > $STATS 34 | 35 | # Extract the second line of every file and appends it to the CSV file ========== 36 | files=`cat ${FILE_LIST}` 37 | n=0 38 | 39 | for f in ${files}; do 40 | r=`cat ${f} | wc -l` 41 | if [ "$r" -eq "2" ]; then 42 | tail -n 1 $f >> $STATS 43 | ((n++)) 44 | else 45 | echo "Warning: ${f} does not contain contig statistics." 46 | fi 47 | done 48 | 49 | echo "Success: ${n} lines of statistics have been transferred into ${STATS}." 
50 | -------------------------------------------------------------------------------- /shell/rename_PE_readsets.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Renaming Illumina paired-end readsets via symbolic links. 3 | # Copyright (C) 2023 Yu Wan 4 | # Licensed under the GNU General Public Licence version 3 (GPLv3) . 5 | # Publication: 31 July 2023; latest update: 31 July 2023 6 | 7 | display_usage() { 8 | echo "Rename paired-end readsets via symbolic links. 9 | Command line: 10 | rename_PE_readsets.sh [mapping file] [directory of original readsets] [directory for links] 11 | The mapping file is TSV-delimited and does not have any header. It consists of two columns: 12 | original name and new name, respectively. This script assumes filenames of readsets have 13 | suffices _1.fastq.gz and _2.fastq.gz" 14 | } 15 | 16 | if [ -z "$1" ] || [ $1 = "-h" ]; then 17 | display_usage 18 | exit 19 | fi 20 | 21 | dir_in="$2" 22 | dir_out="$3" 23 | 24 | if [ ! -d "$dir_in" ]; then 25 | echo "Error: input directory $dir_in was not found." >&2 26 | exit 27 | fi 28 | 29 | if [ ! -d "$dir_out" ]; then 30 | echo "Create output directory $dir_out" 31 | mkdir -p "$dir_out" 32 | fi 33 | 34 | while read -r line; do 35 | IFS=$'\t' read -r -a vals <<< "$line" 36 | i="${vals[0]}" # Original name 37 | j="${vals[1]}" # New name 38 | r1="$dir_in/${i}_1.fastq.gz" 39 | r2="$dir_in/${i}_2.fastq.gz" 40 | if [ -f "$r1" ] && [ -f "$r2" ]; then 41 | t1="$dir_out/${j}_1.fastq.gz" 42 | t2="$dir_out/${j}_2.fastq.gz" 43 | echo -e "$r1 -> $t1\t$r2 -> $t2" 44 | ln -s "$r1" "$t1" 45 | ln -s "$r2" "$t2" 46 | else 47 | echo "Error: $r1 or $r2 were not accessible. No links were created for sample ${i}." 
>&2 48 | fi 49 | done < "$1" 50 | -------------------------------------------------------------------------------- /shell/saveSPAdesOutputs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (C) 2021 Yu Wan 4 | # Licensed under the GNU General Public Licence version 3 (GPLv3) . 5 | # Release: 23 Mar 2021 6 | 7 | display_usage(){ 8 | echo " 9 | Assuming SPAdes have produced assembly files for each genome under a subdirectory named by the genome. 10 | For example, for genomes g1, g2, and g3, subdirectories g1/, g2/, and g3/ have been created under a 11 | parental directory assemblies/. 12 | 13 | Command line: sh saveSPAdesOutputs.sh [parental directory] 14 | Example command line: sh saveSPAdesOutputs.sh \$PWD 15 | 16 | Output: essential assembly files will be copied from subdirectories to the parental directory and be renamed 17 | by subdirectory names (genome names). Then users may delete subdirectories to save space. 18 | " 19 | } 20 | 21 | d=$1 # Parental (output) directory 22 | cd $d 23 | for dsub in `ls -1 -d */`; do 24 | g=`basename $dsub` # Remove the end '/' character and use the subdirectory name as the genome name 25 | 26 | # Assembly graphs 27 | cp $g/assembly_graph.fastg ./${g}.fastg 28 | cp $g/assembly_graph_after_simplification.gfa ./${g}__simplified.gfa 29 | cp $g/assembly_graph_with_scaffolds.gfa ./${g}__scaffolds.gfa # May be the same as assembly_graph_after_simplification.gfa. 30 | 31 | # Contigs 32 | cp $g/contigs.fasta ./${g}__contigs.fna # It has less nodes than does assembly_graph.fastg. 33 | cp $g/contigs.paths ./${g}__contigs.paths 34 | 35 | # Scaffolds 36 | cp $g/scaffolds.fasta ./${g}__scaffolds.fna # It has less contigs (scaffolded) than does contigs.fasta. 
37 | cp $g/scaffolds.paths ./${g}__scaffolds.paths 38 | 39 | # Supplementary information 40 | cp $g/spades.log ./${g}.log 41 | done 42 | -------------------------------------------------------------------------------- /shell/blastShowRepeats.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # We BLAST a nucleotide sequence against itself to identify repetitive regions. Of course, every region 3 | # matches to itself as well. This script is hence developed to remove such self-matches. It assumes 4 | # the input crunch file follows default columns in the '-fmt 6' output format of BLAST: 5 | # qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore 6 | # 7 | # Example commands (First, use the 'chmod' command to make this script executable): 8 | # blastn -query sample.fasta -db ref -task megablast -evalue 0.01 -perc_identity 90 -max_target_seqs 10 -outfmt 6 | blastShowRepeats.sh sample_vs_ref.crunch 9 | # or: blastShowRepeats.sh sample_vs_ref.crunch > sample_vs_ref_repeats.crunch 10 | # 11 | # Copyright (C) 2020 Yu Wan 12 | # Licensed under the GNU General Public Licence version 3 (GPLv3) . 13 | # Publication: 28 Apr 2020 14 | 15 | # Parse tab-delimited lines into an array 16 | while IFS=$'\t' read -r -a line 17 | do 18 | qstart="${line[6]}" 19 | qend="${line[7]}" 20 | sstart="${line[8]}" 21 | send="${line[9]}" 22 | 23 | # The following statement is the same as [[ "$qstart" -ne "$sstart" && "$qend" -ne "$send") ]]. 24 | # This if statement ignores hits where qstart = sstart AND qend = send. 25 | if [ "$qstart" -ne "$sstart" ] && [ "$qend" -ne "$send" ] 26 | then 27 | # We use a sub-shell to avoid overriding the current IFS: ( IFS=$'\t'; echo "${line[*]}" ). 28 | # https://superuser.com/questions/461981/how-do-i-convert-a-bash-array-variable-to-a-string-delimited-with-newlines/462400 29 | # It is also necessary here even though the IFS has been set to a tab character for read data. 
30 | # Otherwise, the output of echo is space-delimited. 31 | ( IFS=$'\t'; echo "${line[*]}" ) 32 | fi 33 | done < "${1:-/dev/stdin}" 34 | -------------------------------------------------------------------------------- /shell/extractSPAdesAssemblyStats.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (C) 2021 Yu Wan 4 | # Licensed under the GNU General Public Licence version 3 (GPLv3) . 5 | # Publication: 16 Apr 2021; latest update: 11 Sep 2021 6 | 7 | display_usage() { 8 | echo " 9 | Extract contig/scaffold names, lengths, and depths from a SPAdes output FASTA file and save them in a tab-delimited text file. 10 | Command line: 11 | extractSPAdesAssemblyStats.sh [input.fasta] > [isolate1.tsv] # Single-assembly mode: print a header line 12 | extractSPAdesAssemblyStats.sh [input.fasta] [isolate name] >> [fasta summary.tsv] # Multi-assembly mode: do not print a header line and append 13 | the isolate name in each line for the convenience of concatenating files. This mode is used in a loop that runs this script iteratively. 14 | For the multi-assembly mode, users may run \`echo -e \"Isolate\tNode\tLength\tDepth\" > asm_stats.tsv\` before the loop. 15 | " 16 | } 17 | 18 | if [ -z $1 ]; then 19 | display_usage 20 | exit 21 | fi 22 | 23 | if [ -z "$2" ]; then # Single-assembly mode 24 | echo -e 'Node\tLength\tDepth' # Print the header line. Note that the echo command automatically appends a newline character to the output string. 
25 | grep '>' $1 | sed -e 's/>//g' | sed -e 's/_length_/\t/g' | sed -e 's/_cov_/\t/g' 26 | else # Multi-assembly mode (namely, to loop through multiple FASTA files, where each iteration calls this script) 27 | IFS=$'\n' # https://stackoverflow.com/questions/8768420/how-to-convert-command-output-to-an-array-line-by-line-in-bash 28 | lines=( $(grep '>' $1 | sed -e 's/>//g' | sed -e 's/_length_/\t/g' | sed -e 's/_cov_/\t/g') ) 29 | for i in ${lines[@]}; do 30 | echo -e "${2}\t${i}" # Add the assembly name to the head of the result line; use 'echo', not 'printf' (which does not print a newline character at the end of the output) 31 | done 32 | fi 33 | -------------------------------------------------------------------------------- /shell/catAssemblyStats.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Concatenates files of assembly statistics that are generated using our lab's script contigMetrics.py. 4 | # Copyright (C) 2017 Yu Wan 5 | # Licensed under the GNU General Public License (GPL) version 3 6 | # First edition: 31/10/2016; the latest edition: 28/11/2017 7 | # Previous name: catContigStats.sh 8 | 9 | display_usage(){ 10 | echo " 11 | Concatenates files of assembly statistics that are generated using our lab's script contigMetrics.py. 
12 | Usage: bash catAssemblyStats.sh ./assemblies/ 13 | Outputs: 14 | assemblyStats_files.txt 15 | assemblyStats.tsv 16 | " 17 | } 18 | 19 | #Constants for the previous version of contigMetrics.py 20 | #HEADER="contigFile,numContigs,totalBases,N50,smallest,lowerQ,median,upperQ,largest" # previous version 21 | #STATS="contigStats_combined.csv" 22 | #FILE_LIST="contigStats_files.txt" 23 | # 24 | #Previous outputs: 25 | # contigStats_files.txt 26 | # contigStats_combined.csv 27 | 28 | HEADER="Assembly\tContig_number\tN50\tQ1\tQ2\tQ3\tMean\tSmallest\tLargest\tLength" 29 | STATS="assemblyStats.tsv" 30 | FILE_LIST="assemblyStats_files.txt" 31 | NAME_PATTERN="assembly_stats.tsv" # previous name: *_contigStats.txt 32 | 33 | # Check argument ########## 34 | if [ -z $1 ]; then 35 | echo "Error: a subject directory must be provided." 36 | display_usage 37 | exit 38 | fi 39 | 40 | # Find all contigStats.txt files ########## 41 | find $1 -name $NAME_PATTERN -type f > $FILE_LIST 42 | echo "There are `cat ${FILE_LIST} | wc -l` genomes." 43 | 44 | # Print contig statistics into a CSV file ########## 45 | echo -e $HEADER > $STATS # -e: convert each "\t" to a tab character 46 | 47 | # Extract the second line of every file and appends it to the CSV file ========== 48 | files=`cat ${FILE_LIST}` 49 | n=0 50 | 51 | for f in ${files}; do 52 | r=`cat ${f} | wc -l` 53 | if [ "$r" -eq "2" ]; then 54 | tail -n 1 $f >> $STATS 55 | ((n++)) 56 | else 57 | echo "Warning: ${f} does not contain contig statistics." 58 | fi 59 | done 60 | 61 | echo "Success: ${n} lines of statistics have been transferred into ${STATS}." 62 | -------------------------------------------------------------------------------- /shell/catGzippedFASTQsPerDirectory.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (C) 2021 Yu Wan 4 | # Licensed under the GNU General Public Licence version 3 (GPLv3) . 
5 | # First edition: 25 Aug 2021; the latest update: 5 Nov 2021 6 | 7 | # User guide #################### 8 | display_usage() { 9 | echo " 10 | Concatenates *.fastq.gz in each subdirectory into a single fastq.gz file. Useful for concatenating 11 | Guppy's demultiplexed (with options '--barcode_kits' and '--trim_barcodes' enabled) output sequence files. 12 | Usage: 13 | bash catGzippedFASTQsPerDirectory.sh [input parental directory] [output directory] [inputs.tsv] 14 | For example: ./catGzippedFASTQsPerDirectory.sh ~/fastq/pass ~/fastq/concat barcodes.tsv > cat_fastqs.log 15 | Input TSV file inputs.tsv consists of two columns: [subdirectory name]\t[output filename without '.fastq.gz' or so]. 16 | " 17 | } 18 | 19 | if [ -z $1 ]; then 20 | display_usage 21 | exit 22 | fi 23 | 24 | # Main utility #################### 25 | indir="$1" 26 | outdir="$2" 27 | 28 | if [ ! -d "$indir" ] 29 | then 30 | echo "Error: input parental directory $indir does not exist." >&2 # Print to standard error 31 | exit 32 | fi 33 | 34 | if [ ! -d "$outdir" ] 35 | then 36 | echo "Making output directory $outdir" 37 | mkdir $outdir 38 | fi 39 | 40 | n=0 # Count the number of subdirectories visited 41 | while read line # Please ensure every line in the input TSV file is ended with a newline character. 42 | do 43 | if [ ! -z "$line" ] 44 | then 45 | IFS=$'\t' read -r -a fields <<< "$line" # Parse the line into two fields by '\t'. 46 | input_subdir="$indir/${fields[0]}" 47 | if [ ! -d "$input_subdir" ] 48 | then 49 | echo "Skip inaccessible input directory $input_subdir" >&2 50 | else 51 | output="$outdir/${fields[1]}.fastq.gz" 52 | k=$(ls -1 $input_subdir/*.fastq.gz | wc -l) 53 | echo "Concatenate $k .fastq.gz files from $input_subdir into $output" 54 | zcat $input_subdir/*.fastq.gz | gzip > $output # Slower than 'cat *.fastq.gz' but generates a smaller file. 55 | (( n++ )) 56 | fi 57 | fi 58 | done < "$3" 59 | 60 | echo "FASTQ files of $n samples have been successfully concatenated." 
-------------------------------------------------------------------------------- /extractSeqFromMultiFASTA.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | This script extracts sequences from a multi-FASTA file using sequence IDs, which are defined in sequence headers. It 5 | basically filters contigs from a multi-FASTA file based on sequence IDs. The script ignores sequence annotation that 6 | is separated from the sequence ID by a white space. 7 | 8 | Input: a multi-FASTA file from stdin. It can be a gene-feature file (.ffn) downloaded from the NCBI nucleotide database 9 | or an assembly file comprised of several contig sequences. 10 | 11 | Argument: a comma-delimited string of target sequence IDs. 12 | 13 | Usage: 14 | cat input.fna | python extractSeqFromMultiFASTA.py "gene1,gene2,...,geneN" > output.fna 15 | cat input.fna | python extractSeqFromMultiFASTA.py "contig1,contig2,...,contigM" > output.fna 16 | Or, 17 | targets=$(cat seqIDs.txt) # seqIDs.txt contains a single comma-delimited line. 18 | cat input.fna | python extractSeqFromMultiFASTA.py $targets > output.fna 19 | For SPAdes assemblies: 20 | targets=$(cat seqIDs.txt) 21 | cat input__scaffolds.fna | sed 's/_length_/ /g' | python extractSeqFromMultiFASTA.py $targets > input__scaffolds_subset.fna 22 | 23 | Author: Yu Wan (wanyuac@126.com, https://github.com/wanyuac) 24 | Python version 2 and 3 compatible 25 | License: GNU GPL 2.1 26 | First edition: 5 July 2016; the latest edition: 13 Sep 2021 27 | Previous name: extract_fasta_loci.py 28 | """ 29 | 30 | from __future__ import print_function 31 | import sys 32 | from Bio import SeqIO 33 | from Bio.Seq import Seq 34 | from Bio.SeqRecord import SeqRecord 35 | 36 | def main(): 37 | # read the list of locus tags 38 | try: 39 | """ 40 | First, drop the newline character in any combinations of \r and \n. 41 | Otherwise, the last ID does not match to any sequence ID. 
42 | """ 43 | loci = sys.argv[1].rstrip("\r\n") 44 | loci = loci.split(",") # Parse the string for target sequence IDs 45 | except ValueError: 46 | print("Error: missing argument. A comma-delimited string of sequence IDs is required.") 47 | 48 | for seq in SeqIO.parse(sys.stdin, "fasta"): # read the input FASTA file from stdin 49 | if seq.id in loci: 50 | print(seq.format("fasta")) # write the current sequence to the stdout 51 | 52 | if __name__ == "__main__": 53 | main() 54 | -------------------------------------------------------------------------------- /shell/linkFiles.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Creating symbolic links according to a table of two columns: original file path and link path, separated by tab characters. 3 | # Copyright (C) 2017-2021 Yu Wan 4 | # Licensed under the GNU General Public Licence version 3 (GPLv3) . 5 | # First edition: 18 Oct 2017, the latest update: 6 Sep 2021 6 | # Update note: changed the input file format from CSV to TSV for convenience of users. 7 | 8 | # Display help information ############### 9 | display_usage(){ 10 | echo " 11 | Usage: 12 | chmod a+x linkFiles.sh # before the first run 13 | ./linkFiles.sh [input TSV file] 14 | ./linkFiles.sh [input TSV file] 1 # Add the second argument '1' to renew existing links. 15 | The TSV file should not contain a header line. The first column consists of original file paths, and 16 | the second column consists of link paths: 17 | [old name & path]\t[new name & path]\n 18 | An example of the TSV file: 19 | ~/data/genome1_1.fasta\t/scratch/input/genome1_unimelb.fna 20 | ~/data/genome1_2.fasta\t/scratch/input/genome1_zju.fna 21 | 22 | Notice a user must ensure the directory is accessible for storing links. 
23 | " 24 | } 25 | 26 | if [ -z $1 ] 27 | then 28 | display_usage 29 | exit 30 | fi 31 | 32 | # Set the override mode ############### 33 | if [ -z $2 ] 34 | then 35 | override=false 36 | elif [ "$2" -eq "1" ] 37 | then 38 | override=true 39 | else 40 | override=false 41 | fi 42 | 43 | # Otherwise, make symbolic links following the input file ############### 44 | while read line; do 45 | if [ ! -z "$line" ] # Sometimes empty lines are present in the input TSV file, causing an error of ln if keep them untreated. 46 | then 47 | IFS=$'\t' read -ra paths <<< "$line" # split the delimited string into an arrary of two elements 48 | target="${paths[1]}" 49 | origin="${paths[0]}" 50 | if [ ! -L "$target" ] 51 | then 52 | ln -s $origin $target 53 | elif [ "$override" = true ] 54 | then 55 | echo "Warning: redirecting existing link $target -> $(readlink ${target}) to ${origin}." 56 | unlink $target 57 | ln -s $origin $target 58 | else 59 | echo "Warning: skipped existing link $target" 60 | fi 61 | fi 62 | done < "$1" # expect a file name as an input -------------------------------------------------------------------------------- /add_sample_name_FASTA.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ''' 4 | This script adds a sample name at the beginning of each sequence in a FASTA file. For example, the header ">g1 description" becomes 5 | ">sample1__g1 description" after running this script. 

Author: Yu Wan (wanyuac@gmail.com, github.com/wanyuac)

Example: python add_sample_name_FASTA.py -i filename.txt (or filename.fna) -o output_dir -n

License: GNU GPL 2.0

First edition: Fri 27 Nov 2015
Last edition: Sat 28 Nov 2015
'''

from argparse import ArgumentParser
from Bio import SeqIO, SeqFeature
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

def parse_args():
    # Define and parse command-line options for this script.
    parser = ArgumentParser(description="Add a sample name to every header of sequences")
    parser.add_argument("-i", type = str, required = True, help = "A textual list of input files or the file name of a single FASTA file")
    parser.add_argument("-o", type = str, required = False, default = ".", help = "Output directory")
    parser.add_argument("-n", required = False, action="store_true", help = "Whether to extract the sample name from the file name rather than the path?")
    return parser.parse_args()

def main():
    # Prefix every sequence ID with a per-file sample name and write one output
    # FASTA per input file into the output directory.
    args = parse_args()

    # read file names from a list
    # NOTE(review): the ".txt" test matches anywhere in the argument, so a FASTA
    # named e.g. "x.txt.fna" would wrongly be treated as a list — assumes list
    # files are the only arguments containing ".txt".
    if ".txt" in args.i:
        with open(args.i, "rU") as f:  # "rU" is Python-2-era universal-newline mode
            fasta_files = f.read().splitlines()
    else:
        fasta_files = [args.i] # If there is just a single FASTA file to be processed.

    # read every FASTA file, change all sequence IDs and write into a new file
    for f in fasta_files:
        new_fasta = []
        fields = f.split("/")
        if args.n:
            sample = (fields[-1].split("__"))[0] # get the sample name from the first part of the file name
        else:
            # assumes the path has at least three '/'-separated components,
            # e.g. .../<sample>/<subdir>/<file> — TODO confirm against callers
            sample = fields[-3] # split the path and get the second last field as the sample name
        # Takes the token after the FIRST dot, so "a.b.fna" yields "b" rather than "fna".
        extension = (fields[-1].split("."))[1] # get the filename extension: faa, fna or ffn
        records = list(SeqIO.parse(open(f, "rU"), "fasta")) # records of a single GenBank file

        # process each sequence
        for s in records:
            s.id = "__".join([sample, s.id])
            s.description = " ".join(s.description.split(" ")[1 : ]) # remove the first field, which is identical to the sequence ID
            new_fasta.append(SeqRecord(s.seq, id = s.id, name = "", description = s.description))

        # Output path: <outdir>/<sample>.<extension>
        SeqIO.write(new_fasta, "%s/%s.%s" % (args.o, sample, extension), "fasta")

if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
/parse_ENA_sampleInfo_XML.py:
--------------------------------------------------------------------------------

"""
This script parses an ENA metadata file in XML format and prints a subset of information.

Usage: python parse_ENA_sampleInfo_XML.py ERP000909.xml > samples.txt

Input: an XML file exported for a list of ERS accession numbers from ENA using the REST URLs API. For example, one can download an XML file
for sample ERS086023 using http://www.ebi.ac.uk/ena/data/view/ERS086023&display=xml.

Output: a tab-delimited text file containing information retrieved from the XML file.
    study_accession, sample_accession, secondary_sample_accession, experiment_accession, run_accession, Isolate_ID, Host, Place_of_isolation, Year_of_isolation

Author of this version: Yu Wan (wanyuac@gmail.com, https://github.com/wanyuac)
Edition history: 6-7, 11 August 2015

Licence: GNU GPL 2.1
"""

import sys
import xml.etree.ElementTree as xmlTree

def get_domains(sample):
    # Extract a fixed set of fields from one <SAMPLE> element.
    # Returns a ten-element list in the column order printed by main().
    # NOTE(review): the SAMPLE_LINKS branch addresses children by hard-coded
    # indices (sample[4][...]), which assumes a fixed element layout in the
    # ENA export — verify against a current XML download before reuse.
    study = BioSample = ERS = experiment = run = isolate = strain = host = place = year = "NA" # default value of all fields
    for domain in sample:
        if domain.tag == "IDENTIFIERS":
            BioSample, ERS = sample[0][1].text, sample[0][0].text # text
        if domain.tag == "SAMPLE_LINKS":
            study = sample[4][0][0][1].text # visit nested elements with indices
            experiment = sample[4][1][0][1].text
            run = sample[4][2][0][1].text
        if domain.tag == "SAMPLE_ATTRIBUTES": # This domain may be variable in terms of attributes
            for attribute in domain:
                # Each attribute is a (TAG, VALUE) pair of child elements.
                if attribute[0].text == "collection_date":
                    year = attribute[1].text
                elif attribute[0].text == "isolate":
                    isolate = attribute[1].text
                elif attribute[0].text == "specific_host":
                    host = attribute[1].text
                elif attribute[0].text == "country":
                    place = attribute[1].text
                elif attribute[0].text == "strain":
                    strain = attribute[1].text
    return [study, BioSample, ERS, experiment, run, isolate, strain, host, place, year]

def main():
    # Parse the XML named on the command line and print one TSV row per sample.
    # Python 2 syntax (print statements).
    file = sys.argv[1]
    xml = xmlTree.parse(file).getroot() # parse an XML into a tree of elements

    # print the header line
    print "\t".join(["study_accession", "sample_accession", "secondary_sample_accession", "experiment_accession", "run_accession", "Isolate_ID", "Strain", "Host", "Place_of_isolation", "Year_of_isolation"])
    for sample in xml:
        print "\t".join(get_domains(sample))
    return

if __name__ == '__main__':
    main()

-------------------------------------------------------------------------------- /rename_fasta_seqs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Rename sequences in a FASTA file. It filters out sequences that are not included in the target list, 5 | when specified. 6 | 7 | Author: Yu Wan (wanyuac@gmail.com, https://github.com/wanyuac) 8 | Python version 2 and 3 compatible 9 | License: GNU GPL 2.1 10 | First edition: 11 Nov 2018, the latest revision: 14 Nov 2021. 11 | Created and finished in Nara, Japan. 12 | """ 13 | 14 | from __future__ import print_function 15 | import sys 16 | from Bio import SeqIO 17 | from Bio.Seq import Seq 18 | from Bio.SeqRecord import SeqRecord 19 | from argparse import ArgumentParser 20 | 21 | 22 | def parse_arguments(): 23 | parser = ArgumentParser(description="Read options and arguments") 24 | parser.add_argument("--fasta", "-f", dest = "fasta", type = str, required = True, help = "A FASTA file whose sequences will be renamed.") 25 | parser.add_argument("--mapping", "-m", dest = "mapping", type = str, required = True, help = "A tab-delimited file mapping original sequence IDs to new IDs.") 26 | parser.add_argument("--out", "-o", dest = "out", type = str, required = False, default = "./renamed.fasta", help = "Name and path for output FASTA file.") 27 | parser.add_argument("--keep_all", "-k", dest = "keep_all", action = "store_true", required = False, help = "Set to keep all sequences when some IDs are not found in the rename table.") 28 | parser.add_argument("--simple", "-s", dest = "simple", action = "store_true", required = False, help = "Drop original sequence names to make simple headers.") 29 | 30 | return parser.parse_args() 31 | 32 | 33 | def main(): 34 | args = parse_arguments() 35 | mapping = import_mapping_table(args.mapping) 36 | drop_prev_name = args.simple 37 | to_rename = list(mapping.keys()) 38 | in_fasta = open(args.fasta, "r") 39 | out = 
open(args.out, "w") 40 | 41 | for seq in SeqIO.parse(in_fasta, "fasta"): # read the input FASTA file 42 | if seq.id in to_rename: 43 | if drop_prev_name: 44 | seq.description = "" 45 | seq.id = mapping[seq.id] 46 | print(seq.format("fasta"), file = out) 47 | elif args.keep_all: 48 | print(seq.format("fasta"), file = out) 49 | 50 | in_fasta.close() 51 | out.close() 52 | 53 | return 54 | 55 | 56 | def import_mapping_table(rename): 57 | # Read the tab-delimited table for renaming sequences. 58 | with open(rename, "r") as f: 59 | lines = f.read().splitlines() 60 | 61 | r = {} 62 | for l in lines: 63 | old_id, new_id = l.split("\t") 64 | r[old_id] = new_id 65 | 66 | return(r) 67 | 68 | 69 | if __name__ == "__main__": 70 | main() 71 | -------------------------------------------------------------------------------- /filename_generator.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This script generates a list of file names based on a list of strings. It is useful if you want to generate a list of file names for read sets from a list of bacterial strain names. 
3 | 4 | Usage: python filename_generator.py -i -o -p -s -f -l -pe 5 | 6 | Input: a list of filenames 7 | Example: (inlist.txt) 8 | sample1__genes__results.txt 9 | sample2__genes__results.txt 10 | Command: python filename_generator.py -i inlist.txt -o outlist.txt -p /reads/ -s .fastq.gz -f 0 -l 7 -pe 11 | Output: a list of new file names generated on the basis of strings in inlist.txt 12 | Example: (outlist.txt) 13 | /reads/sample1_1.fastq.gz 14 | /reads/sample1_2.fastq.gz 15 | /reads/sample2_1.fastq.gz 16 | /reads/sample2_2.fastq.gz 17 | 18 | Author: Yu Wan (wanyuac@gmail.com, GitHub: https://github.com/wanyuac) 19 | First edition: 6 July 2015 20 | Last edition: 5 Nov 2015 21 | 22 | License: GNU GPL 2.1 23 | ''' 24 | 25 | from argparse import ArgumentParser 26 | 27 | def parse_args(): 28 | # Read arguments from the command line 29 | parser = ArgumentParser(description='Regenerate filenames.') 30 | # Inputs 31 | parser.add_argument('-i', type = str, required = True, help = 'File name of the input list') 32 | parser.add_argument('-o', type = str, required = True, help = 'File name of the output list') 33 | parser.add_argument('-p', type = str, required = False, help = 'The prefix added to the base for new filenames') 34 | parser.add_argument('-s', type = str, required = False, default = '.fastq.gz', help = 'The suffix added to the base for new filenames') 35 | parser.add_argument('-f', type = int, required = True, help = 'From which character of the base') 36 | parser.add_argument('-l', type = int, required = True, help = 'How many characters of the base should be used; -1: use the whole base') 37 | parser.add_argument('-pe', required = False, action='store_true', help = 'Whether read sets are paired-end') 38 | return parser.parse_args() 39 | 40 | def main(): 41 | args = parse_args() 42 | with open(args.i, 'rU') as in_f: 43 | bases = in_f.read().splitlines() 44 | out_f = open(args.o, 'w') 45 | 46 | if args.l > -1: # only use part of the base for constructing a new 
file name 47 | for i in range(0, len(bases)): 48 | bases[i] = bases[i][args.f : args.l] 49 | 50 | for item in bases: 51 | if args.pe: # if input files are related to paired-ended libraries 52 | for i in range(1, 3): 53 | filename = '{prefix}{base}_{index}{suffix}\n'.format(prefix = args.p, base = item, index = i, suffix = args.s) 54 | out_f.write(filename) 55 | else: 56 | filename = args.p + item[args.f : args.l] + args.s + '\n' 57 | out_f.write(filename) 58 | out_f.close() 59 | print 'All filenames were generated from bases.' 60 | 61 | if __name__ == '__main__': 62 | main() -------------------------------------------------------------------------------- /shell/catGzippedFASTQsPerSample.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (C) 2021 Yu Wan 4 | # Licensed under the GNU General Public Licence version 3 (GPLv3) . 5 | # First edition: 8 Sep 2021; the latest update: 9 Sep 2021 6 | # This script is derived from catGzippedFASTQsPerDirectory.sh. 7 | 8 | # User guide #################### 9 | display_usage() { 10 | echo " 11 | Concatenate *.fastq.gz of each sample into a single fastq.gz file. 12 | Usage: 13 | bash catGzippedFASTQsPerSample.sh [input parental directory] [output directory] [a list of input sample names] 14 | For example: ./catGzippedFASTQsPerSample.sh ~/fastq/pass ~/fastq/concat isolates.txt &> cat_fastqs.log 15 | There is one sample name per line in the input sample-name list. 16 | " 17 | } 18 | 19 | if [ -z $1 ]; then 20 | display_usage 21 | exit 22 | fi 23 | 24 | # Main utility #################### 25 | 26 | # 1. Set up directories =============== 27 | indir="$1" 28 | outdir="$2" 29 | 30 | if [ ! -d "$indir" ] 31 | then 32 | echo "Error: input parental directory $indir does not exist." >&2 # Print to standard error 33 | exit 34 | fi 35 | 36 | if [ ! -d "$outdir" ] 37 | then 38 | echo "Making output directory $outdir" 39 | mkdir $outdir 40 | fi 41 | 42 | # 2. 
Concatenate read files =============== 43 | n=0 # The counter of samples processed 44 | while read i # Please ensure every line in the input TSV file is ended with a newline character. 45 | do 46 | if [ ! -z "$i" ] # Skip empty lines 47 | then 48 | # Users may customise the following two commands to match their filenames. 49 | ra="$indir/*_${i}A-[1,2].bacterial-fastq-only.ngsservice.processed.R" # There should be only a single match. 50 | rb="$indir/*_${i}B-[1,2].bacterial-fastq-only.ngsservice.processed.R" # The same as above. 51 | ra1=`ls -1 ${ra}1.fastq.gz` 52 | ra2=`ls -1 ${ra}2.fastq.gz` 53 | rb1=`ls -1 ${rb}1.fastq.gz` 54 | rb2=`ls -1 ${rb}2.fastq.gz` 55 | 56 | # Concatenate the read files of the current sample 57 | if [ -f "$ra1" ] && [ -f "$ra2" ] && [ -f "$rb1" ] && [ -f "$rb2" ] 58 | then 59 | echo "Process read files of isolate $i" 60 | echo " Concatenating $ra1 and $rb1" 61 | zcat $ra1 $rb1 | gzip > $outdir/${i}_1.fastq.gz # Slower than 'cat *.fastq.gz' but generates a smaller file. 62 | echo -e " Concatenating $ra2 and ${rb2}\n" 63 | zcat $ra2 $rb2 | gzip > $outdir/${i}_2.fastq.gz 64 | (( n++ )) 65 | else 66 | echo -e "Skip file concatenation for sample $i due to absence of one or more read files.\n" >&2 67 | fi 68 | fi 69 | done < "$3" # Read sample names one-by-one from the input list 70 | 71 | echo "FASTQ files of $n samples have been successfully concatenated." -------------------------------------------------------------------------------- /exclude_pseudo_seqs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Exclude nucleotide/protein sequences of pseudo genes from a multi-Fasta file. 4 | 5 | Copyright (C) 2025 Yu Wan 6 | First release: 2 Jan 2025; latest update: 3 Jan 2025 7 | Licensed under the GNU General Public Licence version 3 (GPLv3) . 
"""

from Bio import SeqIO
from argparse import ArgumentParser
import re

def parse_arguments():
    # Define and parse command-line options.
    parser = ArgumentParser(description = "Filter pseudo sequences from a multi-FASTA file.")
    parser.add_argument('--input', '-i', dest = 'input', required = True, help = "Path to the input FASTA file.")
    parser.add_argument('--output', '-o', dest = 'output', required = False, default = 'filtered_output.fasta', help = "Path to the output FASTA file.")
    parser.add_argument('--pseudo', '-p', dest = 'pseudo', required = False, default = 'pseudo.fasta', help = "Path to the output FASTA file of excluded sequences")
    parser.add_argument('--discard_annot', '-d', dest = 'discard_annot', required= False, action = 'store_true', help = "A flag to discard sequence annotations and only keep names")
    return parser.parse_args()

def filter_pseudo_sequences(input_fasta, output_fasta, pseudo_fasta, discard_annot):
    # Route every record into output_fasta or pseudo_fasta, depending on the
    # "[pseudo=true]" tag in its header; optionally simplify the header.
    with open(input_fasta, 'r') as infile,\
        open(output_fasta, "w") as outfile,\
        open(pseudo_fasta, 'w') as pseudofile:
        for record in SeqIO.parse(infile, 'fasta'): # Iterate through sequences in the input FASTA file
            if "[pseudo=true]" in record.description: # Check if "[pseudo=true]" is in the header
                output_handle = pseudofile
            else:
                output_handle = outfile
            if discard_annot:
                record.id = rename_seq(record.description, record.id)
                # Setting description equal to the ID makes Biopython emit a
                # header containing the ID only.
                record.description = record.id
            SeqIO.write(record, output_handle, 'fasta')

def rename_seq(seq_description, seq_id):
    # Build "<locus_tag>__<protein_id>" when both tags are present in the
    # description; otherwise return the original sequence ID unchanged.
    match_locus_tag = re.search(r'\[locus_tag=([^\]]+)\]', seq_description) # Extract locus_tag and protein_id from the description using regular expressions
    match_protein_id = re.search(r'\[protein_id=([^\]]+)\]', seq_description)
    locus_tag = match_locus_tag.group(1) if match_locus_tag else None
    protein_id = match_protein_id.group(1) if match_protein_id else None
    if locus_tag and protein_id:
        new_id = f"{locus_tag}__{protein_id}"
    else:
        new_id = seq_id # No change to the sequence ID
    return new_id

def main():
    # Entry point: parse options, run the filter, and report output locations.
    args = parse_arguments()
    filter_pseudo_sequences(args.input, args.output, args.pseudo, args.discard_annot)
    print(f"Filtered sequences have been written to {args.output} and {args.pseudo}.")

if __name__ == '__main__':
    main()

--------------------------------------------------------------------------------
/seqlen.py:
--------------------------------------------------------------------------------

#!/usr/bin/env python

"""
Calculate sequence lengths in a multiFASTA file.
This script was inspired by Lesley Sitter's code published on biostars (www.biostars.org/p/148815/).

Command: python seqlen.py -i [input FASTA file] (-a) (-n) > [output TSV file]
Examples:
    python seqlen.py -i input.fna -a > seq_lengths.tsv # With sequence annotation in the sequence description
    python seqlen.py -i input.fna -n > seq_lengths.tsv # Only keep the sequence ID in the sequence description and ignore 'N' and '-' characters

Any character: a flag to keep sequence annotation in the output.

Copyright (C) 2021 Yu Wan
Licensed under the GNU General Public Licence version 3 (GPLv3) .
16 | Release: 2021; latest update: 26 Sep 2022 17 | """ 18 | 19 | import sys 20 | import re 21 | from argparse import ArgumentParser 22 | 23 | def parse_argument(): 24 | parser = ArgumentParser(description = "Calculating lengths of sequences in a FASTA file") 25 | parser.add_argument('-i', '--input', dest = 'i', type = str, required = True, help = "An input FASTA file") 26 | parser.add_argument('-a', '--annot', dest = 'a', action = 'store_true', help = "Keep sequence annotations in addition to sequence names") 27 | parser.add_argument('-n', '--nucl', dest = 'n', action = 'store_true', help = "Ignoring \'-\' and \'N\' in nucleotide sequences.") 28 | return parser.parse_args() 29 | 30 | def main(): 31 | args = parse_argument() 32 | 33 | with open(args.i, "r") as f: 34 | input_fasta = f.read().splitlines() # Newline characters are dropped. 35 | print("\t".join(["Name", "Length"])) # The header line 36 | seq = "" 37 | allow_write = False # A flag indicating that the first sequence name has been completely loaded. 38 | ignore_annot = not args.a 39 | 40 | for line in input_fasta: 41 | if line.startswith(">"): # A new sequence is encountered 42 | if allow_write: 43 | if args.n: 44 | seq = re.sub('[N-]', '', seq.upper()) # Stripping multiple characters from a string. Ref: stackoverflow.com/questions/3900054/python-strip-multiple-characters. 
45 | print("\t".join([seqid, str(len(seq))])) # Write the name and length of the previous sequence 46 | seq = "" 47 | if ignore_annot: 48 | seqid = line.split(" ")[0] # Get the sequence ID and ignore the sequence annotation 49 | else: 50 | seqid = line 51 | seqid = seqid[1 : ] # Drop the ">" character 52 | allow_write = True 53 | else: 54 | seq += line # concatenate lines of the current sequence 55 | 56 | print("\t".join([seqid, str(len(seq))])) # Write the length of the last sequence 57 | return 58 | 59 | if __name__ == '__main__': 60 | main() 61 | -------------------------------------------------------------------------------- /extractNuclRegionFromFASTA.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script extracts a region of nucleotides by its genomic coordinates from a fasta file 3 | 4 | Arguments 5 | -i: the path of the input file 6 | -n: the name of your selected contig 7 | -f: feature name specified by the user 8 | -s: the first nucleotide to be selected 9 | -e: the last nucelotide to be selected 10 | -o: the filename of the output 11 | 12 | Requirements 13 | Only one region should be selected 14 | The start and end positions should not spill out 15 | 16 | Author: Yu Wan (wanyuac@126.com) 17 | Date: 1 June and 17 July 2015 18 | GitHub: https://github.com/wanyuac/BINF_toolkit 19 | Licence: GNU GENERAL PUBLIC LICENSE Version 2 20 | Previous name: extract_nc_region.py, extract_nucl_region.py 21 | """ 22 | 23 | from argparse import ArgumentParser 24 | from Bio import SeqIO 25 | from Bio.SeqRecord import SeqRecord 26 | 27 | def parse_args(): 28 | # This function extracts arguments from the command line 29 | parser = ArgumentParser(description="Read arguments: input filename, start position, end position, and out filename") 30 | parser.add_argument("-i", type=str, required=True, help="Input path") # append an argument to variable "parser" 31 | parser.add_argument("-c", type=str, default="", help="Name of 
your selected contig; the first contig will be chosen if -c is not set.")
    parser.add_argument("-f", type=str, default="feature", help="The feature name")
    parser.add_argument("-s", type=int, required=True, help="Start position")
    parser.add_argument("-e", type=int, required=True, help="End position")
    parser.add_argument("-o", type=str, default="selected_region.fasta", help="File name of the output")
    return parser.parse_args()

def write_seq(contig, feature, start, end, output):
    # read and write FASTA files
    # Returns True when the region [start, end] (1-based, inclusive) lies within
    # the contig and has been written to 'output'; False otherwise.
    seqlen = end - start + 1 # the length of selected region
    contiglen = len(contig.seq)
    if start > contiglen or end > contiglen: # the genetic coordinates are out bounded
        flag = False
    else:
        seq = contig.seq[start - 1: end] # gets the selected sequence of this contig (1-based -> 0-based slice)
        descr = feature + "|" + str(start) + ".." + str(end) + "|" + str(seqlen) + " bp\n" # gets the header of this contig
        new_rec = SeqRecord(seq=seq, id=contig.id, name=feature, description=descr) # create a new SeqRecord instance. Note that the contig.name will not be written in a FASTA file (only in GenBank files).
        f = open(output, "w")
        SeqIO.write(new_rec, f, "fasta") # saves the selection
        f.close()
        flag = True
    return flag

def main():
    # Locate the requested contig (or default to the first) and extract the region.
    args = parse_args() # read arguments from the command line
    f = open(args.i, "rU") # supports universal newlines
    contigs = list(SeqIO.parse(f, "fasta"))
    found = False

    """
    To-do: what happens when start > end?
    """
    if args.c == "":
        found = write_seq(contig=contigs[0], feature=args.f, start=args.s, end=args.e, output=args.o) # read the first contig if -n is not set
    else:
        for contig in contigs:
            if contig.id == args.c: # if this is the selected contig
                found = write_seq(contig=contig, feature=args.f, start=args.s, end=args.e, output=args.o)
                break
    f.close()
    if found:
        print "The target sequence was extracted."
    else:
        print "No sequence was found."
    return

# The main program
if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
/gfa_stats.py:
--------------------------------------------------------------------------------

#!/usr/bin/env python
"""
This script generates summary statistics of nodes for a list of input GFA files
that are produced by the assembler SPAdes. It prints a tab-delimited file to the
standard output. See gfa-spec.github.io/GFA-spec/GFA1.html for specifications of the
GFA format.

Example command line:
    python gfa_stats.py *.gfa > gfa_stats.tsv
    python gfa_stats.py *.gfa | sed 's/__scaffolds.gfa//g' > gfa_stats.tsv
    python gfa_stats.py *.gfa 1> gfa_stats.tsv 2> gfa_stats.err

This script does not consider the overlap length (e.g., 77M), so every node length
reported by this script includes the overlap length. For singleton nodes (namely,
nodes that do not connect to any other nodes), the overlap length = 0.

Copyright (C) 2021 Yu Wan
Licensed under the GNU General Public Licence version 3 (GPLv3) .
Creation: 15 July 2021; the latest update: 16 July 2021
"""

import os
import sys
import glob
from collections import namedtuple

def main():
    """
    The way each OS deals with the wildcard differs.
    For example, Win10 passes the string '*.gfa' directly to the script
    as sys.argv[1], whereas Linux replaces this wildcard express with all files globed.
    """
    if len(sys.argv) > 2:
        gfas = sys.argv[1 : ]
    else:
        gfas = glob.glob(sys.argv[1]) # sys.argv[1] = "*.gfa"
    print("Summarise nodes in %i GFA files" % len(gfas), file = sys.stderr)

    # Print summary statistics
    print("\t".join(["Assembly", "Node", "Length", "Depth", "Kmers", "Singleton"]), file = sys.stdout) # The header line
    for g in gfas: # Filenames are used for filling the first column.
        if os.path.exists(g):
            summarise_gfa(g)
        else:
            print("Error: skip the inaccessible file " + g, file = sys.stderr)
    return


def summarise_gfa(gfa):
    """ Produces summary statistics for a GFA file """
    nodes = dict()          # node name -> Node(length, depth, kmers)
    linked_nodes = set()    # names of nodes that appear in at least one L line
    Node = namedtuple("Node", ["length", "depth", "kmers"])

    # Extracts and transforms the S (segment) and L (link) fields of each GFA file
    g = open(gfa, "r")
    line = g.readline().strip()
    while line:
        if line.startswith("S"): # A segment line
            _, node_name, seq, depth, kmers = line.split("\t")
            nodes[node_name] = Node(length = str(len(seq)), depth = depth[5 : ], kmers = kmers[5 : ]) # Drop "DP:f:" and "KC:i:"
        elif line.startswith("L"): # A link line, which always comes after the "S" lines
            _, f, _, t, _, _ = line.split("\t")
            if f != t:
                linked_nodes = linked_nodes.union({f, t})
            else:
                linked_nodes.add(f) # a self-loop still marks the node as linked
        else:
            # NOTE(review): parsing stops at the first line that is neither S nor
            # L — this assumes SPAdes GFA files start with S lines and end with P
            # lines (no leading H header); confirm for other GFA producers.
            break # The "P" lines make up the last section in the GFA file.
        line = g.readline().strip()
    g.close()

    # Mark non-singleton nodes
    for node_name, node_stats in nodes.items():
        is_singleton = "0" if node_name in linked_nodes else "1" # "0": no; "1": yes.
        print("\t".join([gfa, node_name, node_stats.length, node_stats.depth, node_stats.kmers, is_singleton]), file= sys.stdout)
    return


if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
/filterSPAdesContigs.py:
--------------------------------------------------------------------------------

#!/usr/bin/env python
"""
Filters contigs/scaffolds in SPAdes output FASTA files for a minimum length (bp) and a range of
read depths (min <= d <= max).

Outputs: (1) a filtered FASTA file to stdout, (2) a summary of the filtering process to stderr.
Note that sequence headers in the output FASTA file differ from the original format for the convenience of
subsequent analyses:
    Sequence headers in the input file: NODE_[n]_length_[L]_cov_[C]
    Sequence headers in the output file: NODE_[N] len=[L],cov=[C]
Columns in the output from stderr:
    Input file name, number of contigs passed the filters, number of contigs failed the filters, names of contigs failed the filters

Example command:
    python filterSPAdesContigs.py --input input.fna --min_len 200 --min_d 1 --max_d 100 1>filtered.fna 2>filter.log

Dependencies: Biopython, Python v3

Copyright (C) 2022 Yu Wan
Licensed under the GNU General Public Licence version 3 (GPLv3) .
Creation: 23 Jan 2022; the latest update: 23 Jan 2022.
22 | """ 23 | from argparse import ArgumentParser 24 | import os 25 | import sys 26 | from Bio import SeqIO 27 | from collections import namedtuple 28 | 29 | def parse_arguments(): 30 | parser = ArgumentParser(description = "Read options and arguments") 31 | parser.add_argument('--input', '-i', dest = 'input', type = str, required = True, help = "Input FASTA file from SPAdes") 32 | parser.add_argument('--min_len', '-l', dest = 'min_len', type = int, required = False, default = 1, help = "Minimum contig length [default: 1 bp (no filter)]") 33 | parser.add_argument('--min_d', '-d0', dest = 'min_d', type = float, required = False, default = 0, help = "Minimum read depth per contig [default: 0 (no filter)]") 34 | parser.add_argument('--max_d', '-d1', dest = 'max_d', type = float, required = False, default = 0, help = "Maximum read depth per contig [default: 0 (no filter)]") 35 | return parser.parse_args() 36 | 37 | def parse_seq_header(h): 38 | """ 39 | Parse sequence headers in SPAdes's output FASTA files 40 | Format of the headers: NODE_[n]_length_[L]_cov_[C]. 
41 | """ 42 | Contig = namedtuple('Contig', ['name', 'len', 'cov']) 43 | fields = h.split('_') 44 | return Contig(name = '_'.join(fields[0 : 2]), len = int(fields[3]), cov = float(fields[5])) 45 | 46 | def main(): 47 | args = parse_arguments() 48 | min_len = args.min_len 49 | min_d = args.min_d 50 | max_d = args.max_d 51 | filter_len = min_len > 1 52 | filter_min_d = min_d > 0 53 | filter_max_d = max_d > 0 and min_d < max_d 54 | fasta = os.path.basename(args.input) 55 | if not os.path.exists(args.input): 56 | print(f"Error: input file {fasta} does not exist.", file = sys.stderr) 57 | sys.exit(1) 58 | n_pass = 0 59 | n_fail = 0 60 | names_fail = [] 61 | for contig in SeqIO.parse(args.input, 'fasta'): 62 | c = parse_seq_header(contig.id) 63 | keep = True 64 | if filter_len: 65 | keep = keep and c.len >= min_len 66 | if filter_min_d: 67 | keep = keep and c.cov >= min_d 68 | if filter_max_d: 69 | keep = keep and c.cov <= max_d 70 | if keep: 71 | contig.id = c.name 72 | contig.description = f'len={c.len},cov={c.cov}' 73 | SeqIO.write(contig, sys.stdout, 'fasta') 74 | n_pass += 1 75 | else: 76 | names_fail.append(contig.id) 77 | n_fail += 1 78 | if n_fail > 0: 79 | ns = ','.join(names_fail) 80 | else: 81 | ns = '' 82 | print(f'{fasta}\t{n_pass}\t{n_fail}\t{ns}', file = sys.stderr) 83 | 84 | if __name__ == '__main__': 85 | main() -------------------------------------------------------------------------------- /shell/download_ena_pe_reads.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # A wrapper for downloading paired-end read sets from the ENA database using ena-file-downloader (github.com/enasequence/ena-ftp-downloader) 3 | # Copyright (C) 2021-2023 Yu Wan 4 | # Licensed under the GNU General Public Licence version 3 (GPLv3) . 
# Publication: 29 Mar 2021; latest update: 3 Aug 2023

# Guidance #########################
display_usage() {
    echo "
Download read files from the ENA database using ena-file-downloader. Please ensure java is accessible in your working environment.
Command:
    download_ena_pe_reads.sh -d=[path to ena-file-downloader.jar] -t=[a TSV file with two columns: isolate names, read accessions] -o=[Output directory]
Example commands:
    download_ena_pe_reads.sh -d=\"\$HOME/bin/ena-file-downloader.jar\" -t=\"readsets.tsv\" -o=\"\$PWD\" 1> download_ENA_reads.log 2> download_ENA_reads.err
"
}

if [ -z "$1" ] || [ "$1" == "-h" ]; then
    display_usage
    exit 0
fi

# Functions ####################
# Create directory $1 if it does not exist yet.
check_dir() {
    if [ ! -d "$1" ]; then
        echo "Create directory $1"
        mkdir -p "$1"
    fi
}

# Download one paired-end readset and rename the two files after the isolate.
download_reads() {
    p="$1" # The downloader program
    i="$2" # Isolate name
    a="$3" # ENA accession
    tmp_dir="reads_fastq/$a" # Output directory
    if [ -d "$tmp_dir" ]; then
        echo "Warning: existing temporary directory $tmp_dir is deleted." >&2
        rm -rf "$tmp_dir"
    fi
    java -jar "$p" --accessions="$a" --format=READS_FASTQ --location=$PWD --protocol=FTP --asperaLocation=null # A directory 'reads_fastq' and a subdirectory "reads_fastq/$j" are created by this command.
    r1="$tmp_dir/${a}_1.fastq.gz"
    r2="$tmp_dir/${a}_2.fastq.gz"
    if [ -f "$r1" ] && [ -f "$r2" ]; then
        mv "$r1" "${i}_1.fastq.gz"
        mv "$r2" "${i}_2.fastq.gz"
        echo "Successfully downloaded paired-end readset of isolate $i (accession: $a)."
    else
        echo "Error: paired-end readset of isolate $i (accession: $a) could not be downloaded." >&2
        echo "Files in directory ${tmp_dir}:" >&2
        ls -1 "${tmp_dir}" >&2
    fi
    rmdir "$tmp_dir"
    sleep 1 # be gentle with the ENA servers between requests
}

# Main #########################
# Read arguments
for i in "$@"; do
    case "$i" in
        -t=*)
            accessions="${i#*=}"
            ;;
        -o=*)
            outdir="${i#*=}"
            ;;
        -d=*)
            downloader="${i#*=}"
            ;;
        *)
            ;;
    esac
done

# Check whether the downloader is accessible
# Fix: quote "$downloader" — unquoted, an unset value made this test always true
# and a path with spaces broke it.
if [ ! -f "$downloader" ]; then
    echo "Error: ena-file-downloader.jar is not accessible at location $downloader" >&2
    exit 1
fi

# Check accession file
if [ ! -f "$accessions" ]; then
    echo "Error: the TSV file of accession numbers is not found." >&2
    exit 1
fi

# Set up the output directory
if [ ! -z "$outdir" ]; then
    check_dir "$outdir"
else
    echo "Error: $outdir was not found." >&2
    exit 1
fi

# Download reads
cd "$outdir" || exit 1 # Fix: abort instead of downloading into the wrong directory when cd fails

while read line; do # Read through the input TSV file line-by-line
    if [ ! -z "$line" ]; then
        IFS=$'\t' read -r -a fields <<< "$line"
        download_reads "$downloader" "${fields[0]}" "${fields[1]}" # Downloader, isolate name, ENA accession
    fi
done < "$accessions" # Expect a file name as an input
# Fix: remove the downloader's top-level directory ONCE, after the loop. It was
# previously inside the loop and therefore ran on every input line, emitting
# "No such file or directory" errors on blank lines and before any download.
rmdir reads_fastq # A directory created by the downloader; directory 'logs' (also created by the downloader) is left in the output directory.
5 | # GitHub: https://github.com/wanyuac/BINF_toolkit 6 | # Input: a fasta file which contains multiple sequences from the standard input 7 | # Output: for each sequence, print: 1) header 2) total sequence length 3) percentage of G+C 4) entropy of the sequence 8 | # Command line: python gc.py < filename.fasta 9 | # Treatment of the extended alphabet: 10 | # 1) consider all of 15 characters 11 | # 2) construct a weighted-count table using dictionary 12 | # 3) for each character in the table, take the probability of being A, G, C or T as effective counts 13 | # 4) counts for A, G, C and T is computed by adding up the vectors for every character read from the sequence. 14 | # Licence: GNU GENERAL PUBLIC LICENSE 2.0 15 | 16 | import sys 17 | import math 18 | 19 | alphabet = { # weighted-count table 20 | # A G C T 21 | 'A': [1, 0, 0, 0], 22 | 'G': [0, 1, 0, 0], 23 | 'C': [0, 0, 1, 0], 24 | 'T': [0, 0, 0, 1], 25 | 'S': [0, 0.5, 0.5, 0], 26 | 'W': [0.5, 0, 0, 0.5], 27 | 'R': [0.5, 0.5, 0, 0], 28 | 'Y': [0, 0, 0.5, 0.5], 29 | 'M': [0.5, 0, 0.5, 0], 30 | 'K': [0, 0.5, 0, 0.5], 31 | 'V': [0.33, 0.33, 0.33, 0], 32 | 'H': [0.33, 0, 0.33, 0.33], 33 | 'D': [0.33, 0.33, 0, 0.33], 34 | 'B': [0, 0.33, 0.33, 0.33], 35 | 'N': [0.25, 0.25, 0.25, 0.25] 36 | } 37 | 38 | def read_fasta (fasta): 39 | header = [] # starts from 0 40 | s = 'start' 41 | seq = [] 42 | for line in fasta: 43 | line = line.rstrip('\n') # remove '\n' at the end 44 | if line.startswith('>'): # find a new sequence 45 | header.append(line) # Do not use header = ... here because .append() returns None 46 | seq.append(s.upper()) # Append the last row of last sequence, note that the first element is ''. 
47 | s = '' # reset s 48 | else: 49 | s = s + line.upper() 50 | seq.append(s) # append the last string to list seq after the loop 51 | seq = seq[1:] # remove the first element 52 | list = [header, seq] 53 | return list 54 | 55 | def base_count (seq): 56 | # A G C T 57 | num = [0, 0, 0, 0] # numbers of A, G, C, T 58 | L = len(seq) 59 | for i in range(0, L): 60 | num = [sum(j) for j in zip(num, alphabet[seq[i]])] # addition of vectors: element by element 61 | return num 62 | 63 | def GC_content (num, seq_len): 64 | GC = float(num[1] + num[2]) # sum of the numbers of G and C in the list num. 65 | return GC / seq_len 66 | 67 | def entropy (num, seq_len): 68 | # P(A) P(G) P(C) P(T) 69 | p = [0, 0, 0, 0] # initiate a list 70 | H = 0 # entropy 71 | y = 0 72 | for i in range(0, 4): 73 | p[i] = float(num[i]) / seq_len # estimates the probability by frequency 74 | if p[i] == 0: # cannot take log0 75 | y = 0 # lim(x^x) = 0 when x -> 0 76 | else: 77 | y = p[i] * math.log(p[i], 2) 78 | H = H - y 79 | return H 80 | 81 | #/////////////// Main program ///////////////////// 82 | # read from the standard input 83 | content = sys.stdin.readlines() # including '\n' 84 | fasta = read_fasta(content) # fasta[0]: headers, fasta[1]: sequences 85 | header = fasta[0] 86 | seq = fasta[1] 87 | n = range(0, len(header)) # number of sequences 88 | for i in n: 89 | s = seq[i] 90 | L = len(s) 91 | num = base_count(s) 92 | print header[i] # output 1: headers 93 | print L # output 2: the length of the sequence 94 | print '%04.2f'%(GC_content(num, L) * 100) # output 3: the G+C content of this sequence 95 | print '%02.1f'%entropy(num, L) # output 4: the entropy -------------------------------------------------------------------------------- /run_CutAdapt.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This script runs CutAdapt for a list of paired-end readsets. 
3 | 4 | Author: Yu Wan (wanyuac@gmail.com) 5 | Edition history: 15 Dec 2015, 1-2 Jan 2016 6 | Licence: GNU GPL 2.0 7 | ''' 8 | 9 | import os, re 10 | from argparse import ArgumentParser 11 | 12 | MEMORY = "2048" # 2 GB for each job 13 | WALL_TIME = "1-0:0:0" 14 | 15 | def parse_args(): 16 | parser = ArgumentParser(description= "Run CutAdapt to remove adapter sequences from reads.") 17 | parser.add_argument("--reads", type = str, required = True, help = "A list of paired-end readsets") 18 | parser.add_argument("--f_adapter", type = str, required = True, help = "Adapter sequences of forward reads") 19 | parser.add_argument("--r_adapter", type = str, required = True, help = "Adapter sequences of reverse reads") 20 | parser.add_argument("--pattern", type = str, required = False, default = "\d\d\d\d_\d#\d*", help = "A regular expression for pulling out sample names") 21 | parser.add_argument("--side", type = str, required = False, default = "3'", help = "3'-end adapters or 5'-end adapters?") 22 | parser.add_argument("--len", type = str, required = False, default = "108", help = "Minimun read length") 23 | parser.add_argument("--overlap", type = str, required = False, default = "33", help = "Minimun overlap length between an adapter and a read") 24 | parser.add_argument("--discrep", type = str, required = False, default = "0.03", help = "The discrepancy rate between the reference adapter sequences and subjects") 25 | parser.add_argument("--outdir", type = str, required = True, default = ".", help = "The directory for outputs") 26 | return parser.parse_args() 27 | 28 | def submit_jobs(readsets, adapt_f, adapt_r, side, outdir, min_read_len, discrep, overlap): 29 | for sample, reads in readsets.iteritems(): 30 | cmd = '#!/bin/bash' 31 | cmd += '\n#SBATCH -p main' 32 | cmd += '\n#SBATCH --job-name=CutAdapt' 33 | cmd += '\n#SBATCH --ntasks=1' 34 | cmd += '\n#SBATCH --mem-per-cpu=' + MEMORY 35 | cmd += '\n#SBATCH --time=' + WALL_TIME 36 | cmd += '\ncd ' + outdir + '\n' 37 | if 
side == "3'": 38 | cutadapt_cmd = 'cutadapt -a file:' + adapt_f + ' -A file:' + adapt_r + ' --minimum-length ' + min_read_len + ' -e ' + discrep + \ 39 | ' --overlap ' + overlap + ' -o ' + sample + '_1.fastq.gz' + ' -p ' + sample + '_2.fastq.gz' + ' ' + reads[0] + ' ' + reads[1] 40 | else: 41 | cutadapt_cmd = 'cutadapt -g file:' + adapt_f + ' -G file:' + adapt_r + ' --minimum-length ' + min_read_len + ' -e ' + discrep + \ 42 | ' --overlap ' + overlap + ' -o ' + sample + '_1.fastq.gz' + ' -p ' + sample + '_2.fastq.gz' + ' ' + reads[0] + ' ' + reads[1] 43 | cmd += cutadapt_cmd + ' > ' + sample + '.log' 44 | #print cmd 45 | print cutadapt_cmd 46 | os.system("echo '" + cmd + "' | sbatch") 47 | 48 | def load_reads(f, pattern): 49 | readsets = {} 50 | samples = [] 51 | with open(f, "rU") as inputs: 52 | lines = inputs.read().splitlines() 53 | 54 | # initialises the dictionary 55 | for line in lines: 56 | samples.append(re.findall(pattern, line)[0]) 57 | samples = sorted(list(set(samples))) # remove redundancy and sort the list 58 | for sample in samples: 59 | readsets[sample] = ["", ""] 60 | 61 | # matches paired-end read sets to samples 62 | for line in lines: 63 | sample = re.findall(pattern, line)[0] 64 | orentation = int(re.findall("_\d\.", line)[0][1]) # e.g. "_1." 
=> 1 65 | readsets[sample][orentation - 1] = line 66 | 67 | return(readsets) 68 | 69 | def main(): 70 | args = parse_args() 71 | readsets = load_reads(args.reads, args.pattern) # generates a dictionary {sample:[read1, read2],...} 72 | submit_jobs(readsets, args.f_adapter, args.r_adapter, args.side, args.outdir, args.len, args.discrep, args.overlap) # submit a job for each pair of readsets 73 | 74 | if __name__ == '__main__': 75 | main() 76 | -------------------------------------------------------------------------------- /mergeGenomicRegions.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Merge a list of genomic regions and find out complementary regions afterwards. 3 | # Commandline: Rscript mergeGenomicRegions.R [input file] [genome length] [output prefix] 4 | # The input file is a CSV file comprised of two columns ("from" and "to") without a header. 5 | # This script assumes that there are always >= 2 regions in any coordinate tables. 6 | # A typical input is the coordinate file produced by the script filterCoords.py of RedDog. 7 | # Example: 8 | # Rscript mergeGenomicRegions.R coord.txt 5248520 9 | # Outputs: 10 | # (1) [input filename]__merged.csv, (2) [input filename]__comple.csv 11 | # Saved under the current working directory. 12 | # 13 | # Copyright 2017 Yu Wan 14 | # Licensed under the Apache License, Version 2.0 15 | # Edition: 30 Apr 2017 16 | 17 | mergeRegions <- function(x) { 18 | y <- x[1, ] # take the first row of x to start with 19 | j <- 1 # row pointer of y 20 | ub1 <- y$to[1] # upper bound of the current region 21 | for (i in 2 : nrow(x)) { # must guarantee there are >= 2 rows in the table 22 | z <- x[i, ] 23 | lb2 <- z$from[1] # lower bound of the new region 24 | ub2 <- z$to[1] 25 | # There are only two behavious: either merge two regions or adding a separate region. 26 | if (lb2 <= (ub1 + 1)) { # two regions overlap or adjacent: merge them into a single one. 
lb1 <= lb2 because the data frame is sorted in an ascending order. 27 | if (ub2 > ub1) { # extend the previous region 28 | y$to[j] <- ub2 29 | ub1 <- ub2 30 | } # else, do nothing as the second range is a subset of the first one 31 | } else { # push a new and non-overlapping region into the stack of regions 32 | y <- rbind(y, z) 33 | j <- j + 1 # move the point to the new row 34 | ub1 <- ub2 35 | } 36 | } 37 | 38 | return(y) 39 | } 40 | 41 | findComplementaryRegions <- function(x, L) { # L: genome size 42 | n <- nrow(x) # number of predefined regions 43 | r <- x[1, ] 44 | lb <- r$from[1] 45 | ub <- r$to[1] 46 | 47 | # initialise z 48 | if (lb > 1) { # if the first region is not at the start of the genome 49 | z <- data.frame(from = 1, to = lb - 1) # lb - 1 may equal s 50 | } else { 51 | z <- data.frame(from = integer(0), to = integer(0)) 52 | } 53 | s <- ub + 1 54 | 55 | for (i in 2 : n) { # must guarantee there are >= 2 rows in the table 56 | r <- x[i, ] 57 | lb <- r$from[1] 58 | ub <- r$to[1] 59 | # Notice the function mergeRegions guarantees that lb > s and lb - s >= 1. 60 | # So the width of any gaps >= 1. 61 | z <- rbind(z, data.frame(from = s, to = lb - 1)) # Notice lb - 1 may equal s 62 | s <- ub + 1 63 | } 64 | 65 | # Are there any bases left beyond the last predefined region? 
66 | if (s <= L) { 67 | z <- rbind(z, data.frame(from = s, to = L)) 68 | } 69 | 70 | return(z) 71 | } 72 | 73 | # Read arguments 74 | args <- commandArgs(trailingOnly = TRUE) 75 | input <- args[1] 76 | genome.len <- args[2] 77 | prefix <- ifelse(length(args) >= 3, args[3], "coords") 78 | 79 | if (file.exists(input)) { 80 | x <- read.csv(input, header = FALSE) 81 | } else { 82 | stop(paste("The input file", input, "is not found.", sep = " ")) 83 | } 84 | 85 | names(x) <- c("from", "to") 86 | 87 | # Sort beginnings of regions so that their heads do not go backwards 88 | x <- x[order(x$from, decreasing = FALSE), ] 89 | 90 | # Merge regions 91 | y <- mergeRegions(x) 92 | 93 | # Find out complementary regions 94 | z <- findComplementaryRegions(y, genome.len) 95 | 96 | # write results 97 | write.table(y, file = paste0(prefix, "__merged.csv"), col.names = FALSE, row.names = FALSE, quote = FALSE, sep = ",") # Merged regions 98 | write.table(z, file = paste0(prefix, "__comple.csv"), col.names = FALSE, row.names = FALSE, quote = FALSE, sep = ",") # Complementary regions 99 | 100 | # sumamrise regions 101 | y$base <- y$to - y$from + 1 102 | z$base <- z$to - z$from + 1 103 | 104 | print(paste(nrow(x), "regions have been merged into", nrow(y), "regions of", sum(y$base), "bases.", sep = " ")) 105 | print(paste("There are", nrow(z), "regions of", sum(z$base), "bases outside of the merged regions.", sep = " ")) 106 | -------------------------------------------------------------------------------- /screen_genes_blast.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script screens genes against a reference database using megaBLAST for every input FASTA file. 3 | Specifically, it takes as input a list of FASTA files and searches every DNA sequence against the reference database. Obviously, 4 | it performs a targeted analysis for input sets of DNA sequences. 
5 | For example, you may want to find out all resistance genes in every bacterial genome, for which you may create FASTA files of 6 | coding sequences for every genome and use this script to profile this kind of genes. 7 | 8 | Number of options: 6 (2 compulsory and 4 optional) 9 | Usage: 10 | python screen_genes_blast.py --in *.fna --db [reference database] --strains [a comma-delimited string of strain names] 11 | --genomes [a comma-delimited string of genome names] --opt [options and arguments for BLAST] 12 | --outfmt [output format code] > [output file name] 13 | 14 | Prerequisite: A BLAST nucleotide database should be made before using this script. 15 | makeblastdb -in your.fasta -dbtype nucl -out db_name -logfile your.log 16 | 17 | Options "--strains" and "--genomes" are optional. 18 | 19 | A spreadsheet can be created beforehand to ensure the strain name and the genome name to match each FASTA file: 20 | strain genome fasta_file 21 | AH0650_Sm1 chr chr.fna 22 | AH0650_Sm1 plasmid plasmid.fna 23 | 24 | Author: Yu Wan (wanyuac@gmail.com, https://github.com/wanyuac) 25 | Development history: 3 July 2016 26 | Python version: 2.7.10 27 | License: GNU GPL 2.1 28 | """ 29 | 30 | from argparse import ArgumentParser 31 | import sys, os, subprocess 32 | 33 | def parse_arguments(): 34 | # read arguments of options 35 | parser = ArgumentParser(description="Fix problems in SRST2's ARG-Annot database") 36 | parser.add_argument("--in", "-i", dest = "input", nargs = "+", type = str, required = True, default = "", help = "A list of input FASTA files") 37 | parser.add_argument("--db", "-d", dest = "db", type = str, required = True, default = "", help="A reference nucleotide database for BLAST") 38 | parser.add_argument("--strains", "-s", dest = "strains", type = str, required = False, default = "", help = "(optional) Comma-delimited names of bacterial strains") 39 | parser.add_argument("--genomes", "-g", dest = "genomes", type = str, required = False, default = "", help = 
"(optional) Comma-delimited genome names") 40 | parser.add_argument("--opt", "-o", dest = "opt", type = str, required = False, default = "-evalue 0.001 -max_target_seqs 2 -perc_identity 98",\ 41 | help = "Options and argument passed to BLAST") 42 | parser.add_argument("--outfmt", "-f", dest = "outfmt", type = str, required = False,\ 43 | default = "6 qseqid sseqid qstart qend sstart send qlen slen length bitscore pident qcovs gaps evalue",\ 44 | help = "The configuration of the 'outfmt' option for BLAST") 45 | return parser.parse_args() 46 | 47 | def main(): 48 | args = parse_arguments() 49 | 50 | n_fasta = len(args.input) 51 | 52 | # parse strain information 53 | if args.strains != "": 54 | strains = args.strains.split(",") 55 | n_str = len(strains) 56 | else: 57 | strains = None 58 | n_str = 0 59 | 60 | # parse genome information 61 | if args.genomes != "": 62 | genomes = args.genomes.split(",") 63 | n_gen = len(genomes) 64 | else: 65 | genomes = None 66 | n_gen = 0 67 | 68 | # check whether strains, genomes and files match 69 | if n_str != n_fasta: 70 | sys.exit("Error: strain number is not equal to the number of FASTA files.") 71 | 72 | if n_gen != n_fasta: 73 | sys.exit("Error: genome number is not equal to the number of FASTA files.") 74 | 75 | # get column names of the output file 76 | colnames = args.outfmt.split(" ")[1 : ] # remove the first element -- the format id 77 | 78 | # print the header line to the stdout 79 | if n_gen > 0: 80 | colnames = ["genome"] + colnames 81 | if n_str > 0: 82 | colnames = ["strain"] + colnames 83 | print "\t".join(colnames) 84 | 85 | # search every set of query sequences against the reference database 86 | i = 0 # the counter of FASTA files 87 | for fasta in args.input: 88 | cmd = ["blastn", "-task", "megablast", "-db", args.db, "-query", fasta] + \ 89 | args.opt.split(" ") + ["-outfmt", args.outfmt] # Each pair of the option and its argument must be separated as elements of a list. 
90 | proc = subprocess.Popen(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE) 91 | out = proc.communicate() # obtain the output of BLAST from the standard output 92 | hits = out[0].splitlines() # stderr: out[1] 93 | 94 | # print all lines in the current output 95 | for line in hits: 96 | if n_gen > 0: 97 | line = genomes[i] + "\t" + line # add the genome name to each line 98 | if n_str > 0: 99 | line = strains[i] + "\t" + line # add the strain name to each line 100 | print line 101 | i += 1 102 | 103 | if __name__ == "__main__": 104 | main() 105 | -------------------------------------------------------------------------------- /linkPEreadsets.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Creating symbolic links according to a table of five columns: sample name, input read directory, R1 read file, R2 read file, action (Copy or Link), 4 | separated by tab characters. 5 | Warning: existing symbolic links and files will be replaced by new links or files if --update/-u is flagged. 6 | Dependencies: Python 3, bash environment 7 | 8 | Copyright (C) 2021 Yu Wan 9 | Licensed under the GNU General Public Licence version 3 (GPLv3) . 
10 | Creation: 6 Oct 2021; the latest update: 15 Oct 2021 11 | """ 12 | 13 | import os 14 | import sys 15 | import subprocess 16 | from argparse import ArgumentParser 17 | 18 | 19 | def parse_arguments(): 20 | parser = ArgumentParser(description = "Create symbolic links for paired-end read sets") 21 | parser.add_argument("--tsv", "-t", dest = "tsv", type = str, required = True, help = "A tab delimited, headerless file of five columns: sample name, input read directory, R1 read file, R2 read file, action (Copy or Link)") 22 | parser.add_argument("--outdir", "-o", dest = "outdir", type = str, required = "True", help = "Path to the output directory (without the forward slash at the end) in which symbolic links will be created or files will be copied into") 23 | parser.add_argument("--update", "-u", dest = "update", action = "store_true", help = "Update existing links or override existing files using the new read files. Default: skip existing links or files.") 24 | parser.add_argument("--R", "-R", dest = "R", action = "store_true", help = "Create symbolic links with suffices _R[1,2].fastq.gz rather than the default _[1,2].fastq.gz") 25 | return parser.parse_args() 26 | 27 | 28 | def main(): 29 | params = parse_arguments() 30 | 31 | with open(params.tsv, "r") as f: 32 | readsets = f.read().splitlines() 33 | 34 | outdir = params.outdir 35 | update = params.update 36 | 37 | if params.R: 38 | suffix_r1 = "_R1.fastq.gz" 39 | suffix_r2 = "_R2.fastq.gz" 40 | else: 41 | suffix_r1 = "_1.fastq.gz" 42 | suffix_r2 = "_2.fastq.gz" 43 | 44 | if not os.path.exists(outdir): 45 | os.mkdir(outdir) 46 | 47 | line_num = 0 48 | readset_num = 0 49 | for r in readsets: 50 | line_num += 1 51 | try: 52 | i, in_dir, r1, r2, action = r.split("\t") 53 | proceed = True 54 | except ValueError: 55 | print(f"Warning: Line {line_num} '{r}' does not contain the five values required. 
Skip this line.", file = sys.stderr) 56 | proceed = False 57 | if proceed: 58 | r1 = os.path.join(in_dir, r1) 59 | r2 = os.path.join(in_dir, r2) 60 | t1 = os.path.join(outdir, i + suffix_r1) 61 | t2 = os.path.join(outdir, i + suffix_r2) 62 | if action == "Link": 63 | readset_num += create_link(t1, r1, update) + create_link(t2, r2, update) 64 | else: 65 | readset_num += copy_file(t1, r1, update) + copy_file(t2, r2, update) 66 | print(f"Symbolic links or files were created for {readset_num} read files.") 67 | return 68 | 69 | 70 | def create_link(t, r, u): 71 | if os.path.exists(r): 72 | if os.path.exists(t): 73 | if u: # Replace existing symbolic links and files with new symbolic links 74 | if os.path.islink(t): 75 | print(f"Updating link {t}") 76 | subprocess.run(["unlink", t]) 77 | else: 78 | print(f"Warning: {t} is an existing file. It is deleted and a symbolic link is created with {r}.", file = sys.stderr) 79 | subprocess.run(["rm", t]) 80 | subprocess.run(["ln", "-s", r, t]) 81 | c = 1 82 | else: 83 | print(f"Warning: skip existing link/file {t}.", file = sys.stderr) 84 | c = 0 85 | else: 86 | subprocess.run(["ln", "-s", r, t]) 87 | c = 1 88 | else: 89 | print(f"Error: read file {r} does not exist. So link {t} was not created.", file = sys.stderr) 90 | c = 0 91 | return c 92 | 93 | 94 | def copy_file(t, r, u): 95 | if os.path.exists(r): 96 | if os.path.exists(t): 97 | if u: # Replace existing links and files with copied files 98 | if os.path.islink(t): 99 | print(f"Warning: target {t} is a symbolic link. 
Remove this link and copy file {r}.", file = sys.stderr) 100 | subprocess.run(["unlink", t]) 101 | else: 102 | print(f"Warning: overrode existing file {t} with file {r}.", file = sys.stderr) 103 | subprocess.run(["rm", t]) 104 | subprocess.run(["cp", r, t]) 105 | c = 1 106 | else: 107 | print(f"Warning: skip existing link/file {t}.", file = sys.stderr) 108 | c = 0 109 | else: 110 | subprocess.run(["cp", r, t]) 111 | c = 1 112 | else: 113 | print(f"Error: read file {r} does not exist, so it is not copied.", file = sys.stderr) 114 | c = 0 115 | return c 116 | 117 | 118 | if __name__ == "__main__": 119 | main() -------------------------------------------------------------------------------- /gbk2tsv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | """ 4 | Convert GenBank files to tab-delimited text files (*.tsv). Every GenBank file must contain the locus_tag qualifier and may 5 | contain multiple contigs (LOCUS). 6 | 7 | Usage: 8 | python gbk2tsv.py --gbk 1.gbk --outdir . --features "CDS,rRNA,tRNA" --nucl_seq --prot_seq 9 | python gbk2tsv.py --gbk 1.gbk 2.gbk 3.gbk --outdir . --features "CDS,rRNA,tRNA" --nucl_seq --prot_seq 10 | python gbk2tsv.py --gbk $(ls *.gbk) --outdir . --features "CDS,rRNA,tRNA" --nucl_seq --prot_seq 11 | 12 | An example showing columns in every output file: 13 | Contig Locus Feature Start End Strand Pseudo Product Gene Nucl_seq Prot_seq 14 | Contig_1 locus_tag_1 CDS 1 2200 + N dehydrogenase I unknown ... ... 15 | Contig_1 locus_tag_2 CDS 2230 3100 - Y homoserine kinase unknown ... ... 16 | ... 
17 | 18 | Dependency: BioPython 19 | Python versions 2 and 3 compatible 20 | Copyright 2019 Yu Wan (wanyuac@sina.cn) 21 | Licensed under the Apache License, Version 2.0 22 | First version: 13 Sep 2019 (Happy Mid-Autumn Festival) 23 | Latest update: 3 Oct 2025 24 | """ 25 | 26 | 27 | from __future__ import print_function 28 | from __future__ import division 29 | import os 30 | import sys 31 | import glob 32 | from Bio import SeqIO, SeqFeature 33 | from argparse import ArgumentParser 34 | 35 | 36 | def parse_args(): 37 | parser = ArgumentParser(description = "Convert GenBank files to tab-delimited text files") 38 | parser.add_argument("-g", "--gbk", nargs = "+", type = str, required = True, dest = "gbks", default = "", help = "Input GenBank files") 39 | parser.add_argument("-o", "--outdir", type = str, required = False, dest = "outdir", default = ".", help = "Output directory (no backslash or forward slash)") 40 | parser.add_argument("-f", "--features", type = str, required = False, dest = "features", default = "CDS,tRNA,rRNA", help = "Comma-separated features to store (default CDS,tRNA,rRNA)") 41 | parser.add_argument("-n", "--nucl_seq", action = "store_true", required = False, dest = "nucl_seq", help = "Turn on this option to print nucleotide sequences of features") 42 | parser.add_argument("-p", "--prot_seq", action = "store_true", required = False, dest = "prot_seq", help = "Turn on this option to print protein sequences of CDS") 43 | 44 | return parser.parse_args() # An instance of the class ArgumentParser 45 | 46 | 47 | def main(): 48 | args = parse_args() 49 | gbk_list = get_input_filenames(args.gbks) 50 | if args.outdir and not os.path.exists(args.outdir): # The first logical condition becomes "false" if args.outdir = "". 
51 | os.makedirs(args.outdir) 52 | 53 | if (len(gbk_list) == 0): 54 | sys.exit("Invalid --gbk argument: no GenBank file is found.") 55 | 56 | header = ["Contig", "Locus", "Feature", "Start", "End", "Strand", "Pseudo", "Product", "Gene"] 57 | if args.nucl_seq: 58 | header += ["Nucl_seq"] 59 | if args.prot_seq: 60 | header += ["Prot_seq"] 61 | 62 | features = args.features.split(",") # Features of interest 63 | if len(features) == 0: 64 | sys.exit("Invalid --features argument: there is no feature to be extracted.") 65 | 66 | for gbk in gbk_list: 67 | tsv_name = os.path.join(args.outdir, os.path.splitext(os.path.basename(gbk))[0] + ".tsv") # Define the current output filename: pwd/1.gbk -> pwd/1.tsv 68 | tsv = open(tsv_name, "w") 69 | tsv.write("\t".join(header) + "\n") # Write the header line 70 | records = list(SeqIO.parse(gbk, "genbank")) # Read a GenBank file from the standard input and convert it into a list of SeqRecord objects 71 | for r in records: # Each record (r) is a contig with a unique LOCUS name in the GenBank file. 72 | contig = r.name # LOCUS name 73 | for f in r.features: # Iterate through every feature of the current contig. 
74 | feature_type = f.type 75 | if feature_type in features: 76 | # Fetch the locus_tag 77 | if "locus_tag" in f.qualifiers: 78 | locus_tag = f.qualifiers["locus_tag"][0] 79 | else: 80 | locus_tag = "unnamed" 81 | 82 | # Determine which DNA strand the current feature is located in 83 | if f.location.strand == 1: 84 | strand = "+" 85 | else: 86 | strand = "-" 87 | 88 | # Determine whether the current gene is pesudo 89 | if "pseudo" in f.qualifiers or "pseudogene" in f.qualifiers: 90 | is_pesudo = "Y" # Yes 91 | else: 92 | is_pesudo = "N" # No 93 | 94 | # Determine the product name 95 | if "product" in f.qualifiers: 96 | product = f.qualifiers["product"][0] 97 | else: 98 | product = "unknown" 99 | 100 | if "gene" in f.qualifiers: 101 | gene = f.qualifiers["gene"][0] 102 | else: 103 | gene = "unknown" 104 | 105 | # Construct the line to be written into the output file 106 | line = [contig, locus_tag, f.type, str(f.location.start + 1), str(f.location.end), strand, is_pesudo, product, gene] 107 | if args.nucl_seq: 108 | line += [str(f.extract(r.seq))] 109 | if feature_type == "CDS" and args.prot_seq: 110 | if "translation" in f.qualifiers.keys(): 111 | line += [f.qualifiers["translation"][0]] 112 | else: # Pseudo genes may not have any translation. 
113 | line += ["unknown"] 114 | 115 | tsv.write("\t".join(line) + "\n") 116 | tsv.close() 117 | 118 | return 119 | 120 | 121 | def get_input_filenames(gbks): 122 | gbk_list = list(gbks) 123 | if len(gbk_list) == 1 and gbk_list[0].startswith("*"): # *.gbk 124 | gbk_list = glob.glob(os.path.join(".", gbk_list[0])) # Get names of all GenBank files under the current working directory 125 | 126 | return(gbk_list) 127 | 128 | 129 | if __name__ == "__main__": 130 | main() 131 | -------------------------------------------------------------------------------- /gbk2tbl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | """ 4 | This script converts a GenBank file (.gbk or .gb) from Stdin into a Sequin feature table (.tbl), which is an input file of tbl2asn used for creating an ASN.1 file (.sqn). 5 | 6 | Package requirement: BioPython and argparse 7 | 8 | Usage: 9 | Simple command: 10 | python gbk2tbl.py --mincontigsize 200 --prefix any_prefix --modifiers modifier_file.txt < annotation.gbk 11 | cat annotation.gbk | python gbk2tbl.py --mincontigsize 200 --prefix any_prefix --modifiers modifier_file.txt # integrate gbk2tbl into a pipeline 12 | Redirecting error messages to a text file (optional): 13 | python gbk2tbl.py --mincontigsize 200 --prefix any_prefix --modifiers modifier_file.txt < annotation.gbk 2> stderr.txt 14 | cat annotation.gbk | python gbk2tbl.py --mincontigsize 200 --prefix any_prefix --modifiers modifier_file.txt 2> stderr.txt 15 | Note that this script reads the GenBank file through the stdin ("< annotation.gbk") and you may want to redirect the stderr to a file via "> stderr.txt" (redirection). 16 | 17 | Inputs: 18 | A GenBank file, which ought to be passed to the script through the standard input (stdin). 19 | A modifier file: a plain text file containing modifiers for every FASTA definition line. 
20 | All FASTA header modifiers must be written in a single line and are separated by a space character. This line will 21 | be copied and directly printed along with the record name as the definition line of every contig sequence. 22 | No space should be placed besides the '=' sign. Check http://www.ncbi.nlm.nih.gov/Sequin/modifiers.html for choosing a proper format for modifiers. 23 | For example, the content of a modifier file can be (no tab character): 24 | [organism=Serratia marcescens subsp. marcescens] [sub-species=marcescens] [strain=AH0650_Sm1] [topology=linear] [moltype=DNA] [tech=wgs] [gcode=11] [country=Australia] [isolation-source=sputum] 25 | Furthermore, regarding the modifier 'topology': 26 | [topology=?]: the molecular topology (circular/linear) of the sequence if this information is not contained in records 27 | For contigs: linear (the default value) 28 | For finished genomes of plasmids and bacterial chromosomes: circular 29 | 30 | Outputs: 31 | any_prefix.tbl: the Sequin feature table 32 | any_prefix.fsa: the corresponding fasta file 33 | These files are inputs for tbl2asn which generates ASN.1 files (*.sqn). 34 | 35 | Arguments: 36 | --mincontigsize: the minimum contig size, default = 200 in accordance with NCBI's regulation 37 | --prefix: the prefix of output filenames, default = 'seq' 38 | --modifiers: the filename of the modifier file, default = 'modifiers.txt' 39 | 40 | Development notes: 41 | This script is derived from the one developed by SEQanswers users nickloman (https://gist.github.com/nickloman/2660685/genbank_to_tbl.py) and ErinL who modified nickloman's script and put it 42 | on the forum post (http://seqanswers.com/forums/showthread.php?t=19975). 43 | 44 | Author of this version: Yu Wan (wanyuac@gmail.com, github.com/wanyuac) 45 | Creation: 20 June 2015 - 11 July 2015; the latest edition: 21 October 2019 46 | 47 | Dependency: Python versions 2 and 3 compatible. 
48 | 49 | Licence: GNU GPL 2.1 50 | """ 51 | 52 | from __future__ import print_function 53 | import sys 54 | from Bio import SeqIO 55 | from argparse import ArgumentParser 56 | 57 | def parse_args(): 58 | # Extract arguments from the command line 59 | parser = ArgumentParser(description= 'Read arguments: species, strain, BioProject, prefix') 60 | parser.add_argument('--mincontigsize', type = int, required = False, default = 200, help = 'The minimum contig length') 61 | parser.add_argument('--prefix', type = str, required = False, default = 'seq', help = 'The prefix of output filenames') 62 | parser.add_argument('--modifiers', type = str, required = True, default = 'modifiers.txt', help = 'The text file containing a single line of FASTA head modifiers') 63 | return parser.parse_args() 64 | 65 | def read_modifiers(file): 66 | # This function only reads the first line of the modifier file. So please ensure that all modifiers are put in the first line. 67 | with open(file, 'rU') as f: 68 | s = f.readline() # only read once 69 | return s 70 | 71 | allowed_qualifiers = ['locus_tag', 'gene', 'product', 'pseudo', 'protein_id', 'gene_desc', 'old_locus_tag', 'note', 'inference', \ 72 | 'organism', 'mol_type', 'strain', 'sub_species', 'isolation-source', 'country', \ 73 | 'collection_date'] # In GenBank files, the qualifier 'collection-date' is written as 'collection_date'. 74 | ''' 75 | These are selected qualifiers because we do not want to see qualifiers such as 'translation', 'transl_table', or 'codon_start' in the feature table. 76 | Qualifiers 'organism', 'mol_type', 'strain', 'sub_species', 'isolation-source', 'country' belong to the feature 'source'. 
77 | ''' 78 | 79 | def main(): 80 | args = parse_args() # read arguments 81 | contig_num = 0 82 | fasta_fh = open(args.prefix + '.fsa', 'w') # the file handle for the fasta file 83 | feature_fh = open(args.prefix + '.tbl', 'w') # the file handle for the feature table 84 | modifiers = read_modifiers(args.modifiers) # read the modifiers from a text file 85 | records = list(SeqIO.parse(sys.stdin, 'genbank')) # read a GenBank file from the standard input and convert it into a list of SeqRecord objects 86 | 87 | for rec in records: # for every SeqRecord object in the list 'records' 88 | if len(rec) <= args.mincontigsize: # filter out small contigs 89 | print('skipping small contig %s' % (rec.id), file=sys.stderr) 90 | continue # start a new 'for' loop 91 | contig_num += 1 92 | print(rec.name) # print the contig name to STDOUT 93 | 94 | # write the fasta file 95 | rec.description = modifiers 96 | SeqIO.write([rec], fasta_fh, 'fasta') # Prints this contig's sequence to the fasta file. The sequence header will be rec.description. 97 | 98 | # write the feature table 99 | print('>Feature %s' % (rec.name), file = feature_fh) # write the first line of this record in the feature table: the LOCUS name 100 | for f in rec.features: 101 | # print the coordinates 102 | if f.strand == 1: 103 | print('%d\t%d\t%s' % (f.location.nofuzzy_start + 1, f.location.nofuzzy_end, f.type), file = feature_fh) 104 | else: 105 | print('%d\t%d\t%s' % (f.location.nofuzzy_end, f.location.nofuzzy_start + 1, f.type), file = feature_fh) 106 | 107 | if (f.type == 'CDS') and ('product' not in f.qualifiers): 108 | f.qualifiers['product'] = 'hypothetical protein' 109 | # print qualifiers (keys and values) 110 | for (key, values) in f.qualifiers.items(): 111 | ''' 112 | Apply the iteritems() method of the dictionary f.qualifiers for (key, values) pairs 113 | iteritems() is a generator that yields 2-tuples for a dictionary. It saves time and memory but is slower than the items() method. 
114 | ''' 115 | if key not in allowed_qualifiers: 116 | continue # start a new 'for' loop of f, skipping the following 'for' statement of v 117 | for v in values: # else, write all values under this key (qualifier's name) 118 | print('\t\t\t%s\t%s' % (key, v), file = feature_fh) 119 | fasta_fh.close() # finish the generation of the FASTA file 120 | feature_fh.close() # finish the generation of the feature table 121 | print(str(contig_num) + ' records have been converted.') 122 | 123 | # call the main function 124 | if __name__ == '__main__': 125 | main() 126 | -------------------------------------------------------------------------------- /parse_biosample.py: -------------------------------------------------------------------------------- 1 | """ 2 | Download records from the NCBI BioSample database and extract values of certain attributes. The main 3 | output file is a tab-delimited file, which can be readily imported to Excel. The column order of this 4 | file is determined by that of attribute names in the parameter '-a'. 5 | 6 | Usage: 7 | samples='SAMN0001,SAMN0002,SAMN0003,SAMN0004,SAMN0005' 8 | attr='strain,collection_date,geo_loc_name,host,host_disease,isolation_source' 9 | python parse_biosample.py -i $samples -a $attr -e 'xx@xx.xx' 10 | python parse_biosample.py -i 'file:accessions.txt' -a $attr -e 'xx@xx.xx' # One accession number a line in accession.txt 11 | 12 | Copyright 2020 Yu Wan 13 | Publication: 15 March 2020 14 | Licenced under GNU GPL 2.1. 
15 | """ 16 | 17 | from __future__ import print_function 18 | from Bio import Entrez 19 | from argparse import ArgumentParser 20 | import os 21 | import sys 22 | import time 23 | import xml.etree.ElementTree as xmlTree 24 | 25 | 26 | def parse_arguments(): 27 | parser = ArgumentParser(description = 'Download BioSample records and extract specific attributes') 28 | parser.add_argument('-i', type = str, required = True, help = 'Comma-delimited accessions or a file of a list of acceessions') 29 | parser.add_argument('-a', type = str, required = True, help = 'Comma-delimited names of attributes whose values will be extracted') 30 | parser.add_argument('-e', type = str, required = True, help = 'User\'s email address required for accessing the NCBI database') 31 | parser.add_argument('-o', type = str, required = False, default = 'metadata.tsv', help = 'Filename for extracted attribute values') 32 | parser.add_argument('-d', type = str, required = False, default = './record', help = 'Directory name for downloaded BioSample records') 33 | parser.add_argument('-r', action = 'store_true', required = False, help = 'Flag it to override existing XML files') 34 | parser.add_argument('-n', action = 'store_true', required = False, help = 'Flag it to not replace missing values with NAs') 35 | 36 | return parser.parse_args() 37 | 38 | 39 | def main(): 40 | args = parse_arguments() 41 | check_outdir(args.d) 42 | attributes = args.a.split(',') 43 | 44 | # Download BioSample records 45 | print('Start to download BioSample records.') 46 | records = download_records(accessions = get_accession_numbers(args.i), email = args.e, \ 47 | out_dir = args.d, override = args.r) # Create an XML file [acc].xml in the record directory 48 | 49 | # Parse the records 50 | print('\nStart to parse BioSample records.') 51 | f = open(args.o, 'w') # Create the output file 52 | f.write('\t'.join(['BioSample'] + attributes) + '\n') # Print the header 53 | for a, p in records.items(): # Accession number and XML 
path 54 | if p == None: 55 | continue 56 | else: 57 | extract_attributes(accession = a, xml_path = p, out_file = f, attrs = attributes, \ 58 | fill_null = not args.n) # By default, fill missing values with NAs. 59 | f.close() 60 | 61 | return 62 | 63 | 64 | def extract_attributes(accession, xml_path, out_file, attrs, fill_null): 65 | print("Parsing record %s:" % accession) 66 | xml = xmlTree.parse(xml_path).getroot() # Read and parse the XML file 67 | parental_domain = xml[0] # Tag: BioSample 68 | 69 | for d in parental_domain: 70 | if d.tag == 'Attributes': 71 | attr_dict = parse_attribute_domain(d) # Write attributes into the output file 72 | parsed_attrs = attr_dict.keys() # Note that not all attrs can always be found in parsed_attrs. 73 | new_line = [accession] 74 | for a in attrs: 75 | if a in parsed_attrs: 76 | v = attr_dict[a] 77 | if v == '' and fill_null: 78 | new_line.append('NA') 79 | else: 80 | new_line.append(v) 81 | else: 82 | print(' Warning: attribute %s is not found in record %s.' % (a, accession)) 83 | if fill_null: 84 | new_line.append('NA') 85 | else: 86 | new_line.append('') # A NULL space holder. 87 | out_file.write('\t'.join(new_line) + '\n') 88 | print(' Record %s has been successfully parsed.' % accession) 89 | 90 | return 91 | 92 | 93 | def parse_attribute_domain(d): 94 | """ 95 | Converts the 'Attributes' domain into a dictionary. 96 | """ 97 | attrs = {} 98 | for a in d: 99 | # Compared to display_name and attribute_name, harmonized_name is expected to be conserved across records. 100 | attrs[a.attrib['harmonized_name']] = a.text # For example: {isolation_source: urine} 101 | 102 | return attrs 103 | 104 | 105 | def download_records(accessions, email, out_dir, override): 106 | """ 107 | Download and save BioSample records as XML files, and return a dictionary {accession: file path}. 108 | Existing XML files are skipped by default, which saves lots of time. 
(Particularly when users re-run this script) 109 | """ 110 | paths = {} 111 | Entrez.email = email 112 | 113 | for a in accessions: 114 | xml_path = os.path.join(out_dir, a + '.xml') 115 | if os.path.exists(xml_path): 116 | if override: 117 | print('Download and override existing record %s.' % a) 118 | try: 119 | handle = Entrez.efetch(db = 'biosample', id = a, rettype = 'xml', retmode = 'text') # In XML format 120 | paths[a] = xml_path 121 | f = open(xml_path, 'w') 122 | f.write(handle.read()) # Save the current record as an XML file 123 | f.close() 124 | except: 125 | print(' Warning: record %s is not found. Skip this record.' % a) 126 | paths[a] = None 127 | time.sleep(1) 128 | else: 129 | print('Skip existing record %s.' % a) 130 | paths[a] = xml_path 131 | else: 132 | print('Download record %s.' % a) 133 | try: 134 | handle = Entrez.efetch(db = 'biosample', id = a, rettype = 'xml', retmode = 'text') # In XML format 135 | paths[a] = xml_path 136 | f = open(xml_path, 'w') 137 | f.write(handle.read()) # Save the current record as an XML file 138 | f.close() 139 | except: 140 | print(' Warning: record %s is not found. Skip this record.' % a) 141 | paths[a] = None 142 | time.sleep(1) 143 | 144 | return paths 145 | 146 | 147 | def get_accession_numbers(s): 148 | """ 149 | Returns a list of BioSample accession numbers 150 | """ 151 | PREFIX = 'file:' 152 | if s.startswith(PREFIX): 153 | s = s[len(PREFIX) : ] # Drop the prefix 154 | try: 155 | with open(s, 'r') as f: 156 | accs = f.read().splitlines() 157 | except: 158 | sys.exit('Error: accession file %s is not accessible.' % s) 159 | else: 160 | accs = s.split(',') 161 | print(' Altogether %i accession numbers have been imported.' % len(accs)) 162 | 163 | return accs 164 | 165 | 166 | def check_outdir(d): 167 | if os.path.exists(d): 168 | print('Output directory %s exists.' 
% d) 169 | else: 170 | os.system('mkdir ' + d) 171 | 172 | return 173 | 174 | 175 | if __name__ == '__main__': 176 | main() 177 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # BINF_toolkit 2 | This directory consists of scripts developed by Yu Wan for routine bioinformatic analysis. 3 | 4 | ## A list of scripts 5 | * [add_sample_name_FASTA.py](#add_sample\_name\_FASTA) 6 | * [downloadSeqFromNCBI.py](#download\_NCBI\_records) 7 | * [extractNuclRegionFromFASTA.py](#extract\_nucl\_region) 8 | * [gbk2tbl.py](#gbk2tbl) 9 | * [gbk2tsv.py](#gbk2tsv) 10 | * [gc.py](#gc) 11 | * [extractSeqFromGBK.py](#get\_gene\_seq) 12 | * [parse_ENA_sampleInfo_XML.py](#parse\_ENA\_sampleInfo\_XML) 13 | * [run_CutAdapt.py](#run_CutAdapt) 14 | * [filename_generator.py](#filename_generator) 15 | 16 | ## Manual 17 | ### add\_sample\_name\_FASTA.py 18 | This script appends a sample name at the beginning of each sequence in a FASTA file. For example, the header "\>g1 description" becomes "\>sample1__g1 description" after running this script. 19 | 20 | Command example: ```python add_sample_name_FASTA.py -i filename.txt (or filename.fna) -o output_dir -n``` 21 |
22 | 23 | ### downloadSeqFromNCBI.py 24 | This script takes as input a list of NCBI accession numbers (one for each line) from the STDIN and downloads corresponding entries (either GenBank files or FASTA files) under the target directory. 25 | 26 | **Examples** 27 | 28 | ```shell 29 | python downloadSeqFromNCBI.py --records "file:objects.txt" --format fasta --email xxx@xxx.com --suffix fna --outdir ./ref --skip > download.log 30 | 31 | python downloadSeqFromNCBI.py --records "NC_0001,NC_0002" --format genbank --email xxx@xxx.com --suffix gbk --outdir ./ref --skip > download.log 32 | ``` 33 | 34 | Type ```python downloadSeqFromNCBI.py -h``` or ```--help``` for help information. 35 | 36 | **Notes about options and option arguments** 37 | * --records: can be either a file (must contain a suffix of ".txt") listing targets to be downloaded, or a string of accession IDs separated by commas (no space is allowed). 38 | * --format or -f: the format of files to be downloaded 39 | * --suffix or -s: the file extension, can be "fasta" (default), "fna", "gb", or "gbk". No dot preceding the extension is needed. 40 | * --outdir or -o: output directory, no backslash at the end. 41 | 42 | An example of the input list: seq_list.txt. Note that accession IDs may not include version numbers such as ".1" (HG326223.1, CP011642). 43 | 44 | **References** 45 | 1. This script is inspired by Mark Schultz's (dr.mark.schultz@gmail.com, GitHub: schultzm) script "downloadGenbankByAccessions.py". 46 | 47 | 2. [A post on the BioStars forum](https://www.biostars.org/p/63506/). 48 | 49 | 3. Damien Farrell's blog post: [Retrieving genome assemblies via Entrez with Python](https://dmnfarrell.github.io/bioinformatics/assemblies-genbank-python). 50 | 51 |
52 | 53 | ### extractNuclRegionFromFASTA.py 54 | This script extracts a region of nucleotides by positions from a fasta file. 55 | 56 | **Arguments** 57 | -i: the path of the input file 58 | -n: the name of your selected contig 59 | -f: feature name specified by the user 60 | -s: the first nucleotide to be selected 61 | -e: the last nucleotide to be selected 62 | -o: the filename of the output 63 | 64 | **Requirements** 65 | * Only one genomic region should be selected; 66 | * the start and end positions should not spill out. 67 |
68 | 69 | ### gbk2tbl.py 70 | 71 | This script converts a GenBank file (.gbk or .gb) from Stdin into a Sequin feature table (.tbl), which is an input file of tbl2asn used for creating an ASN.1 file (.sqn). 72 | 73 | Package requirement: BioPython and argparse 74 | 75 | **Usage** 76 | ```shell 77 | python gbk2tbl.py --mincontigsize 200 --prefix any_prefix --modifiers modifier_file.txt < annotation.gbk 2> stderr.txt 78 | ``` 79 | 80 | Note that this script reads the GenBank file through the stdin ("\< annotation.gbk") and you may want to redirect the stderr to a file via "\> stderr.txt" (redirection). 81 | 82 | **Inputs** 83 | A GenBank file, which ought to be passed to the script through the standard input (stdin). 84 | 85 | A modifier file: a plain text file containing modifiers for every FASTA definition line. 86 | * All modifiers must be written in a single line and are separated by a single space character. 87 | * No space should be placed besides the '=' sign. Check [NCBI help](http://www.ncbi.nlm.nih.gov/Sequin/modifiers.html) for choosing a proper format for modifiers. 88 | * For example: a line "[organism=Serratia marcescens subsp. marcescens] [sub-species=marcescens] [strain=AH0650_Sm1] [topology=linear] [moltype=DNA] [tech=wgs] [gcode=11] [country=Australia] [isolation-source=sputum]" will be copied and printed along with the record name as the definition line of every contig sequence. 89 | 90 | **Outputs** 91 | * any_prefix.tbl: the Sequin feature table 92 | * any_prefix.fsa: the corresponding fasta file 93 | These files are inputs for tbl2asn which generates ASN.1 files (*.sqn). 94 | 95 | **Arguments** 96 | * --mincontigsize: the minimum contig size, default = 200 in accordance with NCBI's regulation 97 | * --prefix: the prefix of output filenames, default = 'seq' 98 | * --modifiers: the filename of the modifier file, default = 'modifiers.txt' 99 | 100 | **Demonstration** 101 | A test data set for this script is provided in the directory _example_. 
This data set is composed of a compressed GenBank file *NJST258\_1\_\_CP006923.gbk.gz* and a modifier file *gbk2tbl\_modifiers.txt*. Users can run the following command line to produce a TBL file as well as a FASTA file: 102 | 103 | ```shell 104 | zcat ./example/NJST258_1__CP006923.gbk.gz | python gbk2tbl.py --mincontigsize 200 --prefix Kp --modifiers gbk2tbl_modifiers.txt 105 | ``` 106 |
107 | 108 | ### gbk2tsv.py 109 | This script converts one or multiple GenBank files into tab-delimited feature tables (plain text), which can be imported to Excel or R afterwards. 110 | 111 | Relevant blog [post](https://microbialsystems.cn/en/post/gbk2tsv/). 112 |
113 | 114 | ### gc.py 115 | This program calculates the length, GC content, and entropy for each record in a multi-fasta file. 116 | 117 | Input: a fasta file which contains multiple sequences from the standard input 118 | 119 | Output: for each sequence, the script prints: the header line, total sequence length, (G+C)% and entropy of the input sequence. 120 | 121 | Command line: 122 | ```bash 123 | python gc.py < filename.fasta 124 | ``` 125 | 126 | Treatment of the extended alphabet in this script: 127 | 1. consider all of 15 characters 128 | 2. construct a weighted-count table using dictionary 129 | 3. for each character in the table, take the probability of being A, G, C or T as effective counts 130 | 4. counts for A, G, C and T is computed by adding up the vectors for every character read from the sequence. 131 |
132 | 133 | ### extractSeqFromGBK.py 134 | This script extracts gene sequences from a GenBank file, in accordance with a list of (locus_tag, feature type) tuples. 135 | 136 | Required module: Bio, argparse, csv 137 | 138 | **Usage** 139 | ```bash 140 | python extractSeqFromGBK.py --tags locus_tag.tsv --gb demo.gbk > genes.fna 141 | ``` 142 | 143 | **Inputs** 144 | 1. A GenBank file. 145 | 2. A text file listing selected locus_tags in the following format: locus_tag"\t"feature_type. This file MUST use ASCII codes because [the module csv/2.3 does not support Unicode inputs](https://docs.python.org/2/library/csv.html). 146 | 3. Allowed feature types are: CDS, tRNA, rRNA and tmRNA. For example: 147 | SMDB11_RS00910 rRNA
148 | SMDB11_RS21915 rRNA
149 | SMDB11_RS00015 CDS
150 | 151 | **Output** 152 | Nucleotide sequences in FASTA format with the header in the format: \>feature type|contig name|locus_tag|position|length|product 153 | 154 | **Warnings** 155 | 1. Although it is unlikely in a GenBank file, but please always ensure that there is no duplication of locus_tags in the table because this script treats locus_tag"s as keys for retrieving feature types. 156 | 2. An "IndexError: list index out of range" will arise if the tag list uses Unicode codes. 157 |
158 | 159 | ### parse_ENA_sampleInfo_XML.py 160 | This script parses an ENA metadata file in XML format and prints a subset of information. 161 | 162 | **Usage** 163 | 164 | ```bash 165 | python parse_ENA_sampleInfo_XML.py ERP000909.xml > samples.txt 166 | ``` 167 | 168 | Input: an XML file exported for a list of ERS accession numbers from ENA using the REST URLs API. For example, one can download an XML file for sample ERS086023 using the link [http://www.ebi.ac.uk/ena/data/view/ERS086023&display=xml](http://www.ebi.ac.uk/ena/data/view/ERS086023&display=xml). 169 | 170 | **Outputs** 171 | 172 | * tab-delimited text file containing information retrieved from the XML file. 173 | * study_accession, sample_accession, secondary_sample_accession, experiment_accession, run_accession, Isolate_ID, Host, Place_of_isolation, Year_of_isolation 174 |
175 | 176 | ### run_CutAdapt.py 177 | 178 | This script runs [CutAdapt](https://github.com/marcelm/cutadapt) for a list of paired-end readsets. 179 | 180 | Dependency: [slurm](http://slurm.schedmd.com) on a computational cluster (Linux OS) 181 |
182 | 183 | ### filename_generator.py 184 | This script generates a list of file names based on a list of strings. It is useful if you want to generate a list of file names for read sets from a list of bacterial strain names. 185 | 186 | Usage 187 | ```shell 188 | python filename_generator.py -i input_file -o output_file -p prefix -s suffix -f from -l to -pe 189 | ``` 190 | 191 | Input: a plain-text file consists of a list of filenames 192 | 193 | Example input files: (inlist.txt) 194 |     sample1\_\_genes\_\_results.txt 195 |     sample2\_\_genes\_\_results.txt 196 | 197 | Command 198 | ```shell 199 | python filename_generator.py -i inlist.txt -o outlist.txt -p /reads/ -s .fastq.gz -f 0 -l 7 -pe 200 | ``` 201 | 202 | Output: a list of new file names generated on the basis of strings in inlist.txt 203 | 204 | Example output items: (outlist.txt) 205 |     /reads/sample1\_1.fastq.gz 206 |     /reads/sample1\_2.fastq.gz 207 |     /reads/sample2\_1.fastq.gz 208 |     /reads/sample2\_2.fastq.gz 209 | -------------------------------------------------------------------------------- /extractSeqFromGBK.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | This script extracts nucleotide or protein sequences from a GenBank file in accordance with a list of 5 | (locus_tag/gene, feature type) tuples. 6 | 7 | Required modules: Python 3, BioPython, argparse, csv. 8 | 9 | Usage: 10 | python extractSeqFromGBK.py --targets [target file] --gbk [GenBank file(s)] > [output file name] 11 | python extractSeqFromGBK.py --targets [target file] --gbk [GenBank file(s)] --usegene > [output file name] 12 | python extractSeqFromGBK.py --targets [target file] --gbk [GenBank file(s)] --usegene --aa --extname > [output file name] 13 | 14 | Inputs 15 | 1. --gbk: A list of GenBank files. Each filename is a genome name, which will be parsed and put into output sequence headers. 16 | 2. 
--targets: 17 | (1) A tab-delimited text file (.tsv) listing selected locus_tags/gene names in the format: [tag]'\t'[feature_type]. 18 | (2) Single-target mode: an ID, preceded by a '^' sign and followed by a feature type with a colon as the delimiter. For example, '^repB:CDS'. 19 | 20 | Allowed feature types are: CDS, tRNA, rRNA, tmRNA. 21 | For example 22 | SMDB11_RS00910 rRNA 23 | SMDB11_RS21915 rRNA 24 | SMDB11_RS00015 CDS 25 | Or: 26 | gene1 CDS 27 | gene2 CDS 28 | gene3 CDS 29 | 30 | Output (to stdout) 31 | Nucleotide sequences in FASTA format with the header in the format: 32 | >[sequence ID] [gene name]|[Genome name]|[NCBI nucleotide accession or contig name]|[Coding strand (+/-)]|[Coordinates]|[Coordinate strand (+/-)]|[Locus tag]|[NCBI protein accession/NA]|[Product name] 33 | 34 | Example commands 35 | python extractSeqFromGBK.py --targets loci.tsv --gbk genome1.gb > genome1_genes.fna 36 | python extractSeqFromGBK.py --targets ^geneA:CDS --gbk *.gbk --usegene > genes.fna 37 | python extractSeqFromGBK.py --targets genes.tsv --gbk *.gbk --usegene --aa --extname > proteins.faa 38 | 39 | Notes 40 | 1. Locus tags are recommended when users want to extract sequences of exact features, since some features in a record 41 | may share the same gene name. (Check GenBank files before using this script). Nonetheless, users may want to use gene 42 | names rather than locus tags to include sequences of the same gene name. 43 | 2. Multiple sequences of any target that is shared by different loci will be extracted. For example, two sequences are 44 | printed for a gene when both sequences share the same gene name. 45 | 3. Note that some features, such as 'source', do not contain a qualifier "locus_tag". A KeyError will arise if one calls 46 | qualifiers["locus_tag"] for those features. 
Moreover, the 'gene' feature, although it shares the same locus_tag with 47 | its CDS, it does not contain a nucleotide sequence and hence should not be used as a legit type of target features. 48 | 4. Coordinate strand (+/-) always equals '+' when the GenBank file is created by Prokka or NCBI's Prokaryotic Genome 49 | Annotation Pipeline (PGAP). This script presumes that this is the case. 50 | 51 | Explanation of warning(s) 52 | An "IndexError: list index out of range" will arise if the tag list uses Unicode codes. 53 | 54 | Copyright (C) Yu Wan 2020 55 | Publication: 19 June 2015; latest update: 26 May 2020 56 | Licence: GNU General Public License v3.0 57 | Previous filename: get_gene_seq.py 58 | 59 | References 60 | Mark Schultz, https://github.com/schultzm/parseGenbank_extractGenes.py 61 | martineau, http://stackoverflow.com/questions/14734604/python-dictionary-of-lists-from-tab-delimited-file 62 | """ 63 | 64 | from Bio import SeqIO 65 | import os 66 | import sys 67 | import csv # User"s Python script name should not be the module name, otherwise, the former will be loaded and cause an error of no attribute loaded. 
68 | from argparse import ArgumentParser 69 | 70 | 71 | def parse_args(): 72 | parser = ArgumentParser(description= "Extract nucleotide/protein sequences from GenBank files") 73 | parser.add_argument("--targets", "-t", dest = "targets", type = str, required = True, help = "A tab-delimited file listing (locus_tag/gene name, feature type) tuples, or ^tag:type") 74 | parser.add_argument("--gbk", "-g", dest = "gbk", nargs = "+", type = str, required = True, help = "One or multiple GenBank files") 75 | parser.add_argument("--usegene", "-u", dest = "usegene", action = "store_true", required = False, help = "A flag enabling the use of gene names rather than locus tags for feature match") 76 | parser.add_argument("--aa", "-a", dest = "aa", action = "store_true", required = False, help = "Set to print amino acid sequences instead of nucleotide sequences") 77 | parser.add_argument("--extname", "-x", dest = "extname", action = "store_true", required = False, help = "Set to attach genome names to sequence names, making an extended sequence name") 78 | 79 | return parser.parse_args() 80 | 81 | 82 | def main(): 83 | args = parse_args() 84 | 85 | # Read targets 86 | targets_def = args.targets 87 | if targets_def.startswith("^"): 88 | targets_def = targets_def[1 : ] 89 | tag, feature_type = targets_def.split(":") 90 | tags = {tag : feature_type} 91 | else: 92 | tags = read_table(args.targets) # read the table and store it as a dictionary 93 | 94 | # Validity check of tags 95 | if len(tags) == 0: # if the dictionary 'tags' is empty, then terminate the loop 96 | print("Warning: no target is read.") 97 | sys.exit(0) 98 | 99 | search_key = "gene" if args.usegene else "locus_tag" 100 | 101 | for gbk in args.gbk: 102 | print("Processing %s" % gbk, file = sys.stderr) 103 | process_gbk(gbk = gbk, tags = tags, search_key = search_key, usegene = args.usegene, get_protein = args.aa,\ 104 | att_name = args.extname, tag_num = len(tags)) 105 | 106 | return 107 | 108 | 109 | def process_gbk(gbk, 
tags, search_key, usegene, get_protein, att_name, tag_num): 110 | """ 111 | This function processes a single GenBank file. 112 | """ 113 | target_feature_types = set(tags.values()) # Creates a set of feature types (CDS, tRNA, tmRNA, rRNA, etc.) from the dictionary of targets 114 | if "gene" in target_feature_types: 115 | print("Error: 'gene' is not a legit feature type.") 116 | sys.exit(0) 117 | targets = list(tags.keys()) # Names of targets. For instance, a list of gene names or locus tags. 118 | g = os.path.splitext(os.path.basename(gbk))[0] # Remove path and filename extension from the path of the input GenBank file. 119 | loci_found = 0 # Number of target loci encountered in this GenBank file. This variable is useful when usegene = False. 120 | continue_search = True 121 | 122 | for contig in SeqIO.parse(gbk, "genbank"): 123 | """ 124 | Do not use list(SeqIO.parse(gbk, "genbank")) in order to save memory. 125 | Object 'contig' belongs to class SeqRecord and corresponds to a LOCUS feature in the GenBank file. 126 | A GenBank file may be comprised of multiple contigs. The following loop goes through every feature of the contig. 127 | """ 128 | if continue_search: # Go through features of the current contig. 129 | for f in contig.features: 130 | if f.type in target_feature_types: # Skipping unwanted feature types saves time. 131 | f_qualifier_keys = list(f.qualifiers.keys()) 132 | if search_key in f_qualifier_keys: # type(f.qualifiers): collections.OrderedDict 133 | tag_name = f.qualifiers[search_key][0] # Equals gene name when search_key is 'gene' or locus tag when search_key is 'locus_tag'. 134 | if tag_name in targets and f.type == tags[tag_name]: # If the true feature type matches the anticipated type, then it is a true discovery. 135 | strand = "+" if f.strand == 1 else "-" 136 | start = int(f.location.start) + 1 # An alias for f.location.nofuzzy_start. Conventional start position is 1 bp greater than the Python-style coordinate. 
137 | end = int(f.location.end) # An alias for f.location.nofuzzy_end 138 | 139 | # Get the sequence 140 | if get_protein and f.type == "CDS": 141 | if "translation" in f_qualifier_keys: 142 | seq = f.qualifiers["translation"][0] # Type: str 143 | else: # It happens when the CDS is a pseudo gene. 144 | print("Warning: CDS of feature %s in %s does not have a translated sequence." % (tag_name, gbk),\ 145 | file = sys.stderr) 146 | continue # Skip the current feature and move to the next one. 147 | else: 148 | seq = str(f.extract(contig.seq)) 149 | 150 | # Determine the output sequence ID 151 | if att_name: 152 | seq_id = "%s.%s" % (tag_name, g) 153 | else: 154 | seq_id = tag_name 155 | 156 | # Determine the gene name where available 157 | if "gene" in f_qualifier_keys: 158 | gene_name = f.qualifiers["gene"][0] 159 | else: 160 | gene_name = "NA" 161 | 162 | # Determine protein accession number when available 163 | if f.type == "CDS" and "protein_id" in f_qualifier_keys: 164 | protein_accession = f.qualifiers["protein_id"][0] 165 | else: 166 | protein_accession = "NA" 167 | 168 | # Get the locus tag 169 | if usegene and "locus_tag" in f_qualifier_keys: 170 | locus_tag = f.qualifiers['locus_tag'][0] 171 | else: 172 | locus_tag = tag_name 173 | 174 | # Get the product name 175 | if "product" in f_qualifier_keys: 176 | product = f.qualifiers["product"][0] 177 | else: 178 | product = "NA" 179 | 180 | # Print the target sequence 181 | print(">%s %s|%s|%s|%s|%i-%i|+|%s|%s|%s" % (seq_id, gene_name, g, contig.id, strand, start, end, locus_tag,\ 182 | protein_accession, product)) # print the header 183 | print(seq) # extract nucleotide sequence of this feature 184 | 185 | """ 186 | In order to save time, the for loop is terminated when locus tags (which are unique in every GenBank file) 187 | are used as search keys and all target locus tags have been found. 188 | """ 189 | if not usegene: 190 | loci_found += 1 191 | if loci_found == tag_num: # Do not need to do further search. 
192 | continue_search = False # To break the outer for loop 193 | break # Terminate the current for loop 194 | else: 195 | break # This termination happens when usegene = False and all target locus tags have been found. 196 | 197 | return 198 | 199 | 200 | def read_table(f): 201 | """ 202 | This function reads a tab-delimited file and saves it as a dictionary through using the first column as keys. 203 | """ 204 | d = {} # create an empty dictionary 205 | with open(f, "r") as csv_file: # a wrapper for reading a file instead of using open() and f.close() 206 | csv_reader = csv.reader(csv_file, delimiter = "\t") 207 | try: 208 | for row in csv_reader: 209 | d[row[0]] = row[1] # use the first column as keys and the second column as values 210 | except: 211 | print("Error: cannot read tags values. Your tag file should use ASCII or utf-8 encoding system.") 212 | sys.exit(1) 213 | return d 214 | 215 | 216 | if __name__ == "__main__": 217 | main() 218 | -------------------------------------------------------------------------------- /other_licence/Apache Licence-2.0.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. 
For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /shell/download_reads_from_sra.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (C) 2020-2025 Yu Wan 3 | # Licensed under the GNU General Public Licence version 3 (GPLv3) . 4 | # Publication: 11 March 2020; last modification: 5 May 2025 5 | # Important update on 16/4/2025: added fastq-dump arguments "--skip-technical --clip --dumpbase --read-filter pass" 6 | # according to https://edwards.flinders.edu.au/fastq-dump/. (Thanks to Sophie Mannix for pointing this out) 7 | 8 | # Help information ######################### 9 | show_help() { 10 | echo " 11 | Download files of sequencing reads from the NCBI SRA database. 12 | Arguments: 13 | -d: (optional) Directory that contains the program fastq-dump (does not need to end by a slash character '/'). 14 | -o: Output directory (no forward slash). Default: ${HOME}/SRA_reads. 
15 | -a: A comma-delimited string of target accession numbers (SRR*) 16 | -f: A single-column text file of SRR numbers or a two-column CSV file of genome names (1st column) 17 | and SRR numbers (2nd column). 18 | -r: A logical flag turning on replacement of SRR numbers with genome names for read files. 19 | -s: A logical flag notifying this script that the reads to be downloaded are single-end. 20 | -u: Skip the dos2unix step if your input file is known to follow the Unix-style line ending. 21 | -p: Prefix of the log file (Markdown format) in the output directory (default: download_reads_from_sra_[date (YYYY-MM-DD)]_[HH-MM-SS]) 22 | Example command: 23 | ./download_reads_from_sra.sh -d=\"\$HOME/bin/sra_toolkit/bin\" -o=\"\$PWD\" -f=readsets.csv -r -p=download_reads_from_sra 24 | Note that: 25 | 1. The -a argument is ignored when the -f argument is set. 26 | 2. Newline characters in the input file must be '\n' rather than '\r\n'. 27 | 3. Fastq-dump sometimes fails in downloading or parsing read files. Remember to check the log and error files after each run. 28 | 4. Please ensure genome names are unique throughout your dataset. Otherwise, files of the same names may be overridden at the renaming step. 29 | 5. Dependency: SRA Toolkit v3.0.6 and later versions, dos2unix. 30 | 6. This script was called download_sra_reads.sh. 31 | " 32 | } 33 | 34 | if [[ $# -eq 0 ]]; then # When $1 does not exist (Error "$1: unbound variable" arises when use `if [[ $# -eq 0 ]] || [ "$1" = "-h" ]`). Do not use `-z "$1"`. 35 | show_help 36 | exit 0 37 | elif [ "$1" = "-h" ]; then 38 | show_help 39 | exit 0 40 | fi 41 | 42 | # Remove leading and trailing whitespace (spaces, tabs, etc.) 
while handling carriage returns (\r) and newlines (\n) ######################### 43 | function trim_whitespace() { 44 | echo -e "$1" | sed 's/^[[:space:]]*//;s/[[:space:]]*$//' 45 | } 46 | 47 | function time_stamp() { 48 | echo "[$(date +"%Y-%m-%d %H:%M:%S")]" 49 | } 50 | 51 | function write_log() { 52 | echo "$1" >> "$log" # The global variable log will be defined later. 53 | } 54 | 55 | # Main function ######################### 56 | out_dir="${HOME}/SRA_reads" # The default output directory 57 | replace_names=false # By default, do not replace accession numbers with genome names. 58 | paired_end=true # Assumes all read files are paired-end. 59 | read_file=false # Assumes that by default accessions are not provided in a file. 60 | unix_format=false # Assumes the input file has non-Unix-style line endings ('\n\r' etc). 61 | prefix=download_reads_from_sra 62 | suffix=$(date +"%Y-%m-%d_%H-%M-%S") # Name suffix of log and error files 63 | wait_time=2 # In seconds. Time to pause before the start of a new download iteration 64 | 65 | # Read arguments 66 | for i in "$@"; do 67 | case $i in 68 | -a=*) 69 | accessions="${i#*=}" 70 | accessions=( $(echo $accessions | tr ',' '\n') ) 71 | ;; 72 | -f=*) 73 | acc_list="${i#*=}" # A file listing accession numbers and probably genome names as well 74 | read_file=true 75 | ;; 76 | -r) 77 | replace_names=true 78 | ;; 79 | -s) 80 | paired_end=false # Single-end reads 81 | ;; 82 | -d=*) 83 | program_dir="${i#*=}" 84 | ;; 85 | -o=*) 86 | out_dir="${i#*=}" 87 | ;; 88 | -u) 89 | unix_format=true 90 | ;; 91 | -p=*) 92 | prefix="${i#*=}" 93 | ;; 94 | -w=*) 95 | wait_time="${i#*=}" 96 | ;; 97 | *) # Do nothing otherwise. 98 | ;; 99 | esac 100 | done 101 | 102 | log="${out_dir}/${prefix}_${suffix}.md" # A plain-text Markdown file 103 | 104 | # Check the output directory 105 | if [ ! 
-d "$out_dir" ]; then 106 | mkdir -p "$out_dir" 107 | write_log "# Global configurations" 108 | write_log "$(time_stamp) Created the output directory: ${out_dir}." 109 | fi 110 | 111 | # Load module 112 | if [ ! -z "$program_dir" ]; then 113 | export PATH="${program_dir}:$PATH" 114 | fi 115 | 116 | # Download and parse read files 117 | write_log "Waiting time between consecutive download iterations: $wait_time seconds." 118 | accession_count=0 # Assign a default value to accession_count in case neither -f nor -a is set. 119 | successes=0 120 | failures=0 121 | 122 | if [ "$read_file" = true ]; then 123 | if [ ! -f "$acc_list" ]; then 124 | write_log "$(time_stamp) Error: $acc_list was not found." 125 | exit 1 126 | fi 127 | if [ "$unix_format" = false ]; then 128 | dos2unix "$acc_list" >> "$log" 2>&1 129 | fi 130 | 131 | # Read lines of the input file into an array, skipping empty lines 132 | # https://stackoverflow.com/questions/15685736/how-to-extract-a-particular-element-from-an-array-in-bash 133 | mapfile -t lines_array < "$acc_list" 134 | accession_count="${#lines_array[@]}" # Number of accessions 135 | write_log "$(time_stamp) Imported $accession_count entries from file ${acc_list}." 136 | if [ "$replace_names" = true ]; then 137 | write_log "Run accessions in FASTQ filenames will be replaced by genome names." 138 | echo >> "$log" # Create an empty line in the log file 139 | write_log "# Task records" 140 | for line in "${lines_array[@]}"; do 141 | # write_log "$(time_stamp) Parse line '${line}'" # This command is used for debugging 142 | IFS=',' read -r -a line_fields <<< "${line}" 143 | genome="${line_fields[0]}" # Genome or isolate name 144 | accession="${line_fields[1]}" # Run accession 145 | genome="$(trim_whitespace "$genome")" # Sometimes people include whitespaces or tab characters in some genome names or accessions by accident. 
146 | accession="$(trim_whitespace "$accession")" 147 | if [[ -z "$genome" || -z "$accession" ]]; then 148 | write_log "$(time_stamp) Warning: Skipping malformed line: '$line'" 149 | continue 150 | fi 151 | fastq_original_prefix="${out_dir}/${accession}" 152 | fastq_renamed_prefix="${out_dir}/${genome}" 153 | # write_log "$(time_stamp) genome name ${genome} and its Run accession ${accession}" # For debugging 154 | if [ "$paired_end" = true ]; then # Paired-end mode 155 | write_log "## Download paired-end reads under Run accession $accession of genome ${genome}" 156 | write_log "$(time_stamp) Run fastq-dump to download the reads." 157 | if fastq-dump --readids --skip-technical --clip --dumpbase --read-filter pass --outdir "$out_dir" --split-3 "$accession" >> "$log" 2>&1; then # Download and split the read file, and create the output directory if necessary 158 | write_log "$(time_stamp) fastq-dump command finished for $accession of genome ${genome}." 159 | else 160 | write_log "$(time_stamp) fastq-dump failed for $accession." 
161 | ((failures++)) 162 | continue 163 | fi 164 | f1="${fastq_original_prefix}_pass_1.fastq" 165 | f2="${fastq_original_prefix}_pass_2.fastq" 166 | f1_renamed="${fastq_renamed_prefix}_1.fastq" 167 | f2_renamed="${fastq_renamed_prefix}_2.fastq" 168 | if [ -f "$f1" ] && [ -f "$f2" ]; then 169 | #write_log "$(time_stamp) Compress FASTQ files $f1 and $f2" 170 | #gzip "$f1" # Removed the gzip step because occasionally it fails to execute on my university's HPC 171 | #gzip "$f2" 172 | #if [ -f "${f1}.gz" ] && [ -f "${f2}.gz"]; then 173 | #mv "${f1}.gz" "$f1_renamed" 174 | #mv "${f2}.gz" "$f2_renamed" 175 | #write_log "$(time_stamp) Successfully compressed and renamed FASTQ files: ${f1}.gz -> ${f1_renamed}; ${f2}.gz -> ${f2_renamed}" 176 | mv "$f1" "$f1_renamed" 177 | mv "$f2" "$f2_renamed" 178 | if [ -f "$f1_renamed" ] && [ -f "$f2_renamed" ]; then 179 | write_log "$(time_stamp) Successfully renamed FASTQ files: $f1 -> ${f1_renamed}; $f2 -> ${f2_renamed}." 180 | ((successes++)) 181 | else 182 | write_log "$(time_stamp) Error: FASTQ files $f1 and/or $f2 could not be renamed." 183 | ((failures++)) 184 | fi 185 | else 186 | write_log "$(time_stamp) Error: $f1 and/or $f2 could not be created by fastq-dump. Skip the step of renaming FASTQ files." 187 | ((failures++)) 188 | fi 189 | else # Single-end mode (for instance, PacBio or Nanopore reads) 190 | write_log "## Download single-end read set $accession of genome ${genome}" 191 | write_log "$(time_stamp) Run fastq-dump to download the reads." 192 | if fastq-dump --readids --skip-technical --clip --dumpbase --read-filter pass --outdir "$out_dir" --split-3 "$accession" >> "$log" 2>&1; then 193 | write_log "$(time_stamp) fastq-dump command finished for $accession of genome ${genome}." 194 | else 195 | write_log "$(time_stamp) fastq-dump failed for $accession." 
196 | ((failures++)) 197 | continue 198 | fi 199 | f1="${fastq_original_prefix}_pass.fastq" 200 | f1_renamed="${fastq_renamed_prefix}.fastq" 201 | if [ -f "$f1" ]; then 202 | #gzip "$f1" 203 | mv "$f1" "$f1_renamed" 204 | if [ -f "$f1_renamed" ]; then 205 | write_log "$(time_stamp) Successfully renamed FASTQ file: $f1 -> ${f1_renamed}." 206 | ((successes++)) 207 | else 208 | write_log "$(time_stamp) Error: FASTQ file $f1 could not be renamed." 209 | ((failures++)) 210 | fi 211 | else 212 | write_log "$(time_stamp) Error: $f1 could not be created by fastq-dump." 213 | ((failures++)) 214 | fi 215 | fi 216 | echo >> "$log" # Add an empty line before the start of a new download iteration 217 | sleep "$wait_time" # Pause, to avoid too many connection requests to NCBI's server. 218 | done 219 | else # A single-column input file of Run accessions 220 | write_log "Use Run accessions as filenames of downloaded read sets." 221 | echo >> "$log" 222 | write_log "# Task records" 223 | for line in "${lines_array[@]}"; do 224 | accession="$(trim_whitespace "$line")" 225 | if [ -z "$accession" ]; then 226 | write_log "$(time_stamp) Warning: Skipping malformed line: '$line'" 227 | continue 228 | fi 229 | fastq_original_prefix="${out_dir}/${accession}" 230 | write_log "## Download reads under Run accession $accession" 231 | write_log "$(time_stamp) Run fastq-dump to download reads under ${accession}." 232 | if fastq-dump --readids --skip-technical --clip --dumpbase --read-filter pass --outdir "$out_dir" --split-3 "$accession" >> "$log" 2>&1; then 233 | write_log "$(time_stamp) fastq-dump command finished for ${accession}." 234 | else 235 | write_log "$(time_stamp) fastq-dump failed for $accession." 
236 | ((failures++)) 237 | continue 238 | fi 239 | if [ "$paired_end" = true ]; then # Paired-end reads 240 | f1="${fastq_original_prefix}_pass_1.fastq" 241 | f2="${fastq_original_prefix}_pass_2.fastq" 242 | if [ -f "$f1" ] && [ -f "$f2" ]; then 243 | #gzip "$f1" 244 | #gzip "$f2" 245 | write_log "$(time_stamp) Successfully created FASTQ files $f1 and ${f2}." 246 | ((successes++)) 247 | else 248 | write_log "$(time_stamp) Error: $f1 and/or $f2 could not be created by fastq-dump." 249 | ((failures++)) 250 | fi 251 | else # Single-end reads 252 | f1="${fastq_original_prefix}_pass.fastq" 253 | if [ -f "$f1" ]; then 254 | #gzip "$f1" 255 | write_log "$(time_stamp) Successfully created FASTQ file ${f1}." 256 | ((successes++)) 257 | else 258 | write_log "$(time_stamp) Error: $f1 could not be created by fastq-dump." 259 | ((failures++)) 260 | fi 261 | fi 262 | echo >> "$log" 263 | sleep "$wait_time" 264 | done 265 | fi 266 | else # When accession numbers come from the -a parameter 267 | accession_count="${#accessions[@]}" 268 | write_log "$(time_stamp) Imported $accession_count entries from the -a argument." 269 | echo >> "$log" 270 | write_log "# Task records" 271 | for entry in "${accessions[@]}"; do # Filename replacement is not supported under this mode. 272 | accession="$(trim_whitespace "$entry")" 273 | if [ -z "$accession" ]; then 274 | write_log "$(time_stamp) Warning: Skipping malformed entry: '$entry'" 275 | continue 276 | fi 277 | fastq_original_prefix="${out_dir}/${accession}" 278 | write_log "## Download reads under Run accession $accession" 279 | if fastq-dump --readids --skip-technical --clip --dumpbase --read-filter pass --outdir "$out_dir" --split-3 "$accession" >> "$log" 2>&1; then 280 | write_log "$(time_stamp) fastq-dump command finished for ${accession}." 281 | else 282 | write_log "$(time_stamp) fastq-dump failed for $accession." 
283 | ((failures++)) 284 | continue 285 | fi 286 | if [ "${paired_end}" = true ]; then 287 | f1="${fastq_original_prefix}_1.fastq" 288 | f2="${fastq_original_prefix}_2.fastq" 289 | if [ -f "$f1" ] && [ -f "$f2" ]; then 290 | #gzip "$f1" 291 | #gzip "$f2" 292 | write_log "$(time_stamp) Successfully created FASTQ files $f1 and ${f2}." 293 | ((successes++)) 294 | else 295 | echo "$(time_stamp) Error: $f1 and/or $f2 could not be downloaded." >&2 296 | ((failures++)) 297 | fi 298 | else 299 | f1="${fastq_original_prefix}.fastq" 300 | if [ -f "$f1" ]; then 301 | #gzip "$f1" 302 | write_log "$(time_stamp) Successfully created FASTQ file ${f1}." 303 | ((successes++)) 304 | else 305 | write_log "$(time_stamp) Error: $f1 could not be created by fastq-dump." 306 | ((failures++)) 307 | fi 308 | fi 309 | echo >> "$log" 310 | sleep "$wait_time" 311 | done 312 | fi 313 | 314 | write_log '# Conclusion' 315 | write_log "$(time_stamp) Finished all $accession_count tasks. Downloaded $successes readsets and failed to download $failures readsets." 316 | exit 0 317 | -------------------------------------------------------------------------------- /downloadSeqFromNCBI.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | This script takes a list of NCBI accession numbers (one for each line) from the STDIN and downloads corresponding entries (either GenBank files or FASTA files) under the target directory. 
5 | 6 | Usage: python downloadSeqFromNCBI.py --records "file:objects.txt" --with_prefix --format fasta --email xxx@xxx.com --ext fna --outdir ./ref --skip > download.log 7 | python downloadSeqFromNCBI.py --records "NC_0001,NC_0002" --format genbank --email xxx@xxx.com --ext gbk --outdir ./ref --skip > download.log 8 | python downloadSeqFromNCBI.py --records "NC_0001,NC_0002" --format genbank --email xxx@xxx.com --prefix K12 --ext gbk --outdir ./ref --skip > download.log 9 | python downloadSeqFromNCBI.py --records "file:objects.tsv" --with_prefix --format fasta --email xxx@xxx.com --ext fna --outdir ./ref --skip > download.log 10 | Type python downloadSeqFromNCBI.py -h or --help for help information. 11 | 12 | Important options and arguments: 13 | --records or -r: Can be either a file (must contain a suffix of ".txt") listing targets to be downloaded, or a string of accession IDs separated by commas (no space is allowed). 14 | --with_prefix: A logical option specifying that the record file is a tab-delimited file of two columns (without a header line) for accession numbers and prefixes. 15 | --no_accession: Set this flag to not attach an NCBI accession number after the genome name in each file name. Only applicable when --prefix != None. This option may cause overwriting output files when multiple NCBI accessions share the same prefix. 16 | --format or -f: The format of files to be downloaded. This option is not used when --db = assembly. 17 | --db: Customised specification of an NCBI database to retrieve records from 18 | --ext or -x: The file extension, can be "fasta" (default), "fna", "fna.gz", "gb", "gb.gz" or "gbk". No dot preceding the extension is needed. 19 | --outdir or -o: Output directory, no backslash at the end. 20 | 21 | An example of the input list: seq_list.txt. Note that accession IDs may not include version numbers, such as ".1". 
22 | HG326223.1\n 23 | CP011642\n 24 | 25 | The input file may be composed of two columns: accession number, prefix (genome name), sepearated by a tab character. For instance, 26 | HG326223.1\tDb11\n 27 | CP011642\tCAV1492\n 28 | 29 | References: 30 | 1. This script is inspired by Mark Schultz's (dr.mark.schultz@gmail.com, GitHub: schultzm) script "downloadGenbankByAccessions.py" stored under the master branch of github.com/katholt/holtlab. 31 | 2. Forum post: www.biostars.org/p/63506/ 32 | 3. Damien Farrell's blog post: Retrieving genome assemblies via Entrez with Python (dmnfarrell.github.io/bioinformatics/assemblies-genbank-python) 33 | 34 | Copyright (C) 2015-2020 Yu Wan 35 | First publication: 27 June 2015 - 14 July 2015; the latest modification: 28 June 2020 36 | Python version 2 and 3 compatible 37 | Licensed under the GNU General Public Licence version 3 (GPLv3) . 38 | Previous names: download_gbk.py, download_NCBI_records.py 39 | """ 40 | 41 | from __future__ import print_function 42 | import os 43 | import sys 44 | import time 45 | import xml.etree.ElementTree as xmlTree 46 | from Bio import Entrez 47 | from ftplib import FTP 48 | from collections import namedtuple 49 | from argparse import ArgumentParser 50 | 51 | 52 | def parse_arguments(): 53 | parser = ArgumentParser(description = "Read options and arguments") 54 | parser.add_argument("--records", "-r", dest = "records", type = str, required = True, \ 55 | help = "Items you want to fetch from the NCBI database") 56 | parser.add_argument("--with_prefix", "-w", dest = "with_prefix", action = "store_true", required = False, \ 57 | help = "Set when the accession file contains two columns for accessions and prefixes, respectively") 58 | parser.add_argument("--db", "-d", dest = "db", type = str, default = "nucleotide", required = False, \ 59 | help = "NCBI database to be retrieved from. 
Options: nucleotide (default), assembly") 60 | parser.add_argument("--format", "-f", dest = "format", type = str, default = "fasta", required = False, \ 61 | help = "Format: fasta(default)/genbank") 62 | parser.add_argument("--refseq", "-q", dest = "refseq", action = "store_true", required = False, \ 63 | help = "Set it to specify the RefSeq database for downloading assemblies") 64 | parser.add_argument("--email", "-e", dest = "email", type = str, required = True, \ 65 | help = "User email address") 66 | parser.add_argument("--prefix", "-p", dest="prefix", type = str, default = None, required = False, \ 67 | help = "Common prefix adding to all files") 68 | parser.add_argument("--no_accession", "-n", dest = "no_accession", action = "store_true", required = False, \ 69 | help = "Set this flag to not attach an NCBI accession number after the genome name in each file name. Only applicable when --prefix != None.") 70 | parser.add_argument("--ext", "-x", dest = "ext", type = str, default = "fasta", required = False, \ 71 | help = "File extension: fasta (default), fna, gb, gbk, fna.gz, gbff.gz (For assemblies, usually with .gz)") 72 | parser.add_argument("--outdir", "-o", dest = "outdir", type = str, default = ".", required = False, \ 73 | help = "Destination directory, no backslash at the end") 74 | parser.add_argument("--skip", "-sk", dest = "skip", action = "store_true", required = False, \ 75 | help = "Set to skip downloaded files") 76 | parser.add_argument("--ftp", "-t", dest = "ftp", type = str, default = "ftp.ncbi.nlm.nih.gov", required = False, \ 77 | help = "Address of the NCBI FTP site from which assemblies are downloaded. 
Default: ftp.ncbi.nlm.nih.gov.") 78 | return parser.parse_args() 79 | 80 | 81 | def main(): 82 | args = parse_arguments() 83 | Entrez.email = args.email 84 | 85 | # Read input and set up output file names 86 | check_output_dir(args.outdir) 87 | accessions = extract_accessions(targets = args.records, with_prefix = args.with_prefix) 88 | new_files = create_output_filenames(accessions = accessions, with_prefix = args.with_prefix, \ 89 | no_accession = args.no_accession, outdir = args.outdir, \ 90 | out_prefix = args.prefix, extension = "." + args.ext) 91 | 92 | # Iteratively download files 93 | if args.db == "nucleotide": 94 | """ 95 | Download nucleotide records (FASTA or GenBank files) through Entrez.efetch utility. 96 | """ 97 | if args.format == "fasta": 98 | download_records(new_files = new_files, skip_existing = args.skip, record_type = "fasta", \ 99 | outdir = args.outdir) 100 | else: 101 | """ 102 | Download nucleotide records as GenBank files. Do not use "gb" for rettype, as it only includes 103 | contig locations if the entry is built from contigs. 104 | """ 105 | download_records(new_files = new_files, skip_existing = args.skip, record_type = "gbwithparts", \ 106 | outdir = args.outdir) 107 | elif args.db == "assembly": 108 | """ 109 | Download assemblies (FASTA or GenBank files) from NCBI's FTP server. 110 | """ 111 | if args.format == "fasta": 112 | download_assemblies(new_files = new_files, skip_existing = args.skip, outdir = args.outdir, \ 113 | use_refseq = args.refseq, site = args.ftp, fasta = True) 114 | else: 115 | download_assemblies(new_files = new_files, skip_existing = args.skip, outdir = args.outdir, \ 116 | use_refseq = args.refseq, site = args.ftp, fasta = False) 117 | else: 118 | print("Error: only databases 'nucleotide' and 'assembly' are supported by far. No download task will be launched.") 119 | 120 | return 121 | 122 | 123 | def extract_accessions(targets, with_prefix): 124 | """ 125 | Parsing the argument for '--records'. 
def create_output_filenames(accessions, with_prefix, no_accession, outdir, out_prefix, extension):
    """
    Compose output file paths for every accession to be downloaded.

    Parameters:
        accessions: dict {accession: prefix} when with_prefix is True, otherwise a list of accessions.
        with_prefix: True when each accession carries its own genome-name prefix.
        no_accession: when True, omit the accession number from the output file name.
        outdir: output directory.
        out_prefix: a single prefix shared by all outputs (used only when with_prefix is False).
        extension: file-name extension (including any leading dot the caller wants).

    Returns: dict {accession: output file path}.
    """
    new_files = {}
    if with_prefix:  # "accessions" is a dictionary of {accession: prefix}
        for entry, prefix in accessions.items():
            if no_accession:  # do not attach the accession number after the genome name
                new_files[entry] = os.path.join(outdir, prefix + extension)
            else:
                new_files[entry] = os.path.join(outdir, prefix + "__" + entry + extension)
    else:  # "accessions" is a list
        for entry in accessions:
            if out_prefix is not None:  # Fixed: compare to None with "is not", not "!="
                if no_accession:
                    new_files[entry] = os.path.join(outdir, out_prefix + extension)
                else:
                    new_files[entry] = os.path.join(outdir, out_prefix + "__" + entry + extension)
            else:
                new_files[entry] = os.path.join(outdir, entry + extension)

    return new_files


def download_records(new_files, skip_existing, record_type, outdir):
    """
    Download records from the NCBI Nucleotide database (via Entrez.efetch) and save
    each one as a FASTA or GenBank file.

    Parameters:
        new_files: dict {accession: output file path} from create_output_filenames.
        skip_existing: when True, do not re-download files that already exist.
        record_type: Entrez "rettype" value (e.g. "fasta" or "gb").
        outdir: output directory (used only for the final status message).
    """
    print("Start to download records from the NCBI Nucleotide database.")
    n = 0  # counter of successfully downloaded files
    for entry, new_file in new_files.items():
        if os.path.exists(new_file) and skip_existing:
            print(new_file + " already exists, skipped.")
            continue  # go to the next entry
        try:
            handle = Entrez.efetch(db="nucleotide", id=entry, rettype=record_type, retmode="text")
            try:
                with open(new_file, "w") as output_file:
                    print("Downloading " + entry + " to " + new_file)
                    output_file.write(handle.read())
                n += 1
            finally:
                handle.close()  # Fixed: close the handle even when the write fails
        except Exception:  # Fixed: bare "except" also swallowed SystemExit/KeyboardInterrupt
            print("The record " + entry + " is not found.")
            continue
        time.sleep(1)  # pause to avoid submitting too many concurrent requests to NCBI

    if outdir == ".":
        outdir = "the current working directory"  # Fixed typo: "workding"
    print("Done. Altogether %d files were downloaded and stored in %s successfully." % (n, outdir))

    return


def download_assemblies(new_files, skip_existing, outdir, use_refseq, site, fasta):
    """
    Download assembly sequence files from the NCBI Assembly database over anonymous FTP.

    Parameters:
        new_files: dict {accession: output file path}.
        skip_existing: when True, do not re-download files that already exist.
        outdir: output directory (used only for the final status message).
        use_refseq: take the RefSeq FTP path instead of the GenBank one.
        site: FTP host name, e.g. "ftp.ncbi.nlm.nih.gov".
        fasta: True for FASTA output, False for GenBank output.
    """
    print("Start to download records from the NCBI Assembly database.")
    urls = get_urls(new_files, use_refseq, skip_existing, fasta)  # list of Assembly named tuples

    print("Connecting to site " + site + ".")
    try:
        ftp = FTP(site, timeout=30)  # an explicit timeout keeps a dead server from hanging the script
        ftp.login()  # '230 Anonymous access granted, restrictions apply'
        print("Successfully logged in.")
    except Exception:  # Fixed: bare "except" also swallowed SystemExit/KeyboardInterrupt
        sys.exit("Error: could not log in to the site.")

    n = 0
    prefix_len = len("ftp://" + site)  # e.g. len("ftp://ftp.ncbi.nlm.nih.gov") = 26
    output_format = "FASTA" if fasta else "GenBank"

    for assembly in urls:
        try:
            # retrbinary does not accept a full FTP URL such as
            # ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/.../file.fna.gz; it only takes the
            # server-relative path, so strip the "ftp://<site>" prefix before issuing RETR.
            with open(assembly.local, "wb") as f:  # create a binary file
                ftp.retrbinary("RETR " + assembly.url[prefix_len:], f.write)
            print("Saved %s file %s as %s." % (output_format, assembly.url, assembly.local))
            n += 1
            time.sleep(1)
        except Exception:
            print("Warning: remote file " + assembly.url + " is not accessible. Skip.")
            # Fixed: remove the partial file with os.remove instead of os.system("rm ..."),
            # which is non-portable and unsafe for paths containing spaces or shell metacharacters.
            if os.path.exists(assembly.local):
                os.remove(assembly.local)
    ftp.quit()
    print("Done. Altogether %d files were downloaded and stored in %s successfully." % (n, outdir))

    return


def get_urls(new_files, use_refseq, skip_existing, fasta):
    """
    Subordinate function of download_assemblies: resolve each accession to the FTP URL of
    its assembly file on the NCBI server. Returns a list of Assembly named tuples
    (accession, url, local); sequence files are downloaded after all addresses are gathered.
    """
    Assembly = namedtuple("Assembly", ["accession", "url", "local"])
    urls = []
    filename_suffix = "_genomic.fna.gz" if fasta else "_genomic.gbff.gz"  # FASTA or GenBank file

    for entry, new_file in new_files.items():
        if os.path.exists(new_file) and skip_existing:
            print(new_file + " already exists, skipped.")
            continue  # go to the next entry

        try:
            handle = Entrez.esearch(db="assembly", term=entry, retmax="1")  # one record per accession
            record = Entrez.read(handle)
            uid = record["IdList"][0]  # convert this single-element list into a string
        except Exception:  # Fixed: bare "except" also swallowed SystemExit/KeyboardInterrupt
            print("The record " + entry + " was not found in the database.")
            continue

        summary = get_assembly_summary(uid)
        if summary is not None:  # Fixed: compare to None with "is not", not "!="
            if use_refseq:
                url = summary["FtpPath_RefSeq"]  # field in the summary XML file
            else:
                url = summary["FtpPath_GenBank"]  # field in the summary XML file

            if url == "":
                print("Warning: URL of assembly %s (sequence ID: %s) was not found in the record summary." % (entry, uid))
                continue
            # Fixed: build the remote URL with explicit "/" separators. os.path.join would
            # insert "\" on Windows and corrupt the FTP address.
            base = url.rstrip("/")
            remote = base + "/" + base.rsplit("/", 1)[-1] + filename_suffix
            urls.append(Assembly(accession=entry, url=remote, local=new_file))

        time.sleep(1)

    return urls


def get_assembly_summary(uid):
    """
    Retrieve the document summary of an assembly given its database UID.
    Subordinate function of get_urls; returns either a dictionary or None.

    The call is equivalent to visiting
    https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=assembly&id=<uid>
    in a web browser, which returns a DocSum document for the UID.
    (Reference: https://www.ncbi.nlm.nih.gov/books/NBK25499/)
    """
    summary_handle = Entrez.esummary(db="assembly", id=uid, report="full", retmax=1, retmode="xml")
    try:
        summary = Entrez.read(summary_handle)  # may fail; see fallback below
        summary = summary["DocumentSummarySet"]["DocumentSummary"][0]  # returns a dictionary
    except (ValueError, TypeError):
        # Note (28 June 2020): Entrez.read can raise an internal ValidationError here:
        # "Failed to find tag 'AssemblyStatusSort' in the DTD. To skip all tags that are not
        # represented in the DTD, please call Bio.Entrez.read or Bio.Entrez.parse with
        # validate=False." The fallback parses the raw summary XML from the NCBI server
        # directly to extract the FTP addresses (requires the standard xml module).
        print("Document summary could not be parsed for assembly %s. Trying an alternative approach to extract the FTP address..." % uid)
        try:
            # Regenerate the handle: it appears to be consumed/damaged by Entrez.read above.
            summary_handle = Entrez.esummary(db="assembly", id=uid, report="full", retmax=1, retmode="xml")
            xml = xmlTree.parse(summary_handle).getroot()
            summary = {}

            # Per the original layout notes, xml[0] is the DocumentSummarySet and xml[0][1]
            # holds the DocumentSummary fields — assumed stable; TODO confirm against a live
            # eSummary response if NCBI changes the schema.
            for field in xml[0][1]:
                if field.tag in ["FtpPath_GenBank", "FtpPath_RefSeq"]:
                    summary[field.tag] = field.text

            if len(summary) > 0:
                print("    FTP address(es) has/have been successfully extracted for assembly %s." % uid)
            else:
                print("    FTP address still could not be obtained for assembly %s." % uid)
                summary = None
        except (ValueError, TypeError):  # the worst scenario: no FTP path is found
            print("    FTP address still could not be obtained for assembly %s." % uid)
            summary = None

    return summary


def check_output_dir(outdir):
    """
    Prepare the output directory, creating it when it does not exist yet.
    """
    if outdir != ".":
        if not os.path.exists(outdir):
            # Fixed: os.makedirs replaces os.system("mkdir " + outdir) — portable,
            # safe for paths with spaces, and able to create nested directories.
            os.makedirs(outdir)
        else:
            print("Output directory " + outdir + " exists.")
    else:
        print("Skipped checking the output directory as it is the current working directory.")

    return


if __name__ == "__main__":
    main()
By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. 
If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. 
You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 
113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. 
You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 
165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. 
If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. 
If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 
292 | 293 | 294 | Copyright (C) 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 
331 | 332 | , 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 340 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU General Public License is a free, copyleft license for 11 | software and other kinds of works. 12 | 13 | The licenses for most software and other practical works are designed 14 | to take away your freedom to share and change the works. By contrast, 15 | the GNU General Public License is intended to guarantee your freedom to 16 | share and change all versions of a program--to make sure it remains free 17 | software for all its users. We, the Free Software Foundation, use the 18 | GNU General Public License for most of our software; it applies also to 19 | any other work released this way by its authors. You can apply it to 20 | your programs, too. 21 | 22 | When we speak of free software, we are referring to freedom, not 23 | price. Our General Public Licenses are designed to make sure that you 24 | have the freedom to distribute copies of free software (and charge for 25 | them if you wish), that you receive source code or can get it if you 26 | want it, that you can change the software or use pieces of it in new 27 | free programs, and that you know you can do these things. 
28 | 29 | To protect your rights, we need to prevent others from denying you 30 | these rights or asking you to surrender the rights. Therefore, you have 31 | certain responsibilities if you distribute copies of the software, or if 32 | you modify it: responsibilities to respect the freedom of others. 33 | 34 | For example, if you distribute copies of such a program, whether 35 | gratis or for a fee, you must pass on to the recipients the same 36 | freedoms that you received. You must make sure that they, too, receive 37 | or can get the source code. And you must show them these terms so they 38 | know their rights. 39 | 40 | Developers that use the GNU GPL protect your rights with two steps: 41 | (1) assert copyright on the software, and (2) offer you this License 42 | giving you legal permission to copy, distribute and/or modify it. 43 | 44 | For the developers' and authors' protection, the GPL clearly explains 45 | that there is no warranty for this free software. For both users' and 46 | authors' sake, the GPL requires that modified versions be marked as 47 | changed, so that their problems will not be attributed erroneously to 48 | authors of previous versions. 49 | 50 | Some devices are designed to deny users access to install or run 51 | modified versions of the software inside them, although the manufacturer 52 | can do so. This is fundamentally incompatible with the aim of 53 | protecting users' freedom to change the software. The systematic 54 | pattern of such abuse occurs in the area of products for individuals to 55 | use, which is precisely where it is most unacceptable. Therefore, we 56 | have designed this version of the GPL to prohibit the practice for those 57 | products. If such problems arise substantially in other domains, we 58 | stand ready to extend this provision to those domains in future versions 59 | of the GPL, as needed to protect the freedom of users. 60 | 61 | Finally, every program is threatened constantly by software patents. 
62 | States should not allow patents to restrict development and use of 63 | software on general-purpose computers, but in those that do, we wish to 64 | avoid the special danger that patents applied to a free program could 65 | make it effectively proprietary. To prevent this, the GPL assures that 66 | patents cannot be used to render the program non-free. 67 | 68 | The precise terms and conditions for copying, distribution and 69 | modification follow. 70 | 71 | TERMS AND CONDITIONS 72 | 73 | 0. Definitions. 74 | 75 | "This License" refers to version 3 of the GNU General Public License. 76 | 77 | "Copyright" also means copyright-like laws that apply to other kinds of 78 | works, such as semiconductor masks. 79 | 80 | "The Program" refers to any copyrightable work licensed under this 81 | License. Each licensee is addressed as "you". "Licensees" and 82 | "recipients" may be individuals or organizations. 83 | 84 | To "modify" a work means to copy from or adapt all or part of the work 85 | in a fashion requiring copyright permission, other than the making of an 86 | exact copy. The resulting work is called a "modified version" of the 87 | earlier work or a work "based on" the earlier work. 88 | 89 | A "covered work" means either the unmodified Program or a work based 90 | on the Program. 91 | 92 | To "propagate" a work means to do anything with it that, without 93 | permission, would make you directly or secondarily liable for 94 | infringement under applicable copyright law, except executing it on a 95 | computer or modifying a private copy. Propagation includes copying, 96 | distribution (with or without modification), making available to the 97 | public, and in some countries other activities as well. 98 | 99 | To "convey" a work means any kind of propagation that enables other 100 | parties to make or receive copies. Mere interaction with a user through 101 | a computer network, with no transfer of a copy, is not conveying. 
102 | 103 | An interactive user interface displays "Appropriate Legal Notices" 104 | to the extent that it includes a convenient and prominently visible 105 | feature that (1) displays an appropriate copyright notice, and (2) 106 | tells the user that there is no warranty for the work (except to the 107 | extent that warranties are provided), that licensees may convey the 108 | work under this License, and how to view a copy of this License. If 109 | the interface presents a list of user commands or options, such as a 110 | menu, a prominent item in the list meets this criterion. 111 | 112 | 1. Source Code. 113 | 114 | The "source code" for a work means the preferred form of the work 115 | for making modifications to it. "Object code" means any non-source 116 | form of a work. 117 | 118 | A "Standard Interface" means an interface that either is an official 119 | standard defined by a recognized standards body, or, in the case of 120 | interfaces specified for a particular programming language, one that 121 | is widely used among developers working in that language. 122 | 123 | The "System Libraries" of an executable work include anything, other 124 | than the work as a whole, that (a) is included in the normal form of 125 | packaging a Major Component, but which is not part of that Major 126 | Component, and (b) serves only to enable use of the work with that 127 | Major Component, or to implement a Standard Interface for which an 128 | implementation is available to the public in source code form. A 129 | "Major Component", in this context, means a major essential component 130 | (kernel, window system, and so on) of the specific operating system 131 | (if any) on which the executable work runs, or a compiler used to 132 | produce the work, or an object code interpreter used to run it. 
133 | 134 | The "Corresponding Source" for a work in object code form means all 135 | the source code needed to generate, install, and (for an executable 136 | work) run the object code and to modify the work, including scripts to 137 | control those activities. However, it does not include the work's 138 | System Libraries, or general-purpose tools or generally available free 139 | programs which are used unmodified in performing those activities but 140 | which are not part of the work. For example, Corresponding Source 141 | includes interface definition files associated with source files for 142 | the work, and the source code for shared libraries and dynamically 143 | linked subprograms that the work is specifically designed to require, 144 | such as by intimate data communication or control flow between those 145 | subprograms and other parts of the work. 146 | 147 | The Corresponding Source need not include anything that users 148 | can regenerate automatically from other parts of the Corresponding 149 | Source. 150 | 151 | The Corresponding Source for a work in source code form is that 152 | same work. 153 | 154 | 2. Basic Permissions. 155 | 156 | All rights granted under this License are granted for the term of 157 | copyright on the Program, and are irrevocable provided the stated 158 | conditions are met. This License explicitly affirms your unlimited 159 | permission to run the unmodified Program. The output from running a 160 | covered work is covered by this License only if the output, given its 161 | content, constitutes a covered work. This License acknowledges your 162 | rights of fair use or other equivalent, as provided by copyright law. 163 | 164 | You may make, run and propagate covered works that you do not 165 | convey, without conditions so long as your license otherwise remains 166 | in force. 
You may convey covered works to others for the sole purpose 167 | of having them make modifications exclusively for you, or provide you 168 | with facilities for running those works, provided that you comply with 169 | the terms of this License in conveying all material for which you do 170 | not control copyright. Those thus making or running the covered works 171 | for you must do so exclusively on your behalf, under your direction 172 | and control, on terms that prohibit them from making any copies of 173 | your copyrighted material outside their relationship with you. 174 | 175 | Conveying under any other circumstances is permitted solely under 176 | the conditions stated below. Sublicensing is not allowed; section 10 177 | makes it unnecessary. 178 | 179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 180 | 181 | No covered work shall be deemed part of an effective technological 182 | measure under any applicable law fulfilling obligations under article 183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 184 | similar laws prohibiting or restricting circumvention of such 185 | measures. 186 | 187 | When you convey a covered work, you waive any legal power to forbid 188 | circumvention of technological measures to the extent such circumvention 189 | is effected by exercising rights under this License with respect to 190 | the covered work, and you disclaim any intention to limit operation or 191 | modification of the work as a means of enforcing, against the work's 192 | users, your or third parties' legal rights to forbid circumvention of 193 | technological measures. 194 | 195 | 4. Conveying Verbatim Copies. 
196 | 197 | You may convey verbatim copies of the Program's source code as you 198 | receive it, in any medium, provided that you conspicuously and 199 | appropriately publish on each copy an appropriate copyright notice; 200 | keep intact all notices stating that this License and any 201 | non-permissive terms added in accord with section 7 apply to the code; 202 | keep intact all notices of the absence of any warranty; and give all 203 | recipients a copy of this License along with the Program. 204 | 205 | You may charge any price or no price for each copy that you convey, 206 | and you may offer support or warranty protection for a fee. 207 | 208 | 5. Conveying Modified Source Versions. 209 | 210 | You may convey a work based on the Program, or the modifications to 211 | produce it from the Program, in the form of source code under the 212 | terms of section 4, provided that you also meet all of these conditions: 213 | 214 | a) The work must carry prominent notices stating that you modified 215 | it, and giving a relevant date. 216 | 217 | b) The work must carry prominent notices stating that it is 218 | released under this License and any conditions added under section 219 | 7. This requirement modifies the requirement in section 4 to 220 | "keep intact all notices". 221 | 222 | c) You must license the entire work, as a whole, under this 223 | License to anyone who comes into possession of a copy. This 224 | License will therefore apply, along with any applicable section 7 225 | additional terms, to the whole of the work, and all its parts, 226 | regardless of how they are packaged. This License gives no 227 | permission to license the work in any other way, but it does not 228 | invalidate such permission if you have separately received it. 
229 | 230 | d) If the work has interactive user interfaces, each must display 231 | Appropriate Legal Notices; however, if the Program has interactive 232 | interfaces that do not display Appropriate Legal Notices, your 233 | work need not make them do so. 234 | 235 | A compilation of a covered work with other separate and independent 236 | works, which are not by their nature extensions of the covered work, 237 | and which are not combined with it such as to form a larger program, 238 | in or on a volume of a storage or distribution medium, is called an 239 | "aggregate" if the compilation and its resulting copyright are not 240 | used to limit the access or legal rights of the compilation's users 241 | beyond what the individual works permit. Inclusion of a covered work 242 | in an aggregate does not cause this License to apply to the other 243 | parts of the aggregate. 244 | 245 | 6. Conveying Non-Source Forms. 246 | 247 | You may convey a covered work in object code form under the terms 248 | of sections 4 and 5, provided that you also convey the 249 | machine-readable Corresponding Source under the terms of this License, 250 | in one of these ways: 251 | 252 | a) Convey the object code in, or embodied in, a physical product 253 | (including a physical distribution medium), accompanied by the 254 | Corresponding Source fixed on a durable physical medium 255 | customarily used for software interchange. 
256 | 257 | b) Convey the object code in, or embodied in, a physical product 258 | (including a physical distribution medium), accompanied by a 259 | written offer, valid for at least three years and valid for as 260 | long as you offer spare parts or customer support for that product 261 | model, to give anyone who possesses the object code either (1) a 262 | copy of the Corresponding Source for all the software in the 263 | product that is covered by this License, on a durable physical 264 | medium customarily used for software interchange, for a price no 265 | more than your reasonable cost of physically performing this 266 | conveying of source, or (2) access to copy the 267 | Corresponding Source from a network server at no charge. 268 | 269 | c) Convey individual copies of the object code with a copy of the 270 | written offer to provide the Corresponding Source. This 271 | alternative is allowed only occasionally and noncommercially, and 272 | only if you received the object code with such an offer, in accord 273 | with subsection 6b. 274 | 275 | d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. You need not require recipients to copy the 279 | Corresponding Source along with the object code. If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 
287 | 288 | e) Convey the object code using peer-to-peer transmission, provided 289 | you inform other peers where the object code and Corresponding 290 | Source of the work are being offered to the general public at no 291 | charge under subsection 6d. 292 | 293 | A separable portion of the object code, whose source code is excluded 294 | from the Corresponding Source as a System Library, need not be 295 | included in conveying the object code work. 296 | 297 | A "User Product" is either (1) a "consumer product", which means any 298 | tangible personal property which is normally used for personal, family, 299 | or household purposes, or (2) anything designed or sold for incorporation 300 | into a dwelling. In determining whether a product is a consumer product, 301 | doubtful cases shall be resolved in favor of coverage. For a particular 302 | product received by a particular user, "normally used" refers to a 303 | typical or common use of that class of product, regardless of the status 304 | of the particular user or of the way in which the particular user 305 | actually uses, or expects or is expected to use, the product. A product 306 | is a consumer product regardless of whether the product has substantial 307 | commercial, industrial or non-consumer uses, unless such uses represent 308 | the only significant mode of use of the product. 309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to install 312 | and execute modified versions of a covered work in that User Product from 313 | a modified version of its Corresponding Source. The information must 314 | suffice to ensure that the continued functioning of the modified object 315 | code is in no case prevented or interfered with solely because 316 | modification has been made. 
317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or updates 331 | for a work that has been modified or installed by the recipient, or for 332 | the User Product in which it has been modified or installed. Access to a 333 | network may be denied when the modification itself materially and 334 | adversely affects the operation of the network or violates the rules and 335 | protocols for communication across the network. 336 | 337 | Corresponding Source conveyed, and Installation Information provided, 338 | in accord with this section must be in a format that is publicly 339 | documented (and with an implementation available to the public in 340 | source code form), and must require no special password or key for 341 | unpacking, reading or copying. 342 | 343 | 7. Additional Terms. 344 | 345 | "Additional permissions" are terms that supplement the terms of this 346 | License by making exceptions from one or more of its conditions. 347 | Additional permissions that are applicable to the entire Program shall 348 | be treated as though they were included in this License, to the extent 349 | that they are valid under applicable law. 
If additional permissions 350 | apply only to part of the Program, that part may be used separately 351 | under those permissions, but the entire Program remains governed by 352 | this License without regard to the additional permissions. 353 | 354 | When you convey a copy of a covered work, you may at your option 355 | remove any additional permissions from that copy, or from any part of 356 | it. (Additional permissions may be written to require their own 357 | removal in certain cases when you modify the work.) You may place 358 | additional permissions on material, added by you to a covered work, 359 | for which you have or can give appropriate copyright permission. 360 | 361 | Notwithstanding any other provision of this License, for material you 362 | add to a covered work, you may (if authorized by the copyright holders of 363 | that material) supplement the terms of this License with terms: 364 | 365 | a) Disclaiming warranty or limiting liability differently from the 366 | terms of sections 15 and 16 of this License; or 367 | 368 | b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | 372 | c) Prohibiting misrepresentation of the origin of that material, or 373 | requiring that modified versions of such material be marked in 374 | reasonable ways as different from the original version; or 375 | 376 | d) Limiting the use for publicity purposes of names of licensors or 377 | authors of the material; or 378 | 379 | e) Declining to grant rights under trademark law for use of some 380 | trade names, trademarks, or service marks; or 381 | 382 | f) Requiring indemnification of licensors and authors of that 383 | material by anyone who conveys the material (or modified versions of 384 | it) with contractual assumptions of liability to the recipient, for 385 | any liability that these contractual assumptions directly impose on 
386 | those licensors and authors. 387 | 388 | All other non-permissive additional terms are considered "further 389 | restrictions" within the meaning of section 10. If the Program as you 390 | received it, or any part of it, contains a notice stating that it is 391 | governed by this License along with a term that is a further 392 | restriction, you may remove that term. If a license document contains 393 | a further restriction but permits relicensing or conveying under this 394 | License, you may add to a covered work material governed by the terms 395 | of that license document, provided that the further restriction does 396 | not survive such relicensing or conveying. 397 | 398 | If you add terms to a covered work in accord with this section, you 399 | must place, in the relevant source files, a statement of the 400 | additional terms that apply to those files, or a notice indicating 401 | where to find the applicable terms. 402 | 403 | Additional terms, permissive or non-permissive, may be stated in the 404 | form of a separately written license, or stated as exceptions; 405 | the above requirements apply either way. 406 | 407 | 8. Termination. 408 | 409 | You may not propagate or modify a covered work except as expressly 410 | provided under this License. Any attempt otherwise to propagate or 411 | modify it is void, and will automatically terminate your rights under 412 | this License (including any patent licenses granted under the third 413 | paragraph of section 11). 414 | 415 | However, if you cease all violation of this License, then your 416 | license from a particular copyright holder is reinstated (a) 417 | provisionally, unless and until the copyright holder explicitly and 418 | finally terminates your license, and (b) permanently, if the copyright 419 | holder fails to notify you of the violation by some reasonable means 420 | prior to 60 days after the cessation. 
421 | 422 | Moreover, your license from a particular copyright holder is 423 | reinstated permanently if the copyright holder notifies you of the 424 | violation by some reasonable means, this is the first time you have 425 | received notice of violation of this License (for any work) from that 426 | copyright holder, and you cure the violation prior to 30 days after 427 | your receipt of the notice. 428 | 429 | Termination of your rights under this section does not terminate the 430 | licenses of parties who have received copies or rights from you under 431 | this License. If your rights have been terminated and not permanently 432 | reinstated, you do not qualify to receive new licenses for the same 433 | material under section 10. 434 | 435 | 9. Acceptance Not Required for Having Copies. 436 | 437 | You are not required to accept this License in order to receive or 438 | run a copy of the Program. Ancillary propagation of a covered work 439 | occurring solely as a consequence of using peer-to-peer transmission 440 | to receive a copy likewise does not require acceptance. However, 441 | nothing other than this License grants you permission to propagate or 442 | modify any covered work. These actions infringe copyright if you do 443 | not accept this License. Therefore, by modifying or propagating a 444 | covered work, you indicate your acceptance of this License to do so. 445 | 446 | 10. Automatic Licensing of Downstream Recipients. 447 | 448 | Each time you convey a covered work, the recipient automatically 449 | receives a license from the original licensors, to run, modify and 450 | propagate that work, subject to this License. You are not responsible 451 | for enforcing compliance by third parties with this License. 452 | 453 | An "entity transaction" is a transaction transferring control of an 454 | organization, or substantially all assets of one, or subdividing an 455 | organization, or merging organizations. 
If propagation of a covered 456 | work results from an entity transaction, each party to that 457 | transaction who receives a copy of the work also receives whatever 458 | licenses to the work the party's predecessor in interest had or could 459 | give under the previous paragraph, plus a right to possession of the 460 | Corresponding Source of the work from the predecessor in interest, if 461 | the predecessor has it or can get it with reasonable efforts. 462 | 463 | You may not impose any further restrictions on the exercise of the 464 | rights granted or affirmed under this License. For example, you may 465 | not impose a license fee, royalty, or other charge for exercise of 466 | rights granted under this License, and you may not initiate litigation 467 | (including a cross-claim or counterclaim in a lawsuit) alleging that 468 | any patent claim is infringed by making, using, selling, offering for 469 | sale, or importing the Program or any portion of it. 470 | 471 | 11. Patents. 472 | 473 | A "contributor" is a copyright holder who authorizes use under this 474 | License of the Program or a work on which the Program is based. The 475 | work thus licensed is called the contributor's "contributor version". 476 | 477 | A contributor's "essential patent claims" are all patent claims 478 | owned or controlled by the contributor, whether already acquired or 479 | hereafter acquired, that would be infringed by some manner, permitted 480 | by this License, of making, using, or selling its contributor version, 481 | but do not include claims that would be infringed only as a 482 | consequence of further modification of the contributor version. For 483 | purposes of this definition, "control" includes the right to grant 484 | patent sublicenses in a manner consistent with the requirements of 485 | this License. 
486 | 487 | Each contributor grants you a non-exclusive, worldwide, royalty-free 488 | patent license under the contributor's essential patent claims, to 489 | make, use, sell, offer for sale, import and otherwise run, modify and 490 | propagate the contents of its contributor version. 491 | 492 | In the following three paragraphs, a "patent license" is any express 493 | agreement or commitment, however denominated, not to enforce a patent 494 | (such as an express permission to practice a patent or covenant not to 495 | sue for patent infringement). To "grant" such a patent license to a 496 | party means to make such an agreement or commitment not to enforce a 497 | patent against the party. 498 | 499 | If you convey a covered work, knowingly relying on a patent license, 500 | and the Corresponding Source of the work is not available for anyone 501 | to copy, free of charge and under the terms of this License, through a 502 | publicly available network server or other readily accessible means, 503 | then you must either (1) cause the Corresponding Source to be so 504 | available, or (2) arrange to deprive yourself of the benefit of the 505 | patent license for this particular work, or (3) arrange, in a manner 506 | consistent with the requirements of this License, to extend the patent 507 | license to downstream recipients. "Knowingly relying" means you have 508 | actual knowledge that, but for the patent license, your conveying the 509 | covered work in a country, or your recipient's use of the covered work 510 | in a country, would infringe one or more identifiable patents in that 511 | country that you have reason to believe are valid. 
512 | 513 | If, pursuant to or in connection with a single transaction or 514 | arrangement, you convey, or propagate by procuring conveyance of, a 515 | covered work, and grant a patent license to some of the parties 516 | receiving the covered work authorizing them to use, propagate, modify 517 | or convey a specific copy of the covered work, then the patent license 518 | you grant is automatically extended to all recipients of the covered 519 | work and works based on it. 520 | 521 | A patent license is "discriminatory" if it does not include within 522 | the scope of its coverage, prohibits the exercise of, or is 523 | conditioned on the non-exercise of one or more of the rights that are 524 | specifically granted under this License. You may not convey a covered 525 | work if you are a party to an arrangement with a third party that is 526 | in the business of distributing software, under which you make payment 527 | to the third party based on the extent of your activity of conveying 528 | the work, and under which the third party grants, to any of the 529 | parties who would receive the covered work from you, a discriminatory 530 | patent license (a) in connection with copies of the covered work 531 | conveyed by you (or copies made from those copies), or (b) primarily 532 | for and in connection with specific products or compilations that 533 | contain the covered work, unless you entered into that arrangement, 534 | or that patent license was granted, prior to 28 March 2007. 535 | 536 | Nothing in this License shall be construed as excluding or limiting 537 | any implied license or other defenses to infringement that may 538 | otherwise be available to you under applicable patent law. 539 | 540 | 12. No Surrender of Others' Freedom. 541 | 542 | If conditions are imposed on you (whether by court order, agreement or 543 | otherwise) that contradict the conditions of this License, they do not 544 | excuse you from the conditions of this License. 
If you cannot convey a 545 | covered work so as to satisfy simultaneously your obligations under this 546 | License and any other pertinent obligations, then as a consequence you may 547 | not convey it at all. For example, if you agree to terms that obligate you 548 | to collect a royalty for further conveying from those to whom you convey 549 | the Program, the only way you could satisfy both those terms and this 550 | License would be to refrain entirely from conveying the Program. 551 | 552 | 13. Use with the GNU Affero General Public License. 553 | 554 | Notwithstanding any other provision of this License, you have 555 | permission to link or combine any covered work with a work licensed 556 | under version 3 of the GNU Affero General Public License into a single 557 | combined work, and to convey the resulting work. The terms of this 558 | License will continue to apply to the part which is the covered work, 559 | but the special requirements of the GNU Affero General Public License, 560 | section 13, concerning interaction through a network will apply to the 561 | combination as such. 562 | 563 | 14. Revised Versions of this License. 564 | 565 | The Free Software Foundation may publish revised and/or new versions of 566 | the GNU General Public License from time to time. Such new versions will 567 | be similar in spirit to the present version, but may differ in detail to 568 | address new problems or concerns. 569 | 570 | Each version is given a distinguishing version number. If the 571 | Program specifies that a certain numbered version of the GNU General 572 | Public License "or any later version" applies to it, you have the 573 | option of following the terms and conditions either of that numbered 574 | version or of any later version published by the Free Software 575 | Foundation. If the Program does not specify a version number of the 576 | GNU General Public License, you may choose any version ever published 577 | by the Free Software Foundation. 
578 | 579 | If the Program specifies that a proxy can decide which future 580 | versions of the GNU General Public License can be used, that proxy's 581 | public statement of acceptance of a version permanently authorizes you 582 | to choose that version for the Program. 583 | 584 | Later license versions may give you additional or different 585 | permissions. However, no additional obligations are imposed on any 586 | author or copyright holder as a result of your choosing to follow a 587 | later version. 588 | 589 | 15. Disclaimer of Warranty. 590 | 591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 599 | 600 | 16. Limitation of Liability. 601 | 602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 610 | SUCH DAMAGES. 611 | 612 | 17. Interpretation of Sections 15 and 16. 
613 | 614 | If the disclaimer of warranty and limitation of liability provided 615 | above cannot be given local legal effect according to their terms, 616 | reviewing courts shall apply local law that most closely approximates 617 | an absolute waiver of all civil liability in connection with the 618 | Program, unless a warranty or assumption of liability accompanies a 619 | copy of the Program in return for a fee. 620 | 621 | END OF TERMS AND CONDITIONS 622 | 623 | How to Apply These Terms to Your New Programs 624 | 625 | If you develop a new program, and you want it to be of the greatest 626 | possible use to the public, the best way to achieve this is to make it 627 | free software which everyone can redistribute and change under these terms. 628 | 629 | To do so, attach the following notices to the program. It is safest 630 | to attach them to the start of each source file to most effectively 631 | state the exclusion of warranty; and each file should have at least 632 | the "copyright" line and a pointer to where the full notice is found. 633 | 634 | <one line to give the program's name and a brief idea of what it does.> 635 | Copyright (C) <year>  <name of author> 636 | 637 | This program is free software: you can redistribute it and/or modify 638 | it under the terms of the GNU General Public License as published by 639 | the Free Software Foundation, either version 3 of the License, or 640 | (at your option) any later version. 641 | 642 | This program is distributed in the hope that it will be useful, 643 | but WITHOUT ANY WARRANTY; without even the implied warranty of 644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 645 | GNU General Public License for more details. 646 | 647 | You should have received a copy of the GNU General Public License 648 | along with this program. If not, see <https://www.gnu.org/licenses/>. 649 | 650 | Also add information on how to contact you by electronic and paper mail. 
651 | 652 | If the program does terminal interaction, make it output a short 653 | notice like this when it starts in an interactive mode: 654 | 655 | <program>  Copyright (C) <year>  <name of author> 656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 657 | This is free software, and you are welcome to redistribute it 658 | under certain conditions; type `show c' for details. 659 | 660 | The hypothetical commands `show w' and `show c' should show the appropriate 661 | parts of the General Public License. Of course, your program's commands 662 | might be different; for a GUI interface, you would use an "about box". 663 | 664 | You should also get your employer (if you work as a programmer) or school, 665 | if any, to sign a "copyright disclaimer" for the program, if necessary. 666 | For more information on this, and how to apply and follow the GNU GPL, see 667 | <https://www.gnu.org/licenses/>. 668 | 669 | The GNU General Public License does not permit incorporating your program 670 | into proprietary programs. If your program is a subroutine library, you 671 | may consider it more useful to permit linking proprietary applications with 672 | the library. If this is what you want to do, use the GNU Lesser General 673 | Public License instead of this License. But first, please read 674 | <https://www.gnu.org/philosophy/why-not-lgpl.html>. 675 | --------------------------------------------------------------------------------