├── example ├── NJST258_1__CP006923.gbk.gz └── gbk2tbl_modifiers.txt ├── shell ├── unlinkFiles.sh ├── compile_ariba_reports.sh ├── gff2fasta.sh ├── shortenSPAdesContigNames.sh ├── cat_fasta.sh ├── renameFiles.sh ├── extractSingleChrFromVCF.sh ├── mkSymbolicLinks.sh ├── catContigStats.sh ├── rename_PE_readsets.sh ├── saveSPAdesOutputs.sh ├── blastShowRepeats.sh ├── extractSPAdesAssemblyStats.sh ├── catAssemblyStats.sh ├── catGzippedFASTQsPerDirectory.sh ├── linkFiles.sh ├── catGzippedFASTQsPerSample.sh ├── download_ena_pe_reads.sh └── download_reads_from_sra.sh ├── .gitignore ├── tabulateMUMmerCoordinates.py ├── extractSeqFromMultiFASTA.py ├── add_sample_name_FASTA.py ├── parse_ENA_sampleInfo_XML.py ├── rename_fasta_seqs.py ├── filename_generator.py ├── exclude_pseudo_seqs.py ├── seqlen.py ├── extractNuclRegionFromFASTA.py ├── gfa_stats.py ├── filterSPAdesContigs.py ├── gc.py ├── run_CutAdapt.py ├── mergeGenomicRegions.R ├── screen_genes_blast.py ├── linkPEreadsets.py ├── gbk2tsv.py ├── gbk2tbl.py ├── parse_biosample.py ├── README.md ├── extractSeqFromGBK.py ├── other_licence ├── Apache Licence-2.0.txt └── GPL-2.0.txt ├── downloadSeqFromNCBI.py └── LICENSE /example/NJST258_1__CP006923.gbk.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wanyuac/BINF_toolkit/HEAD/example/NJST258_1__CP006923.gbk.gz -------------------------------------------------------------------------------- /example/gbk2tbl_modifiers.txt: -------------------------------------------------------------------------------- 1 | [organism=Klebsiella pneumoniae] [strain=NJST258_1] [topology=circular] [moltype=DNA] [tech=wgs] [gcode=11] [country=USA] [isolation-source=urine] [collection-date=2010] -------------------------------------------------------------------------------- /shell/unlinkFiles.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Iterates the unlink command on 
multiple soft links. 3 | # Example: 4 | # unlinkFiles.sh reads/*.fastq.gz 5 | # unlinkFiles.sh $(cat link_list.txt) 6 | # Author: Yu Wan (14-15 April 2017) published at https://github.com/wanyuac/BINF_toolkit 7 | # License: Apache-2.0 8 | 9 | links=( $@ ) 10 | for i in ${links[@]}; do 11 | unlink $i 12 | done 13 | -------------------------------------------------------------------------------- /shell/compile_ariba_reports.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Run this script in conda environment 'ariba' 3 | # Yu Wan (1/7/2021) 4 | 5 | cd $1 6 | 7 | while read -r g 8 | do 9 | f="ariba_out/${g}/report.tsv" 10 | if [ -f "$f" ] 11 | then 12 | cp $f report/in.report.${g}.tsv 13 | else 14 | echo "Isolate $g did not have any report generated." 15 | fi 16 | done < "$2" 17 | 18 | ariba summary --no_tree --verbose summary report/in.report.*.tsv 19 | -------------------------------------------------------------------------------- /shell/gff2fasta.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script extract sequence regions from GFF3 files which contain complete assembled sequences. 3 | # In this kind of file, the sequences must be put at the end of each file. The sequence domain is 4 | # separated from the annotation domain by the delimiter "###FASTA". 5 | # Usage: bash gff2fasta.sh [input GFF file(s)] 6 | # Examples: 7 | # bash gff2fasta.sh *.gff 8 | # bash gff2fasta.sh strain1.gff strain2.gff ... 
9 | # Licence: GNU GPL 2.1 10 | # Author: Yu Wan (wanyuac@gmail.com) 11 | # Development history: 21/7/2016 12 | 13 | ext='fna' # the file extension 14 | 15 | for f in "$@"; do # loop through each argument 16 | base=`basename $f .gff` # remove the path as well as the file extension 17 | k=`grep -n '##FASTA' $f | cut -f1 -d ':'` 18 | tail -n +$((k + 1)) $f > ${base}.$ext # print lines starting with the kth 19 | done 20 | -------------------------------------------------------------------------------- /shell/shortenSPAdesContigNames.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (C) 2020 Yu Wan 3 | # Licensed under the GNU General Public Licence version 3 (GPLv3) . 4 | # Release: 28 Aug 2020 5 | 6 | display_usage(){ 7 | echo " 8 | Shorten contig names from SPAdes by substituting white spaces for '_length_' in sequence headers, 9 | so the latter part of sequence description will be ignored by Prokka and some other software. 10 | This is particularly useful for Prokka annotation because a long contig name consumes all space 11 | between the contig name and length in the output GenBank file, causing a problem to SniEff and so on. 12 | 13 | Command line: bash shortenSPAdesContigNames.sh [input FASTA file] > [new FASTA file] 14 | " 15 | } 16 | 17 | if [ -z $1 ]; then 18 | display_usage 19 | exit 20 | else 21 | sed 's/_length_/ /g' $1 22 | fi 23 | -------------------------------------------------------------------------------- /shell/cat_fasta.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script concatenates reference sequences of the same bacterial strain into a single multi-FASTA file. 3 | # The name of every FASTA file must follow the format: [strain name]__[accession number].fasta 4 | # For example, AH0650_Sm1__LFJS01000001.fasta. 5 | # Example command line: 6 | # bash concat_fasta.sh 'fasta/*__*.fasta' # Quotes are necessary! 
7 | # bash concat_fasta.sh 'fasta/strain__*.fasta' 8 | # bash concat_fasta.sh 'fasta/strain__*.faa' 9 | # Licence: GNU GPL 2.1 10 | # Author: Yu Wan (wanyuac@gmail.com) 11 | # Development history: 9 Aug 2016, 12 Sep 2016 12 | 13 | f=$1 14 | ext=${f##*.} # get the file name extension 15 | 16 | strains=$(ls -1 ${1} | xargs -I '{}' basename {} ".${ext}" | grep -oP '.+(?=__)' | sort -u) 17 | echo "$(echo $strains | tr " " "\n" | wc -l) strains are to be processed." 18 | 19 | path=$(dirname "$1") 20 | for s in ${strains}; do 21 | cat ${path}/${s}__*.${ext} > ${s}.${ext} 22 | done 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | *.lnk 59 | org.py 60 | -------------------------------------------------------------------------------- /shell/renameFiles.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (C) 2020 Yu Wan 3 | # Licensed under the GNU General Public Licence version 3 (GPLv3) . 4 | # Publication: 3/3/2020 5 | 6 | # Guidance ######################### 7 | display_usage() { 8 | echo " 9 | Usage: 10 | Rename or move files according to a two-column TSV file: [old filename]\t[new filename] 11 | Example command: renameFiles.sh names.tsv 12 | " 13 | } 14 | 15 | if [ -z $1 ]; then 16 | display_usage 17 | exit 18 | fi 19 | 20 | # Implementation ######################### 21 | while read line; do 22 | # Split the delimited string into an arrary of two elements. 23 | # Do not use IFS=$"\t" as it does not work correctly. 24 | # # https://unix.stackexchange.com/questions/410710/splitting-a-line-into-array-in-bash-with-tab-as-delimiter 25 | 26 | IFS=$'\t' read -r -a names <<< "$line" 27 | echo -e "Change or move: ${names[0]} --> ${names[1]}." 28 | mv ${names[0]} ${names[1]} 29 | done < "$1" # expect a file name as an input 30 | -------------------------------------------------------------------------------- /shell/extractSingleChrFromVCF.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Copyright (C) 2021 Yu Wan 4 | # Licensed under the GNU General Public Licence version 3 (GPLv3) . 
5 | # Publication: 28/4/2021; latest update: 28/4/2021 6 | 7 | display_useage() { 8 | echo " 9 | Extract variants in a specific chromosome from a VCF file. 10 | Command: extractSingleChrFromVCF.sh [input VCF] [output VCF] [target chromosome name] 11 | " 12 | } 13 | 14 | if [ -z $1 ]; then 15 | display_usage 16 | exit 17 | fi 18 | 19 | vcf_in=$1 20 | vcf_out=$2 21 | chr=$3 22 | outdir=$(dirname $vcf_out) 23 | tmpfile=$outdir/tmp.vcf 24 | 25 | n=$(grep -n "##contig= $vcf_out 27 | 28 | # The following two lines avoid printing a duplicated "##contig= $tmpfile 31 | 32 | grep '#CHROM' $tmpfile >> $vcf_out 33 | grep "$chr" $tmpfile >> $vcf_out 34 | 35 | rm -f $tmpfile 36 | -------------------------------------------------------------------------------- /tabulateMUMmerCoordinates.py: -------------------------------------------------------------------------------- 1 | """ 2 | Convert MUMmer's output into a CSV file. 3 | Yu Wan (20 Apr 2017) 4 | Example: python tabulateMUMerCoordinates.py input.coords > output.coords 5 | Reference: David Edwards' script filterCoords.py in the RedDog (https://github.com/katholt/RedDog) suite. 
6 | Licence: Apache-2.0 7 | Python version: 3.5.2 (but compatible to Python 2) 8 | """ 9 | 10 | import sys 11 | 12 | def main(): 13 | count = 0 14 | with open(sys.argv[1], "rU") as f: 15 | print("Start,End,Identity") 16 | for line in f: 17 | if count <= 5: # skip the first five lines, including the self-self comparison (100% identity) 18 | count += 1 19 | else: 20 | data = line.split("|") 21 | identity = float(data[3]) # drops all white spaces 22 | coords = data[0].split() # removes all white spaces as well 23 | start = coords[0] 24 | end = coords[1] 25 | print(",".join([start, end, str(identity)])) 26 | 27 | if __name__ == "__main__": 28 | main() -------------------------------------------------------------------------------- /shell/mkSymbolicLinks.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Makes symbolic links of any kind of files under another directory in accordance with a list of sample names as inputs from 3 | # either a single-column text file or stdin. 4 | # Format of the command line: 5 | # bash mkSymbolicLins.sh [suffix] [source directory] [output directory] sample_names.txt 6 | # Notice: 1. source and output directories must not be the same; 2. no forward slash ("/") should be attached to directory names. 7 | # Examples: 8 | # sh mkSymbolicLinks.sh '_snps.vcf' ~/data ~/links strain_names.txt 9 | # sh mkSymbolicLinks.sh '_snps.vcf' . ~/links strain_names.txt 10 | # cat strain_names.txt | sh mkVCFLinks.sh '_snps.vcf' ~/data ~/links 11 | # In all examples, symbolic links [strain name]__snps.vcf will be created under the directory ~/links. 12 | # Limitation: every pair of original file and its symobolic link shares the same filename suffix. Hence users must separate them 13 | # with different directories. 
14 | # Author: Yu Wan (20, 22 Apr 2017) 15 | # Licence: Apache-2.0 16 | 17 | while IFS= read -r id; do 18 | ln -s ${2}/${id}${1} ${3}/${id}${1} 19 | done < "${4:-/dev/stdin}" # takes $4 if defined otherwise takes the stdin 20 | -------------------------------------------------------------------------------- /shell/catContigStats.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script concatenates assembly statistics from our lab's previous assembly pipeline. 3 | # Copyright (C) 2017 Yu Wan 4 | # Licensed under the GNU General Public License (GPL) version 3 5 | # Creation: 31/10/2016; the latest version: 28/11/2017 6 | 7 | display_usage(){ 8 | echo " 9 | Concatenates *_contigStats.txt files that are generated using our lab's contigMetrics.py. 10 | Usage: bash catContigStats.sh ./assemblies/ 11 | Outputs: 12 | contigStats_files.txt 13 | contigStats_combined.csv 14 | " 15 | } 16 | 17 | HEADER="contigFile,numContigs,totalBases,N50,smallest,lowerQ,median,upperQ,largest" 18 | FILE_LIST="contigStats_files.txt" 19 | STATS="contigStats_combined.csv" 20 | 21 | # Check argument ########## 22 | if [ -z $1 ]; then 23 | echo "Error: a subject directory must be provided." 24 | display_usage 25 | exit 26 | fi 27 | 28 | # Find all contigStats.txt files ########## 29 | find $1 -name *_contigStats.txt -type f > $FILE_LIST 30 | echo "There are `cat ${FILE_LIST} | wc -l` genomes." 31 | 32 | # Print contig statistics into a CSV file ########## 33 | echo $HEADER > $STATS 34 | 35 | # Extract the second line of every file and appends it to the CSV file ========== 36 | files=`cat ${FILE_LIST}` 37 | n=0 38 | 39 | for f in ${files}; do 40 | r=`cat ${f} | wc -l` 41 | if [ "$r" -eq "2" ]; then 42 | tail -n 1 $f >> $STATS 43 | ((n++)) 44 | else 45 | echo "Warning: ${f} does not contain contig statistics." 46 | fi 47 | done 48 | 49 | echo "Success: ${n} lines of statistics have been transferred into ${STATS}." 
50 | -------------------------------------------------------------------------------- /shell/rename_PE_readsets.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Renaming Illumina paired-end readsets via symbolic links. 3 | # Copyright (C) 2023 Yu Wan 4 | # Licensed under the GNU General Public Licence version 3 (GPLv3) . 5 | # Publication: 31 July 2023; latest update: 31 July 2023 6 | 7 | display_usage() { 8 | echo "Rename paired-end readsets via symbolic links. 9 | Command line: 10 | rename_PE_readsets.sh [mapping file] [directory of original readsets] [directory for links] 11 | The mapping file is TSV-delimited and does not have any header. It consists of two columns: 12 | original name and new name, respectively. This script assumes filenames of readsets have 13 | suffices _1.fastq.gz and _2.fastq.gz" 14 | } 15 | 16 | if [ -z "$1" ] || [ $1 = "-h" ]; then 17 | display_usage 18 | exit 19 | fi 20 | 21 | dir_in="$2" 22 | dir_out="$3" 23 | 24 | if [ ! -d "$dir_in" ]; then 25 | echo "Error: input directory $dir_in was not found." >&2 26 | exit 27 | fi 28 | 29 | if [ ! -d "$dir_out" ]; then 30 | echo "Create output directory $dir_out" 31 | mkdir -p "$dir_out" 32 | fi 33 | 34 | while read -r line; do 35 | IFS=$'\t' read -r -a vals <<< "$line" 36 | i="${vals[0]}" # Original name 37 | j="${vals[1]}" # New name 38 | r1="$dir_in/${i}_1.fastq.gz" 39 | r2="$dir_in/${i}_2.fastq.gz" 40 | if [ -f "$r1" ] && [ -f "$r2" ]; then 41 | t1="$dir_out/${j}_1.fastq.gz" 42 | t2="$dir_out/${j}_2.fastq.gz" 43 | echo -e "$r1 -> $t1\t$r2 -> $t2" 44 | ln -s "$r1" "$t1" 45 | ln -s "$r2" "$t2" 46 | else 47 | echo "Error: $r1 or $r2 were not accessible. No links were created for sample ${i}." 
>&2 48 | fi 49 | done < "$1" 50 | -------------------------------------------------------------------------------- /shell/saveSPAdesOutputs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (C) 2021 Yu Wan 4 | # Licensed under the GNU General Public Licence version 3 (GPLv3) . 5 | # Release: 23 Mar 2021 6 | 7 | display_usage(){ 8 | echo " 9 | Assuming SPAdes have produced assembly files for each genome under a subdirectory named by the genome. 10 | For example, for genomes g1, g2, and g3, subdirectories g1/, g2/, and g3/ have been created under a 11 | parental directory assemblies/. 12 | 13 | Command line: sh saveSPAdesOutputs.sh [parental directory] 14 | Example command line: sh saveSPAdesOutputs.sh \$PWD 15 | 16 | Output: essential assembly files will be copied from subdirectories to the parental directory and be renamed 17 | by subdirectory names (genome names). Then users may delete subdirectories to save space. 18 | " 19 | } 20 | 21 | d=$1 # Parental (output) directory 22 | cd $d 23 | for dsub in `ls -1 -d */`; do 24 | g=`basename $dsub` # Remove the end '/' character and use the subdirectory name as the genome name 25 | 26 | # Assembly graphs 27 | cp $g/assembly_graph.fastg ./${g}.fastg 28 | cp $g/assembly_graph_after_simplification.gfa ./${g}__simplified.gfa 29 | cp $g/assembly_graph_with_scaffolds.gfa ./${g}__scaffolds.gfa # May be the same as assembly_graph_after_simplification.gfa. 30 | 31 | # Contigs 32 | cp $g/contigs.fasta ./${g}__contigs.fna # It has less nodes than does assembly_graph.fastg. 33 | cp $g/contigs.paths ./${g}__contigs.paths 34 | 35 | # Scaffolds 36 | cp $g/scaffolds.fasta ./${g}__scaffolds.fna # It has less contigs (scaffolded) than does contigs.fasta. 
37 | cp $g/scaffolds.paths ./${g}__scaffolds.paths 38 | 39 | # Supplementary information 40 | cp $g/spades.log ./${g}.log 41 | done 42 | -------------------------------------------------------------------------------- /shell/blastShowRepeats.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # We BLAST a nucleotide sequence against itself to identify repetitive regions. Of course, every region 3 | # matches to itself as well. This script is hence developed to remove such self-matches. It assumes 4 | # the input crunch file follows default columns in the '-fmt 6' output format of BLAST: 5 | # qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore 6 | # 7 | # Example commands (First, use the 'chmod' command to make this script executable): 8 | # blastn -query sample.fasta -db ref -task megablast -evalue 0.01 -perc_identity 90 -max_target_seqs 10 -outfmt 6 | blastShowRepeats.sh sample_vs_ref.crunch 9 | # or: blastShowRepeats.sh sample_vs_ref.crunch > sample_vs_ref_repeats.crunch 10 | # 11 | # Copyright (C) 2020 Yu Wan 12 | # Licensed under the GNU General Public Licence version 3 (GPLv3) . 13 | # Publication: 28 Apr 2020 14 | 15 | # Parse tab-delimited lines into an array 16 | while IFS=$'\t' read -r -a line 17 | do 18 | qstart="${line[6]}" 19 | qend="${line[7]}" 20 | sstart="${line[8]}" 21 | send="${line[9]}" 22 | 23 | # The following statement is the same as [[ "$qstart" -ne "$sstart" && "$qend" -ne "$send") ]]. 24 | # This if statement ignores hits where qstart = sstart AND qend = send. 25 | if [ "$qstart" -ne "$sstart" ] && [ "$qend" -ne "$send" ] 26 | then 27 | # We use a sub-shell to avoid overriding the current IFS: ( IFS=$'\t'; echo "${line[*]}" ). 28 | # https://superuser.com/questions/461981/how-do-i-convert-a-bash-array-variable-to-a-string-delimited-with-newlines/462400 29 | # It is also necessary here even though the IFS has been set to a tab character for read data. 
30 | # Otherwise, the output of echo is space-delimited. 31 | ( IFS=$'\t'; echo "${line[*]}" ) 32 | fi 33 | done < "${1:-/dev/stdin}" 34 | -------------------------------------------------------------------------------- /shell/extractSPAdesAssemblyStats.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (C) 2021 Yu Wan 4 | # Licensed under the GNU General Public Licence version 3 (GPLv3) . 5 | # Publication: 16 Apr 2021; latest update: 11 Sep 2021 6 | 7 | display_usage() { 8 | echo " 9 | Extract contig/scaffold names, lengths, and depths from a SPAdes output FASTA file and save them in a tab-delimited text file. 10 | Command line: 11 | extractSPAdesAssemblyStats.sh [input.fasta] > [isolate1.tsv] # Single-assembly mode: print a header line 12 | extractSPAdesAssemblyStats.sh [input.fasta] [isolate name] >> [fasta summary.tsv] # Multi-assembly mode: do not print a header line and append 13 | the isolate name in each line for the convenience of concatenating files. This mode is used in a loop that runs this script iteratively. 14 | For the multi-assembly mode, users may run \`echo -e \"Isolate\tNode\tLength\tDepth\" > asm_stats.tsv\` before the loop. 15 | " 16 | } 17 | 18 | if [ -z $1 ]; then 19 | display_usage 20 | exit 21 | fi 22 | 23 | if [ -z "$2" ]; then # Single-assembly mode 24 | echo -e 'Node\tLength\tDepth' # Print the header line. Note that the echo command automatically appends a newline character to the output string. 
25 | grep '>' $1 | sed -e 's/>//g' | sed -e 's/_length_/\t/g' | sed -e 's/_cov_/\t/g' 26 | else # Multi-assembly mode (namely, to loop through multiple FASTA files, where each iteration calls this script) 27 | IFS=$'\n' # https://stackoverflow.com/questions/8768420/how-to-convert-command-output-to-an-array-line-by-line-in-bash 28 | lines=( $(grep '>' $1 | sed -e 's/>//g' | sed -e 's/_length_/\t/g' | sed -e 's/_cov_/\t/g') ) 29 | for i in ${lines[@]}; do 30 | echo -e "${2}\t${i}" # Add the assembly name to the head of the result line; use 'echo', not 'printf' (which does not print a newline character at the end of the output) 31 | done 32 | fi 33 | -------------------------------------------------------------------------------- /shell/catAssemblyStats.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Concatenates files of assembly statistics that are generated using our lab's script contigMetrics.py. 4 | # Copyright (C) 2017 Yu Wan 5 | # Licensed under the GNU General Public License (GPL) version 3 6 | # First edition: 31/10/2016; the latest edition: 28/11/2017 7 | # Previous name: catContigStats.sh 8 | 9 | display_usage(){ 10 | echo " 11 | Concatenates files of assembly statistics that are generated using our lab's script contigMetrics.py. 
12 | Usage: bash catAssemblyStats.sh ./assemblies/ 13 | Outputs: 14 | assemblyStats_files.txt 15 | assemblyStats.tsv 16 | " 17 | } 18 | 19 | #Constants for the previous version of contigMetrics.py 20 | #HEADER="contigFile,numContigs,totalBases,N50,smallest,lowerQ,median,upperQ,largest" # previous version 21 | #STATS="contigStats_combined.csv" 22 | #FILE_LIST="contigStats_files.txt" 23 | # 24 | #Previous outputs: 25 | # contigStats_files.txt 26 | # contigStats_combined.csv 27 | 28 | HEADER="Assembly\tContig_number\tN50\tQ1\tQ2\tQ3\tMean\tSmallest\tLargest\tLength" 29 | STATS="assemblyStats.tsv" 30 | FILE_LIST="assemblyStats_files.txt" 31 | NAME_PATTERN="assembly_stats.tsv" # previous name: *_contigStats.txt 32 | 33 | # Check argument ########## 34 | if [ -z $1 ]; then 35 | echo "Error: a subject directory must be provided." 36 | display_usage 37 | exit 38 | fi 39 | 40 | # Find all contigStats.txt files ########## 41 | find $1 -name $NAME_PATTERN -type f > $FILE_LIST 42 | echo "There are `cat ${FILE_LIST} | wc -l` genomes." 43 | 44 | # Print contig statistics into a CSV file ########## 45 | echo -e $HEADER > $STATS # -e: convert each "\t" to a tab character 46 | 47 | # Extract the second line of every file and appends it to the CSV file ========== 48 | files=`cat ${FILE_LIST}` 49 | n=0 50 | 51 | for f in ${files}; do 52 | r=`cat ${f} | wc -l` 53 | if [ "$r" -eq "2" ]; then 54 | tail -n 1 $f >> $STATS 55 | ((n++)) 56 | else 57 | echo "Warning: ${f} does not contain contig statistics." 58 | fi 59 | done 60 | 61 | echo "Success: ${n} lines of statistics have been transferred into ${STATS}." 62 | -------------------------------------------------------------------------------- /shell/catGzippedFASTQsPerDirectory.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (C) 2021 Yu Wan 4 | # Licensed under the GNU General Public Licence version 3 (GPLv3) . 
5 | # First edition: 25 Aug 2021; the latest update: 5 Nov 2021 6 | 7 | # User guide #################### 8 | display_usage() { 9 | echo " 10 | Concatenates *.fastq.gz in each subdirectory into a single fastq.gz file. Useful for concatenating 11 | Guppy's demultiplexed (with options '--barcode_kits' and '--trim_barcodes' enabled) output sequence files. 12 | Usage: 13 | bash catGzippedFASTQsPerDirectory.sh [input parental directory] [output directory] [inputs.tsv] 14 | For example: ./catGzippedFASTQsPerDirectory.sh ~/fastq/pass ~/fastq/concat barcodes.tsv > cat_fastqs.log 15 | Input TSV file inputs.tsv consists of two columns: [subdirectory name]\t[output filename without '.fastq.gz' or so]. 16 | " 17 | } 18 | 19 | if [ -z $1 ]; then 20 | display_usage 21 | exit 22 | fi 23 | 24 | # Main utility #################### 25 | indir="$1" 26 | outdir="$2" 27 | 28 | if [ ! -d "$indir" ] 29 | then 30 | echo "Error: input parental directory $indir does not exist." >&2 # Print to standard error 31 | exit 32 | fi 33 | 34 | if [ ! -d "$outdir" ] 35 | then 36 | echo "Making output directory $outdir" 37 | mkdir $outdir 38 | fi 39 | 40 | n=0 # Count the number of subdirectories visited 41 | while read line # Please ensure every line in the input TSV file is ended with a newline character. 42 | do 43 | if [ ! -z "$line" ] 44 | then 45 | IFS=$'\t' read -r -a fields <<< "$line" # Parse the line into two fields by '\t'. 46 | input_subdir="$indir/${fields[0]}" 47 | if [ ! -d "$input_subdir" ] 48 | then 49 | echo "Skip inaccessible input directory $input_subdir" >&2 50 | else 51 | output="$outdir/${fields[1]}.fastq.gz" 52 | k=$(ls -1 $input_subdir/*.fastq.gz | wc -l) 53 | echo "Concatenate $k .fastq.gz files from $input_subdir into $output" 54 | zcat $input_subdir/*.fastq.gz | gzip > $output # Slower than 'cat *.fastq.gz' but generates a smaller file. 55 | (( n++ )) 56 | fi 57 | fi 58 | done < "$3" 59 | 60 | echo "FASTQ files of $n samples have been successfully concatenated." 
-------------------------------------------------------------------------------- /extractSeqFromMultiFASTA.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | This script extracts sequences from a multi-FASTA file using sequence IDs, which are defined in sequence headers. It 5 | basically filters contigs from a multi-FASTA file based on sequence IDs. The script ignores sequence annotation that 6 | is separated from the sequence ID by a white space. 7 | 8 | Input: a multi-FASTA file from stdin. It can be a gene-feature file (.ffn) downloaded from the NCBI nucleotide database 9 | or an assembly file comprised of several contig sequences. 10 | 11 | Argument: a comma-delimited string of target sequence IDs. 12 | 13 | Usage: 14 | cat input.fna | python extractSeqFromMultiFASTA.py "gene1,gene2,...,geneN" > output.fna 15 | cat input.fna | python extractSeqFromMultiFASTA.py "contig1,contig2,...,contigM" > output.fna 16 | Or, 17 | targets=$(cat seqIDs.txt) # seqIDs.txt contains a single comma-delimited line. 18 | cat input.fna | python extractSeqFromMultiFASTA.py $targets > output.fna 19 | For SPAdes assemblies: 20 | targets=$(cat seqIDs.txt) 21 | cat input__scaffolds.fna | sed 's/_length_/ /g' | python extractSeqFromMultiFASTA.py $targets > input__scaffolds_subset.fna 22 | 23 | Author: Yu Wan (wanyuac@126.com, https://github.com/wanyuac) 24 | Python version 2 and 3 compatible 25 | License: GNU GPL 2.1 26 | First edition: 5 July 2016; the latest edition: 13 Sep 2021 27 | Previous name: extract_fasta_loci.py 28 | """ 29 | 30 | from __future__ import print_function 31 | import sys 32 | from Bio import SeqIO 33 | from Bio.Seq import Seq 34 | from Bio.SeqRecord import SeqRecord 35 | 36 | def main(): 37 | # read the list of locus tags 38 | try: 39 | """ 40 | First, drop the newline character in any combinations of \r and \n. 41 | Otherwise, the last ID does not match to any sequence ID. 
42 | """ 43 | loci = sys.argv[1].rstrip("\r\n") 44 | loci = loci.split(",") # Parse the string for target sequence IDs 45 | except ValueError: 46 | print("Error: missing argument. A comma-delimited string of sequence IDs is required.") 47 | 48 | for seq in SeqIO.parse(sys.stdin, "fasta"): # read the input FASTA file from stdin 49 | if seq.id in loci: 50 | print(seq.format("fasta")) # write the current sequence to the stdout 51 | 52 | if __name__ == "__main__": 53 | main() 54 | -------------------------------------------------------------------------------- /shell/linkFiles.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Creating symbolic links according to a table of two columns: original file path and link path, separated by tab characters. 3 | # Copyright (C) 2017-2021 Yu Wan 4 | # Licensed under the GNU General Public Licence version 3 (GPLv3) . 5 | # First edition: 18 Oct 2017, the latest update: 6 Sep 2021 6 | # Update note: changed the input file format from CSV to TSV for convenience of users. 7 | 8 | # Display help information ############### 9 | display_usage(){ 10 | echo " 11 | Usage: 12 | chmod a+x linkFiles.sh # before the first run 13 | ./linkFiles.sh [input TSV file] 14 | ./linkFiles.sh [input TSV file] 1 # Add the second argument '1' to renew existing links. 15 | The TSV file should not contain a header line. The first column consists of original file paths, and 16 | the second column consists of link paths: 17 | [old name & path]\t[new name & path]\n 18 | An example of the TSV file: 19 | ~/data/genome1_1.fasta\t/scratch/input/genome1_unimelb.fna 20 | ~/data/genome1_2.fasta\t/scratch/input/genome1_zju.fna 21 | 22 | Notice a user must ensure the directory is accessible for storing links. 
23 | " 24 | } 25 | 26 | if [ -z $1 ] 27 | then 28 | display_usage 29 | exit 30 | fi 31 | 32 | # Set the override mode ############### 33 | if [ -z $2 ] 34 | then 35 | override=false 36 | elif [ "$2" -eq "1" ] 37 | then 38 | override=true 39 | else 40 | override=false 41 | fi 42 | 43 | # Otherwise, make symbolic links following the input file ############### 44 | while read line; do 45 | if [ ! -z "$line" ] # Sometimes empty lines are present in the input TSV file, causing an error of ln if keep them untreated. 46 | then 47 | IFS=$'\t' read -ra paths <<< "$line" # split the delimited string into an arrary of two elements 48 | target="${paths[1]}" 49 | origin="${paths[0]}" 50 | if [ ! -L "$target" ] 51 | then 52 | ln -s $origin $target 53 | elif [ "$override" = true ] 54 | then 55 | echo "Warning: redirecting existing link $target -> $(readlink ${target}) to ${origin}." 56 | unlink $target 57 | ln -s $origin $target 58 | else 59 | echo "Warning: skipped existing link $target" 60 | fi 61 | fi 62 | done < "$1" # expect a file name as an input -------------------------------------------------------------------------------- /add_sample_name_FASTA.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ''' 4 | This script adds a sample name at the beginning of each sequence in a FASTA file. For example, the header ">g1 description" becomes 5 | ">sample1__g1 description" after running this script. 

Author: Yu Wan (wanyuac@gmail.com, github.com/wanyuac)

Example: python add_sample_name_FASTA.py -i filename.txt (or filename.fna) -o output_dir -n

License: GNU GPL 2.0

First edition: Fri 27 Nov 2015
Last edition: Sat 28 Nov 2015
'''

from argparse import ArgumentParser
from Bio import SeqIO, SeqFeature
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

def parse_args():
    # Define and parse command-line options for this script.
    parser = ArgumentParser(description="Add a sample name to every header of sequences")
    parser.add_argument("-i", type = str, required = True, help = "A textual list of input files or the file name of a single FASTA file")
    parser.add_argument("-o", type = str, required = False, default = ".", help = "Output directory")
    parser.add_argument("-n", required = False, action="store_true", help = "Whether to extract the sample name from the file name rather than the path?")
    return parser.parse_args()

def main():
    # Prefix every sequence ID with a per-file sample name and write one output
    # FASTA per input file into the output directory.
    args = parse_args()

    # read file names from a list
    # NOTE(review): the ".txt" test matches anywhere in the argument, so a FASTA
    # named e.g. "x.txt.fna" would wrongly be treated as a list — assumes list
    # files are the only arguments containing ".txt".
    if ".txt" in args.i:
        with open(args.i, "rU") as f:  # "rU" is Python-2-era universal-newline mode
            fasta_files = f.read().splitlines()
    else:
        fasta_files = [args.i] # If there is just a single FASTA file to be processed.

    # read every FASTA file, change all sequence IDs and write into a new file
    for f in fasta_files:
        new_fasta = []
        fields = f.split("/")
        if args.n:
            sample = (fields[-1].split("__"))[0] # get the sample name from the first part of the file name
        else:
            # assumes the path has at least three '/'-separated components,
            # e.g. .../<sample>/<subdir>/<file> — TODO confirm against callers
            sample = fields[-3] # split the path and get the second last field as the sample name
        # Takes the token after the FIRST dot, so "a.b.fna" yields "b" rather than "fna".
        extension = (fields[-1].split("."))[1] # get the filename extension: faa, fna or ffn
        records = list(SeqIO.parse(open(f, "rU"), "fasta")) # records of a single GenBank file

        # process each sequence
        for s in records:
            s.id = "__".join([sample, s.id])
            s.description = " ".join(s.description.split(" ")[1 : ]) # remove the first field, which is identical to the sequence ID
            new_fasta.append(SeqRecord(s.seq, id = s.id, name = "", description = s.description))

        # Output path: <outdir>/<sample>.<extension>
        SeqIO.write(new_fasta, "%s/%s.%s" % (args.o, sample, extension), "fasta")

if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
/parse_ENA_sampleInfo_XML.py:
--------------------------------------------------------------------------------

"""
This script parses an ENA metadata file in XML format and prints a subset of information.

Usage: python parse_ENA_sampleInfo_XML.py ERP000909.xml > samples.txt

Input: an XML file exported for a list of ERS accession numbers from ENA using the REST URLs API. For example, one can download an XML file
for sample ERS086023 using http://www.ebi.ac.uk/ena/data/view/ERS086023&display=xml.

Output: a tab-delimited text file containing information retrieved from the XML file.
    study_accession, sample_accession, secondary_sample_accession, experiment_accession, run_accession, Isolate_ID, Host, Place_of_isolation, Year_of_isolation

Author of this version: Yu Wan (wanyuac@gmail.com, https://github.com/wanyuac)
Edition history: 6-7, 11 August 2015

Licence: GNU GPL 2.1
"""

import sys
import xml.etree.ElementTree as xmlTree

def get_domains(sample):
    # Extract a fixed set of fields from one <SAMPLE> element.
    # Returns a ten-element list in the column order printed by main().
    # NOTE(review): the SAMPLE_LINKS branch addresses children by hard-coded
    # indices (sample[4][...]), which assumes a fixed element layout in the
    # ENA export — verify against a current XML download before reuse.
    study = BioSample = ERS = experiment = run = isolate = strain = host = place = year = "NA" # default value of all fields
    for domain in sample:
        if domain.tag == "IDENTIFIERS":
            BioSample, ERS = sample[0][1].text, sample[0][0].text # text
        if domain.tag == "SAMPLE_LINKS":
            study = sample[4][0][0][1].text # visit nested elements with indices
            experiment = sample[4][1][0][1].text
            run = sample[4][2][0][1].text
        if domain.tag == "SAMPLE_ATTRIBUTES": # This domain may be variable in terms of attributes
            for attribute in domain:
                # Each attribute is a (TAG, VALUE) pair of child elements.
                if attribute[0].text == "collection_date":
                    year = attribute[1].text
                elif attribute[0].text == "isolate":
                    isolate = attribute[1].text
                elif attribute[0].text == "specific_host":
                    host = attribute[1].text
                elif attribute[0].text == "country":
                    place = attribute[1].text
                elif attribute[0].text == "strain":
                    strain = attribute[1].text
    return [study, BioSample, ERS, experiment, run, isolate, strain, host, place, year]

def main():
    # Parse the XML named on the command line and print one TSV row per sample.
    # Python 2 syntax (print statements).
    file = sys.argv[1]
    xml = xmlTree.parse(file).getroot() # parse an XML into a tree of elements

    # print the header line
    print "\t".join(["study_accession", "sample_accession", "secondary_sample_accession", "experiment_accession", "run_accession", "Isolate_ID", "Strain", "Host", "Place_of_isolation", "Year_of_isolation"])
    for sample in xml:
        print "\t".join(get_domains(sample))
    return

if __name__ == '__main__':
    main()

-------------------------------------------------------------------------------- /rename_fasta_seqs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Rename sequences in a FASTA file. It filters out sequences that are not included in the target list, 5 | when specified. 6 | 7 | Author: Yu Wan (wanyuac@gmail.com, https://github.com/wanyuac) 8 | Python version 2 and 3 compatible 9 | License: GNU GPL 2.1 10 | First edition: 11 Nov 2018, the latest revision: 14 Nov 2021. 11 | Created and finished in Nara, Japan. 12 | """ 13 | 14 | from __future__ import print_function 15 | import sys 16 | from Bio import SeqIO 17 | from Bio.Seq import Seq 18 | from Bio.SeqRecord import SeqRecord 19 | from argparse import ArgumentParser 20 | 21 | 22 | def parse_arguments(): 23 | parser = ArgumentParser(description="Read options and arguments") 24 | parser.add_argument("--fasta", "-f", dest = "fasta", type = str, required = True, help = "A FASTA file whose sequences will be renamed.") 25 | parser.add_argument("--mapping", "-m", dest = "mapping", type = str, required = True, help = "A tab-delimited file mapping original sequence IDs to new IDs.") 26 | parser.add_argument("--out", "-o", dest = "out", type = str, required = False, default = "./renamed.fasta", help = "Name and path for output FASTA file.") 27 | parser.add_argument("--keep_all", "-k", dest = "keep_all", action = "store_true", required = False, help = "Set to keep all sequences when some IDs are not found in the rename table.") 28 | parser.add_argument("--simple", "-s", dest = "simple", action = "store_true", required = False, help = "Drop original sequence names to make simple headers.") 29 | 30 | return parser.parse_args() 31 | 32 | 33 | def main(): 34 | args = parse_arguments() 35 | mapping = import_mapping_table(args.mapping) 36 | drop_prev_name = args.simple 37 | to_rename = list(mapping.keys()) 38 | in_fasta = open(args.fasta, "r") 39 | out = 
open(args.out, "w") 40 | 41 | for seq in SeqIO.parse(in_fasta, "fasta"): # read the input FASTA file 42 | if seq.id in to_rename: 43 | if drop_prev_name: 44 | seq.description = "" 45 | seq.id = mapping[seq.id] 46 | print(seq.format("fasta"), file = out) 47 | elif args.keep_all: 48 | print(seq.format("fasta"), file = out) 49 | 50 | in_fasta.close() 51 | out.close() 52 | 53 | return 54 | 55 | 56 | def import_mapping_table(rename): 57 | # Read the tab-delimited table for renaming sequences. 58 | with open(rename, "r") as f: 59 | lines = f.read().splitlines() 60 | 61 | r = {} 62 | for l in lines: 63 | old_id, new_id = l.split("\t") 64 | r[old_id] = new_id 65 | 66 | return(r) 67 | 68 | 69 | if __name__ == "__main__": 70 | main() 71 | -------------------------------------------------------------------------------- /filename_generator.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This script generates a list of file names based on a list of strings. It is useful if you want to generate a list of file names for read sets from a list of bacterial strain names. 
3 | 4 | Usage: python filename_generator.py -i -o -p -s -f -l -pe 5 | 6 | Input: a list of filenames 7 | Example: (inlist.txt) 8 | sample1__genes__results.txt 9 | sample2__genes__results.txt 10 | Command: python filename_generator.py -i inlist.txt -o outlist.txt -p /reads/ -s .fastq.gz -f 0 -l 7 -pe 11 | Output: a list of new file names generated on the basis of strings in inlist.txt 12 | Example: (outlist.txt) 13 | /reads/sample1_1.fastq.gz 14 | /reads/sample1_2.fastq.gz 15 | /reads/sample2_1.fastq.gz 16 | /reads/sample2_2.fastq.gz 17 | 18 | Author: Yu Wan (wanyuac@gmail.com, GitHub: https://github.com/wanyuac) 19 | First edition: 6 July 2015 20 | Last edition: 5 Nov 2015 21 | 22 | License: GNU GPL 2.1 23 | ''' 24 | 25 | from argparse import ArgumentParser 26 | 27 | def parse_args(): 28 | # Read arguments from the command line 29 | parser = ArgumentParser(description='Regenerate filenames.') 30 | # Inputs 31 | parser.add_argument('-i', type = str, required = True, help = 'File name of the input list') 32 | parser.add_argument('-o', type = str, required = True, help = 'File name of the output list') 33 | parser.add_argument('-p', type = str, required = False, help = 'The prefix added to the base for new filenames') 34 | parser.add_argument('-s', type = str, required = False, default = '.fastq.gz', help = 'The suffix added to the base for new filenames') 35 | parser.add_argument('-f', type = int, required = True, help = 'From which character of the base') 36 | parser.add_argument('-l', type = int, required = True, help = 'How many characters of the base should be used; -1: use the whole base') 37 | parser.add_argument('-pe', required = False, action='store_true', help = 'Whether read sets are paired-end') 38 | return parser.parse_args() 39 | 40 | def main(): 41 | args = parse_args() 42 | with open(args.i, 'rU') as in_f: 43 | bases = in_f.read().splitlines() 44 | out_f = open(args.o, 'w') 45 | 46 | if args.l > -1: # only use part of the base for constructing a new 
file name 47 | for i in range(0, len(bases)): 48 | bases[i] = bases[i][args.f : args.l] 49 | 50 | for item in bases: 51 | if args.pe: # if input files are related to paired-ended libraries 52 | for i in range(1, 3): 53 | filename = '{prefix}{base}_{index}{suffix}\n'.format(prefix = args.p, base = item, index = i, suffix = args.s) 54 | out_f.write(filename) 55 | else: 56 | filename = args.p + item[args.f : args.l] + args.s + '\n' 57 | out_f.write(filename) 58 | out_f.close() 59 | print 'All filenames were generated from bases.' 60 | 61 | if __name__ == '__main__': 62 | main() -------------------------------------------------------------------------------- /shell/catGzippedFASTQsPerSample.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (C) 2021 Yu Wan 4 | # Licensed under the GNU General Public Licence version 3 (GPLv3) . 5 | # First edition: 8 Sep 2021; the latest update: 9 Sep 2021 6 | # This script is derived from catGzippedFASTQsPerDirectory.sh. 7 | 8 | # User guide #################### 9 | display_usage() { 10 | echo " 11 | Concatenate *.fastq.gz of each sample into a single fastq.gz file. 12 | Usage: 13 | bash catGzippedFASTQsPerSample.sh [input parental directory] [output directory] [a list of input sample names] 14 | For example: ./catGzippedFASTQsPerSample.sh ~/fastq/pass ~/fastq/concat isolates.txt &> cat_fastqs.log 15 | There is one sample name per line in the input sample-name list. 16 | " 17 | } 18 | 19 | if [ -z $1 ]; then 20 | display_usage 21 | exit 22 | fi 23 | 24 | # Main utility #################### 25 | 26 | # 1. Set up directories =============== 27 | indir="$1" 28 | outdir="$2" 29 | 30 | if [ ! -d "$indir" ] 31 | then 32 | echo "Error: input parental directory $indir does not exist." >&2 # Print to standard error 33 | exit 34 | fi 35 | 36 | if [ ! -d "$outdir" ] 37 | then 38 | echo "Making output directory $outdir" 39 | mkdir $outdir 40 | fi 41 | 42 | # 2. 
Concatenate read files =============== 43 | n=0 # The counter of samples processed 44 | while read i # Please ensure every line in the input TSV file is ended with a newline character. 45 | do 46 | if [ ! -z "$i" ] # Skip empty lines 47 | then 48 | # Users may customise the following two commands to match their filenames. 49 | ra="$indir/*_${i}A-[1,2].bacterial-fastq-only.ngsservice.processed.R" # There should be only a single match. 50 | rb="$indir/*_${i}B-[1,2].bacterial-fastq-only.ngsservice.processed.R" # The same as above. 51 | ra1=`ls -1 ${ra}1.fastq.gz` 52 | ra2=`ls -1 ${ra}2.fastq.gz` 53 | rb1=`ls -1 ${rb}1.fastq.gz` 54 | rb2=`ls -1 ${rb}2.fastq.gz` 55 | 56 | # Concatenate the read files of the current sample 57 | if [ -f "$ra1" ] && [ -f "$ra2" ] && [ -f "$rb1" ] && [ -f "$rb2" ] 58 | then 59 | echo "Process read files of isolate $i" 60 | echo " Concatenating $ra1 and $rb1" 61 | zcat $ra1 $rb1 | gzip > $outdir/${i}_1.fastq.gz # Slower than 'cat *.fastq.gz' but generates a smaller file. 62 | echo -e " Concatenating $ra2 and ${rb2}\n" 63 | zcat $ra2 $rb2 | gzip > $outdir/${i}_2.fastq.gz 64 | (( n++ )) 65 | else 66 | echo -e "Skip file concatenation for sample $i due to absence of one or more read files.\n" >&2 67 | fi 68 | fi 69 | done < "$3" # Read sample names one-by-one from the input list 70 | 71 | echo "FASTQ files of $n samples have been successfully concatenated." -------------------------------------------------------------------------------- /exclude_pseudo_seqs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Exclude nucleotide/protein sequences of pseudo genes from a multi-Fasta file. 4 | 5 | Copyright (C) 2025 Yu Wan 6 | First release: 2 Jan 2025; latest update: 3 Jan 2025 7 | Licensed under the GNU General Public Licence version 3 (GPLv3) . 
"""

from Bio import SeqIO
from argparse import ArgumentParser
import re

def parse_arguments():
    # Define and parse command-line options.
    parser = ArgumentParser(description = "Filter pseudo sequences from a multi-FASTA file.")
    parser.add_argument('--input', '-i', dest = 'input', required = True, help = "Path to the input FASTA file.")
    parser.add_argument('--output', '-o', dest = 'output', required = False, default = 'filtered_output.fasta', help = "Path to the output FASTA file.")
    parser.add_argument('--pseudo', '-p', dest = 'pseudo', required = False, default = 'pseudo.fasta', help = "Path to the output FASTA file of excluded sequences")
    parser.add_argument('--discard_annot', '-d', dest = 'discard_annot', required= False, action = 'store_true', help = "A flag to discard sequence annotations and only keep names")
    return parser.parse_args()

def filter_pseudo_sequences(input_fasta, output_fasta, pseudo_fasta, discard_annot):
    # Route every record into output_fasta or pseudo_fasta, depending on the
    # "[pseudo=true]" tag in its header; optionally simplify the header.
    with open(input_fasta, 'r') as infile,\
        open(output_fasta, "w") as outfile,\
        open(pseudo_fasta, 'w') as pseudofile:
        for record in SeqIO.parse(infile, 'fasta'): # Iterate through sequences in the input FASTA file
            if "[pseudo=true]" in record.description: # Check if "[pseudo=true]" is in the header
                output_handle = pseudofile
            else:
                output_handle = outfile
            if discard_annot:
                record.id = rename_seq(record.description, record.id)
                # Setting description equal to the ID makes Biopython emit a
                # header containing the ID only.
                record.description = record.id
            SeqIO.write(record, output_handle, 'fasta')

def rename_seq(seq_description, seq_id):
    # Build "<locus_tag>__<protein_id>" when both tags are present in the
    # description; otherwise return the original sequence ID unchanged.
    match_locus_tag = re.search(r'\[locus_tag=([^\]]+)\]', seq_description) # Extract locus_tag and protein_id from the description using regular expressions
    match_protein_id = re.search(r'\[protein_id=([^\]]+)\]', seq_description)
    locus_tag = match_locus_tag.group(1) if match_locus_tag else None
    protein_id = match_protein_id.group(1) if match_protein_id else None
    if locus_tag and protein_id:
        new_id = f"{locus_tag}__{protein_id}"
    else:
        new_id = seq_id # No change to the sequence ID
    return new_id

def main():
    # Entry point: parse options, run the filter, and report output locations.
    args = parse_arguments()
    filter_pseudo_sequences(args.input, args.output, args.pseudo, args.discard_annot)
    print(f"Filtered sequences have been written to {args.output} and {args.pseudo}.")

if __name__ == '__main__':
    main()

--------------------------------------------------------------------------------
/seqlen.py:
--------------------------------------------------------------------------------

#!/usr/bin/env python

"""
Calculate sequence lengths in a multiFASTA file.
This script was inspired by Lesley Sitter's code published on biostars (www.biostars.org/p/148815/).

Command: python seqlen.py -i [input FASTA file] (-a) (-n) > [output TSV file]
Examples:
    python seqlen.py -i input.fna -a > seq_lengths.tsv # With sequence annotation in the sequence description
    python seqlen.py -i input.fna -n > seq_lengths.tsv # Only keep the sequence ID in the sequence description and ignore 'N' and '-' characters

Any character: a flag to keep sequence annotation in the output.

Copyright (C) 2021 Yu Wan
Licensed under the GNU General Public Licence version 3 (GPLv3) .
16 | Release: 2021; latest update: 26 Sep 2022 17 | """ 18 | 19 | import sys 20 | import re 21 | from argparse import ArgumentParser 22 | 23 | def parse_argument(): 24 | parser = ArgumentParser(description = "Calculating lengths of sequences in a FASTA file") 25 | parser.add_argument('-i', '--input', dest = 'i', type = str, required = True, help = "An input FASTA file") 26 | parser.add_argument('-a', '--annot', dest = 'a', action = 'store_true', help = "Keep sequence annotations in addition to sequence names") 27 | parser.add_argument('-n', '--nucl', dest = 'n', action = 'store_true', help = "Ignoring \'-\' and \'N\' in nucleotide sequences.") 28 | return parser.parse_args() 29 | 30 | def main(): 31 | args = parse_argument() 32 | 33 | with open(args.i, "r") as f: 34 | input_fasta = f.read().splitlines() # Newline characters are dropped. 35 | print("\t".join(["Name", "Length"])) # The header line 36 | seq = "" 37 | allow_write = False # A flag indicating that the first sequence name has been completely loaded. 38 | ignore_annot = not args.a 39 | 40 | for line in input_fasta: 41 | if line.startswith(">"): # A new sequence is encountered 42 | if allow_write: 43 | if args.n: 44 | seq = re.sub('[N-]', '', seq.upper()) # Stripping multiple characters from a string. Ref: stackoverflow.com/questions/3900054/python-strip-multiple-characters. 
45 | print("\t".join([seqid, str(len(seq))])) # Write the name and length of the previous sequence 46 | seq = "" 47 | if ignore_annot: 48 | seqid = line.split(" ")[0] # Get the sequence ID and ignore the sequence annotation 49 | else: 50 | seqid = line 51 | seqid = seqid[1 : ] # Drop the ">" character 52 | allow_write = True 53 | else: 54 | seq += line # concatenate lines of the current sequence 55 | 56 | print("\t".join([seqid, str(len(seq))])) # Write the length of the last sequence 57 | return 58 | 59 | if __name__ == '__main__': 60 | main() 61 | -------------------------------------------------------------------------------- /extractNuclRegionFromFASTA.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script extracts a region of nucleotides by its genomic coordinates from a fasta file 3 | 4 | Arguments 5 | -i: the path of the input file 6 | -n: the name of your selected contig 7 | -f: feature name specified by the user 8 | -s: the first nucleotide to be selected 9 | -e: the last nucelotide to be selected 10 | -o: the filename of the output 11 | 12 | Requirements 13 | Only one region should be selected 14 | The start and end positions should not spill out 15 | 16 | Author: Yu Wan (wanyuac@126.com) 17 | Date: 1 June and 17 July 2015 18 | GitHub: https://github.com/wanyuac/BINF_toolkit 19 | Licence: GNU GENERAL PUBLIC LICENSE Version 2 20 | Previous name: extract_nc_region.py, extract_nucl_region.py 21 | """ 22 | 23 | from argparse import ArgumentParser 24 | from Bio import SeqIO 25 | from Bio.SeqRecord import SeqRecord 26 | 27 | def parse_args(): 28 | # This function extracts arguments from the command line 29 | parser = ArgumentParser(description="Read arguments: input filename, start position, end position, and out filename") 30 | parser.add_argument("-i", type=str, required=True, help="Input path") # append an argument to variable "parser" 31 | parser.add_argument("-c", type=str, default="", help="Name of 
your selected contig; the first contig will be chosen if -c is not set.")
    parser.add_argument("-f", type=str, default="feature", help="The feature name")
    parser.add_argument("-s", type=int, required=True, help="Start position")
    parser.add_argument("-e", type=int, required=True, help="End position")
    parser.add_argument("-o", type=str, default="selected_region.fasta", help="File name of the output")
    return parser.parse_args()

def write_seq(contig, feature, start, end, output):
    # read and write FASTA files
    # Returns True when the region [start, end] (1-based, inclusive) lies within
    # the contig and has been written to 'output'; False otherwise.
    seqlen = end - start + 1 # the length of selected region
    contiglen = len(contig.seq)
    if start > contiglen or end > contiglen: # the genetic coordinates are out bounded
        flag = False
    else:
        seq = contig.seq[start - 1: end] # gets the selected sequence of this contig (1-based -> 0-based slice)
        descr = feature + "|" + str(start) + ".." + str(end) + "|" + str(seqlen) + " bp\n" # gets the header of this contig
        new_rec = SeqRecord(seq=seq, id=contig.id, name=feature, description=descr) # create a new SeqRecord instance. Note that the contig.name will not be written in a FASTA file (only in GenBank files).
        f = open(output, "w")
        SeqIO.write(new_rec, f, "fasta") # saves the selection
        f.close()
        flag = True
    return flag

def main():
    # Locate the requested contig (or default to the first) and extract the region.
    args = parse_args() # read arguments from the command line
    f = open(args.i, "rU") # supports universal newlines
    contigs = list(SeqIO.parse(f, "fasta"))
    found = False

    """
    To-do: what happens when start > end?
    """
    if args.c == "":
        found = write_seq(contig=contigs[0], feature=args.f, start=args.s, end=args.e, output=args.o) # read the first contig if -n is not set
    else:
        for contig in contigs:
            if contig.id == args.c: # if this is the selected contig
                found = write_seq(contig=contig, feature=args.f, start=args.s, end=args.e, output=args.o)
                break
    f.close()
    if found:
        print "The target sequence was extracted."
    else:
        print "No sequence was found."
    return

# The main program
if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
/gfa_stats.py:
--------------------------------------------------------------------------------

#!/usr/bin/env python
"""
This script generates summary statistics of nodes for a list of input GFA files
that are produced by the assembler SPAdes. It prints a tab-delimited file to the
standard output. See gfa-spec.github.io/GFA-spec/GFA1.html for specifications of the
GFA format.

Example command line:
    python gfa_stats.py *.gfa > gfa_stats.tsv
    python gfa_stats.py *.gfa | sed 's/__scaffolds.gfa//g' > gfa_stats.tsv
    python gfa_stats.py *.gfa 1> gfa_stats.tsv 2> gfa_stats.err

This script does not consider the overlap length (e.g., 77M), so every node length
reported by this script includes the overlap length. For singleton nodes (namely,
nodes that do not connect to any other nodes), the overlap length = 0.

Copyright (C) 2021 Yu Wan
Licensed under the GNU General Public Licence version 3 (GPLv3) .
Creation: 15 July 2021; the latest update: 16 July 2021
"""

import os
import sys
import glob
from collections import namedtuple

def main():
    """
    The way each OS deals with the wildcard differs.
    For example, Win10 passes the string '*.gfa' directly to the script
    as sys.argv[1], whereas Linux replaces this wildcard express with all files globed.
    """
    if len(sys.argv) > 2:
        gfas = sys.argv[1 : ]
    else:
        gfas = glob.glob(sys.argv[1]) # sys.argv[1] = "*.gfa"
    print("Summarise nodes in %i GFA files" % len(gfas), file = sys.stderr)

    # Print summary statistics
    print("\t".join(["Assembly", "Node", "Length", "Depth", "Kmers", "Singleton"]), file = sys.stdout) # The header line
    for g in gfas: # Filenames are used for filling the first column.
        if os.path.exists(g):
            summarise_gfa(g)
        else:
            print("Error: skip the inaccessible file " + g, file = sys.stderr)
    return


def summarise_gfa(gfa):
    """ Produces summary statistics for a GFA file """
    nodes = dict()          # node name -> Node(length, depth, kmers)
    linked_nodes = set()    # names of nodes that appear in at least one L line
    Node = namedtuple("Node", ["length", "depth", "kmers"])

    # Extracts and transforms the S (segment) and L (link) fields of each GFA file
    g = open(gfa, "r")
    line = g.readline().strip()
    while line:
        if line.startswith("S"): # A segment line
            _, node_name, seq, depth, kmers = line.split("\t")
            nodes[node_name] = Node(length = str(len(seq)), depth = depth[5 : ], kmers = kmers[5 : ]) # Drop "DP:f:" and "KC:i:"
        elif line.startswith("L"): # A link line, which always comes after the "S" lines
            _, f, _, t, _, _ = line.split("\t")
            if f != t:
                linked_nodes = linked_nodes.union({f, t})
            else:
                linked_nodes.add(f) # a self-loop still marks the node as linked
        else:
            # NOTE(review): parsing stops at the first line that is neither S nor
            # L — this assumes SPAdes GFA files start with S lines and end with P
            # lines (no leading H header); confirm for other GFA producers.
            break # The "P" lines make up the last section in the GFA file.
        line = g.readline().strip()
    g.close()

    # Mark non-singleton nodes
    for node_name, node_stats in nodes.items():
        is_singleton = "0" if node_name in linked_nodes else "1" # "0": no; "1": yes.
        print("\t".join([gfa, node_name, node_stats.length, node_stats.depth, node_stats.kmers, is_singleton]), file= sys.stdout)
    return


if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
/filterSPAdesContigs.py:
--------------------------------------------------------------------------------

#!/usr/bin/env python
"""
Filters contigs/scaffolds in SPAdes output FASTA files for a minimum length (bp) and a range of
read depths (min <= d <= max).

Outputs: (1) a filtered FASTA file to stdout, (2) a summary of the filtering process to stderr.
Note that sequence headers in the output FASTA file differ from the original format for the convenience of
subsequent analyses:
    Sequence headers in the input file: NODE_[n]_length_[L]_cov_[C]
    Sequence headers in the output file: NODE_[N] len=[L],cov=[C]
Columns in the output from stderr:
    Input file name, number of contigs passed the filters, number of contigs failed the filters, names of contigs failed the filters

Example command:
    python filterSPAdesContigs.py --input input.fna --min_len 200 --min_d 1 --max_d 100 1>filtered.fna 2>filter.log

Dependencies: Biopython, Python v3

Copyright (C) 2022 Yu Wan
Licensed under the GNU General Public Licence version 3 (GPLv3) .
Creation: 23 Jan 2022; the latest update: 23 Jan 2022.
22 | """ 23 | from argparse import ArgumentParser 24 | import os 25 | import sys 26 | from Bio import SeqIO 27 | from collections import namedtuple 28 | 29 | def parse_arguments(): 30 | parser = ArgumentParser(description = "Read options and arguments") 31 | parser.add_argument('--input', '-i', dest = 'input', type = str, required = True, help = "Input FASTA file from SPAdes") 32 | parser.add_argument('--min_len', '-l', dest = 'min_len', type = int, required = False, default = 1, help = "Minimum contig length [default: 1 bp (no filter)]") 33 | parser.add_argument('--min_d', '-d0', dest = 'min_d', type = float, required = False, default = 0, help = "Minimum read depth per contig [default: 0 (no filter)]") 34 | parser.add_argument('--max_d', '-d1', dest = 'max_d', type = float, required = False, default = 0, help = "Maximum read depth per contig [default: 0 (no filter)]") 35 | return parser.parse_args() 36 | 37 | def parse_seq_header(h): 38 | """ 39 | Parse sequence headers in SPAdes's output FASTA files 40 | Format of the headers: NODE_[n]_length_[L]_cov_[C]. 
41 | """ 42 | Contig = namedtuple('Contig', ['name', 'len', 'cov']) 43 | fields = h.split('_') 44 | return Contig(name = '_'.join(fields[0 : 2]), len = int(fields[3]), cov = float(fields[5])) 45 | 46 | def main(): 47 | args = parse_arguments() 48 | min_len = args.min_len 49 | min_d = args.min_d 50 | max_d = args.max_d 51 | filter_len = min_len > 1 52 | filter_min_d = min_d > 0 53 | filter_max_d = max_d > 0 and min_d < max_d 54 | fasta = os.path.basename(args.input) 55 | if not os.path.exists(args.input): 56 | print(f"Error: input file {fasta} does not exist.", file = sys.stderr) 57 | sys.exit(1) 58 | n_pass = 0 59 | n_fail = 0 60 | names_fail = [] 61 | for contig in SeqIO.parse(args.input, 'fasta'): 62 | c = parse_seq_header(contig.id) 63 | keep = True 64 | if filter_len: 65 | keep = keep and c.len >= min_len 66 | if filter_min_d: 67 | keep = keep and c.cov >= min_d 68 | if filter_max_d: 69 | keep = keep and c.cov <= max_d 70 | if keep: 71 | contig.id = c.name 72 | contig.description = f'len={c.len},cov={c.cov}' 73 | SeqIO.write(contig, sys.stdout, 'fasta') 74 | n_pass += 1 75 | else: 76 | names_fail.append(contig.id) 77 | n_fail += 1 78 | if n_fail > 0: 79 | ns = ','.join(names_fail) 80 | else: 81 | ns = '' 82 | print(f'{fasta}\t{n_pass}\t{n_fail}\t{ns}', file = sys.stderr) 83 | 84 | if __name__ == '__main__': 85 | main() -------------------------------------------------------------------------------- /shell/download_ena_pe_reads.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # A wrapper for downloading paired-end read sets from the ENA database using ena-file-downloader (github.com/enasequence/ena-ftp-downloader) 3 | # Copyright (C) 2021-2023 Yu Wan 4 | # Licensed under the GNU General Public Licence version 3 (GPLv3) . 
# Publication: 29 Mar 2021; latest update: 3 Aug 2023

# Guidance #########################
display_usage() {
    echo "
Download read files from the ENA database using ena-file-downloader. Please ensure java is accessible in your working environment.
Command:
    download_ena_pe_reads.sh -d=[path to ena-file-downloader.jar] -t=[a TSV file with two columns: isolate names, read accessions] -o=[Output directory]
Example commands:
    download_ena_pe_reads.sh -d=\"\$HOME/bin/ena-file-downloader.jar\" -t=\"readsets.tsv\" -o=\"\$PWD\" 1> download_ENA_reads.log 2> download_ENA_reads.err
"
}

if [ -z "$1" ] || [ "$1" == "-h" ]; then
    display_usage
    exit 0
fi

# Functions ####################
# Create directory $1 if it does not exist yet.
check_dir() {
    if [ ! -d "$1" ]; then
        echo "Create directory $1"
        mkdir -p "$1"
    fi
}

# Download one paired-end readset and rename the two files after the isolate.
download_reads() {
    p="$1" # The downloader program
    i="$2" # Isolate name
    a="$3" # ENA accession
    tmp_dir="reads_fastq/$a" # Output directory
    if [ -d "$tmp_dir" ]; then
        echo "Warning: existing temporary directory $tmp_dir is deleted." >&2
        rm -rf "$tmp_dir"
    fi
    java -jar "$p" --accessions="$a" --format=READS_FASTQ --location=$PWD --protocol=FTP --asperaLocation=null # A directory 'reads_fastq' and a subdirectory "reads_fastq/$j" are created by this command.
    r1="$tmp_dir/${a}_1.fastq.gz"
    r2="$tmp_dir/${a}_2.fastq.gz"
    if [ -f "$r1" ] && [ -f "$r2" ]; then
        mv "$r1" "${i}_1.fastq.gz"
        mv "$r2" "${i}_2.fastq.gz"
        echo "Successfully downloaded paired-end readset of isolate $i (accession: $a)."
    else
        echo "Error: paired-end readset of isolate $i (accession: $a) could not be downloaded." >&2
        echo "Files in directory ${tmp_dir}:" >&2
        ls -1 "${tmp_dir}" >&2
    fi
    rmdir "$tmp_dir"
    sleep 1 # be gentle with the ENA servers between requests
}

# Main #########################
# Read arguments
for i in "$@"; do
    case "$i" in
        -t=*)
            accessions="${i#*=}"
            ;;
        -o=*)
            outdir="${i#*=}"
            ;;
        -d=*)
            downloader="${i#*=}"
            ;;
        *)
            ;;
    esac
done

# Check whether the downloader is accessible
# Fix: quote "$downloader" — unquoted, an unset value made this test always true
# and a path with spaces broke it.
if [ ! -f "$downloader" ]; then
    echo "Error: ena-file-downloader.jar is not accessible at location $downloader" >&2
    exit 1
fi

# Check accession file
if [ ! -f "$accessions" ]; then
    echo "Error: the TSV file of accession numbers is not found." >&2
    exit 1
fi

# Set up the output directory
if [ ! -z "$outdir" ]; then
    check_dir "$outdir"
else
    echo "Error: $outdir was not found." >&2
    exit 1
fi

# Download reads
cd "$outdir" || exit 1 # Fix: abort instead of downloading into the wrong directory when cd fails

while read line; do # Read through the input TSV file line-by-line
    if [ ! -z "$line" ]; then
        IFS=$'\t' read -r -a fields <<< "$line"
        download_reads "$downloader" "${fields[0]}" "${fields[1]}" # Downloader, isolate name, ENA accession
    fi
done < "$accessions" # Expect a file name as an input
# Fix: remove the downloader's top-level directory ONCE, after the loop. It was
# previously inside the loop and therefore ran on every input line, emitting
# "No such file or directory" errors on blank lines and before any download.
rmdir reads_fastq # A directory created by the downloader; directory 'logs' (also created by the downloader) is left in the output directory.
5 | # GitHub: https://github.com/wanyuac/BINF_toolkit 6 | # Input: a fasta file which contains multiple sequences from the standard input 7 | # Output: for each sequence, print: 1) header 2) total sequence length 3) percentage of G+C 4) entropy of the sequence 8 | # Command line: python gc.py < filename.fasta 9 | # Treatment of the extended alphabet: 10 | # 1) consider all of 15 characters 11 | # 2) construct a weighted-count table using dictionary 12 | # 3) for each character in the table, take the probability of being A, G, C or T as effective counts 13 | # 4) counts for A, G, C and T is computed by adding up the vectors for every character read from the sequence. 14 | # Licence: GNU GENERAL PUBLIC LICENSE 2.0 15 | 16 | import sys 17 | import math 18 | 19 | alphabet = { # weighted-count table 20 | # A G C T 21 | 'A': [1, 0, 0, 0], 22 | 'G': [0, 1, 0, 0], 23 | 'C': [0, 0, 1, 0], 24 | 'T': [0, 0, 0, 1], 25 | 'S': [0, 0.5, 0.5, 0], 26 | 'W': [0.5, 0, 0, 0.5], 27 | 'R': [0.5, 0.5, 0, 0], 28 | 'Y': [0, 0, 0.5, 0.5], 29 | 'M': [0.5, 0, 0.5, 0], 30 | 'K': [0, 0.5, 0, 0.5], 31 | 'V': [0.33, 0.33, 0.33, 0], 32 | 'H': [0.33, 0, 0.33, 0.33], 33 | 'D': [0.33, 0.33, 0, 0.33], 34 | 'B': [0, 0.33, 0.33, 0.33], 35 | 'N': [0.25, 0.25, 0.25, 0.25] 36 | } 37 | 38 | def read_fasta (fasta): 39 | header = [] # starts from 0 40 | s = 'start' 41 | seq = [] 42 | for line in fasta: 43 | line = line.rstrip('\n') # remove '\n' at the end 44 | if line.startswith('>'): # find a new sequence 45 | header.append(line) # Do not use header = ... here because .append() returns None 46 | seq.append(s.upper()) # Append the last row of last sequence, note that the first element is ''. 
47 | s = '' # reset s 48 | else: 49 | s = s + line.upper() 50 | seq.append(s) # append the last string to list seq after the loop 51 | seq = seq[1:] # remove the first element 52 | list = [header, seq] 53 | return list 54 | 55 | def base_count (seq): 56 | # A G C T 57 | num = [0, 0, 0, 0] # numbers of A, G, C, T 58 | L = len(seq) 59 | for i in range(0, L): 60 | num = [sum(j) for j in zip(num, alphabet[seq[i]])] # addition of vectors: element by element 61 | return num 62 | 63 | def GC_content (num, seq_len): 64 | GC = float(num[1] + num[2]) # sum of the numbers of G and C in the list num. 65 | return GC / seq_len 66 | 67 | def entropy (num, seq_len): 68 | # P(A) P(G) P(C) P(T) 69 | p = [0, 0, 0, 0] # initiate a list 70 | H = 0 # entropy 71 | y = 0 72 | for i in range(0, 4): 73 | p[i] = float(num[i]) / seq_len # estimates the probability by frequency 74 | if p[i] == 0: # cannot take log0 75 | y = 0 # lim(x^x) = 0 when x -> 0 76 | else: 77 | y = p[i] * math.log(p[i], 2) 78 | H = H - y 79 | return H 80 | 81 | #/////////////// Main program ///////////////////// 82 | # read from the standard input 83 | content = sys.stdin.readlines() # including '\n' 84 | fasta = read_fasta(content) # fasta[0]: headers, fasta[1]: sequences 85 | header = fasta[0] 86 | seq = fasta[1] 87 | n = range(0, len(header)) # number of sequences 88 | for i in n: 89 | s = seq[i] 90 | L = len(s) 91 | num = base_count(s) 92 | print header[i] # output 1: headers 93 | print L # output 2: the length of the sequence 94 | print '%04.2f'%(GC_content(num, L) * 100) # output 3: the G+C content of this sequence 95 | print '%02.1f'%entropy(num, L) # output 4: the entropy -------------------------------------------------------------------------------- /run_CutAdapt.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This script runs CutAdapt for a list of paired-end readsets. 
3 | 4 | Author: Yu Wan (wanyuac@gmail.com) 5 | Edition history: 15 Dec 2015, 1-2 Jan 2016 6 | Licence: GNU GPL 2.0 7 | ''' 8 | 9 | import os, re 10 | from argparse import ArgumentParser 11 | 12 | MEMORY = "2048" # 2 GB for each job 13 | WALL_TIME = "1-0:0:0" 14 | 15 | def parse_args(): 16 | parser = ArgumentParser(description= "Run CutAdapt to remove adapter sequences from reads.") 17 | parser.add_argument("--reads", type = str, required = True, help = "A list of paired-end readsets") 18 | parser.add_argument("--f_adapter", type = str, required = True, help = "Adapter sequences of forward reads") 19 | parser.add_argument("--r_adapter", type = str, required = True, help = "Adapter sequences of reverse reads") 20 | parser.add_argument("--pattern", type = str, required = False, default = "\d\d\d\d_\d#\d*", help = "A regular expression for pulling out sample names") 21 | parser.add_argument("--side", type = str, required = False, default = "3'", help = "3'-end adapters or 5'-end adapters?") 22 | parser.add_argument("--len", type = str, required = False, default = "108", help = "Minimun read length") 23 | parser.add_argument("--overlap", type = str, required = False, default = "33", help = "Minimun overlap length between an adapter and a read") 24 | parser.add_argument("--discrep", type = str, required = False, default = "0.03", help = "The discrepancy rate between the reference adapter sequences and subjects") 25 | parser.add_argument("--outdir", type = str, required = True, default = ".", help = "The directory for outputs") 26 | return parser.parse_args() 27 | 28 | def submit_jobs(readsets, adapt_f, adapt_r, side, outdir, min_read_len, discrep, overlap): 29 | for sample, reads in readsets.iteritems(): 30 | cmd = '#!/bin/bash' 31 | cmd += '\n#SBATCH -p main' 32 | cmd += '\n#SBATCH --job-name=CutAdapt' 33 | cmd += '\n#SBATCH --ntasks=1' 34 | cmd += '\n#SBATCH --mem-per-cpu=' + MEMORY 35 | cmd += '\n#SBATCH --time=' + WALL_TIME 36 | cmd += '\ncd ' + outdir + '\n' 37 | if 
side == "3'": 38 | cutadapt_cmd = 'cutadapt -a file:' + adapt_f + ' -A file:' + adapt_r + ' --minimum-length ' + min_read_len + ' -e ' + discrep + \ 39 | ' --overlap ' + overlap + ' -o ' + sample + '_1.fastq.gz' + ' -p ' + sample + '_2.fastq.gz' + ' ' + reads[0] + ' ' + reads[1] 40 | else: 41 | cutadapt_cmd = 'cutadapt -g file:' + adapt_f + ' -G file:' + adapt_r + ' --minimum-length ' + min_read_len + ' -e ' + discrep + \ 42 | ' --overlap ' + overlap + ' -o ' + sample + '_1.fastq.gz' + ' -p ' + sample + '_2.fastq.gz' + ' ' + reads[0] + ' ' + reads[1] 43 | cmd += cutadapt_cmd + ' > ' + sample + '.log' 44 | #print cmd 45 | print cutadapt_cmd 46 | os.system("echo '" + cmd + "' | sbatch") 47 | 48 | def load_reads(f, pattern): 49 | readsets = {} 50 | samples = [] 51 | with open(f, "rU") as inputs: 52 | lines = inputs.read().splitlines() 53 | 54 | # initialises the dictionary 55 | for line in lines: 56 | samples.append(re.findall(pattern, line)[0]) 57 | samples = sorted(list(set(samples))) # remove redundancy and sort the list 58 | for sample in samples: 59 | readsets[sample] = ["", ""] 60 | 61 | # matches paired-end read sets to samples 62 | for line in lines: 63 | sample = re.findall(pattern, line)[0] 64 | orentation = int(re.findall("_\d\.", line)[0][1]) # e.g. "_1." 
=> 1 65 | readsets[sample][orentation - 1] = line 66 | 67 | return(readsets) 68 | 69 | def main(): 70 | args = parse_args() 71 | readsets = load_reads(args.reads, args.pattern) # generates a dictionary {sample:[read1, read2],...} 72 | submit_jobs(readsets, args.f_adapter, args.r_adapter, args.side, args.outdir, args.len, args.discrep, args.overlap) # submit a job for each pair of readsets 73 | 74 | if __name__ == '__main__': 75 | main() 76 | -------------------------------------------------------------------------------- /mergeGenomicRegions.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Merge a list of genomic regions and find out complementary regions afterwards. 3 | # Commandline: Rscript mergeGenomicRegions.R [input file] [genome length] [output prefix] 4 | # The input file is a CSV file comprised of two columns ("from" and "to") without a header. 5 | # This script assumes that there are always >= 2 regions in any coordinate tables. 6 | # A typical input is the coordinate file produced by the script filterCoords.py of RedDog. 7 | # Example: 8 | # Rscript mergeGenomicRegions.R coord.txt 5248520 9 | # Outputs: 10 | # (1) [input filename]__merged.csv, (2) [input filename]__comple.csv 11 | # Saved under the current working directory. 12 | # 13 | # Copyright 2017 Yu Wan 14 | # Licensed under the Apache License, Version 2.0 15 | # Edition: 30 Apr 2017 16 | 17 | mergeRegions <- function(x) { 18 | y <- x[1, ] # take the first row of x to start with 19 | j <- 1 # row pointer of y 20 | ub1 <- y$to[1] # upper bound of the current region 21 | for (i in 2 : nrow(x)) { # must guarantee there are >= 2 rows in the table 22 | z <- x[i, ] 23 | lb2 <- z$from[1] # lower bound of the new region 24 | ub2 <- z$to[1] 25 | # There are only two behavious: either merge two regions or adding a separate region. 26 | if (lb2 <= (ub1 + 1)) { # two regions overlap or adjacent: merge them into a single one. 
lb1 <= lb2 because the data frame is sorted in an ascending order. 27 | if (ub2 > ub1) { # extend the previous region 28 | y$to[j] <- ub2 29 | ub1 <- ub2 30 | } # else, do nothing as the second range is a subset of the first one 31 | } else { # push a new and non-overlapping region into the stack of regions 32 | y <- rbind(y, z) 33 | j <- j + 1 # move the point to the new row 34 | ub1 <- ub2 35 | } 36 | } 37 | 38 | return(y) 39 | } 40 | 41 | findComplementaryRegions <- function(x, L) { # L: genome size 42 | n <- nrow(x) # number of predefined regions 43 | r <- x[1, ] 44 | lb <- r$from[1] 45 | ub <- r$to[1] 46 | 47 | # initialise z 48 | if (lb > 1) { # if the first region is not at the start of the genome 49 | z <- data.frame(from = 1, to = lb - 1) # lb - 1 may equal s 50 | } else { 51 | z <- data.frame(from = integer(0), to = integer(0)) 52 | } 53 | s <- ub + 1 54 | 55 | for (i in 2 : n) { # must guarantee there are >= 2 rows in the table 56 | r <- x[i, ] 57 | lb <- r$from[1] 58 | ub <- r$to[1] 59 | # Notice the function mergeRegions guarantees that lb > s and lb - s >= 1. 60 | # So the width of any gaps >= 1. 61 | z <- rbind(z, data.frame(from = s, to = lb - 1)) # Notice lb - 1 may equal s 62 | s <- ub + 1 63 | } 64 | 65 | # Are there any bases left beyond the last predefined region? 
66 | if (s <= L) { 67 | z <- rbind(z, data.frame(from = s, to = L)) 68 | } 69 | 70 | return(z) 71 | } 72 | 73 | # Read arguments 74 | args <- commandArgs(trailingOnly = TRUE) 75 | input <- args[1] 76 | genome.len <- args[2] 77 | prefix <- ifelse(length(args) >= 3, args[3], "coords") 78 | 79 | if (file.exists(input)) { 80 | x <- read.csv(input, header = FALSE) 81 | } else { 82 | stop(paste("The input file", input, "is not found.", sep = " ")) 83 | } 84 | 85 | names(x) <- c("from", "to") 86 | 87 | # Sort beginnings of regions so that their heads do not go backwards 88 | x <- x[order(x$from, decreasing = FALSE), ] 89 | 90 | # Merge regions 91 | y <- mergeRegions(x) 92 | 93 | # Find out complementary regions 94 | z <- findComplementaryRegions(y, genome.len) 95 | 96 | # write results 97 | write.table(y, file = paste0(prefix, "__merged.csv"), col.names = FALSE, row.names = FALSE, quote = FALSE, sep = ",") # Merged regions 98 | write.table(z, file = paste0(prefix, "__comple.csv"), col.names = FALSE, row.names = FALSE, quote = FALSE, sep = ",") # Complementary regions 99 | 100 | # sumamrise regions 101 | y$base <- y$to - y$from + 1 102 | z$base <- z$to - z$from + 1 103 | 104 | print(paste(nrow(x), "regions have been merged into", nrow(y), "regions of", sum(y$base), "bases.", sep = " ")) 105 | print(paste("There are", nrow(z), "regions of", sum(z$base), "bases outside of the merged regions.", sep = " ")) 106 | -------------------------------------------------------------------------------- /screen_genes_blast.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script screens genes against a reference database using megaBLAST for every input FASTA file. 3 | Specifically, it takes as input a list of FASTA files and searches every DNA sequence against the reference database. Obviously, 4 | it performs a targeted analysis for input sets of DNA sequences. 
5 | For example, you may want to find out all resistance genes in every bacterial genome, for which you may create FASTA files of 6 | coding sequences for every genome and use this script to profile this kind of genes. 7 | 8 | Number of options: 6 (2 compulsory and 4 optional) 9 | Usage: 10 | python screen_genes_blast.py --in *.fna --db [reference database] --strains [a comma-delimited string of strain names] 11 | --genomes [a comma-delimited string of genome names] --opt [options and arguments for BLAST] 12 | --outfmt [output format code] > [output file name] 13 | 14 | Prerequisite: A BLAST nucleotide database should be made before using this script. 15 | makeblastdb -in your.fasta -dbtype nucl -out db_name -logfile your.log 16 | 17 | Options "--strains" and "--genomes" are optional. 18 | 19 | A spreadsheet can be created beforehand to ensure the strain name and the genome name to match each FASTA file: 20 | strain genome fasta_file 21 | AH0650_Sm1 chr chr.fna 22 | AH0650_Sm1 plasmid plasmid.fna 23 | 24 | Author: Yu Wan (wanyuac@gmail.com, https://github.com/wanyuac) 25 | Development history: 3 July 2016 26 | Python version: 2.7.10 27 | License: GNU GPL 2.1 28 | """ 29 | 30 | from argparse import ArgumentParser 31 | import sys, os, subprocess 32 | 33 | def parse_arguments(): 34 | # read arguments of options 35 | parser = ArgumentParser(description="Fix problems in SRST2's ARG-Annot database") 36 | parser.add_argument("--in", "-i", dest = "input", nargs = "+", type = str, required = True, default = "", help = "A list of input FASTA files") 37 | parser.add_argument("--db", "-d", dest = "db", type = str, required = True, default = "", help="A reference nucleotide database for BLAST") 38 | parser.add_argument("--strains", "-s", dest = "strains", type = str, required = False, default = "", help = "(optional) Comma-delimited names of bacterial strains") 39 | parser.add_argument("--genomes", "-g", dest = "genomes", type = str, required = False, default = "", help = 
"(optional) Comma-delimited genome names") 40 | parser.add_argument("--opt", "-o", dest = "opt", type = str, required = False, default = "-evalue 0.001 -max_target_seqs 2 -perc_identity 98",\ 41 | help = "Options and argument passed to BLAST") 42 | parser.add_argument("--outfmt", "-f", dest = "outfmt", type = str, required = False,\ 43 | default = "6 qseqid sseqid qstart qend sstart send qlen slen length bitscore pident qcovs gaps evalue",\ 44 | help = "The configuration of the 'outfmt' option for BLAST") 45 | return parser.parse_args() 46 | 47 | def main(): 48 | args = parse_arguments() 49 | 50 | n_fasta = len(args.input) 51 | 52 | # parse strain information 53 | if args.strains != "": 54 | strains = args.strains.split(",") 55 | n_str = len(strains) 56 | else: 57 | strains = None 58 | n_str = 0 59 | 60 | # parse genome information 61 | if args.genomes != "": 62 | genomes = args.genomes.split(",") 63 | n_gen = len(genomes) 64 | else: 65 | genomes = None 66 | n_gen = 0 67 | 68 | # check whether strains, genomes and files match 69 | if n_str != n_fasta: 70 | sys.exit("Error: strain number is not equal to the number of FASTA files.") 71 | 72 | if n_gen != n_fasta: 73 | sys.exit("Error: genome number is not equal to the number of FASTA files.") 74 | 75 | # get column names of the output file 76 | colnames = args.outfmt.split(" ")[1 : ] # remove the first element -- the format id 77 | 78 | # print the header line to the stdout 79 | if n_gen > 0: 80 | colnames = ["genome"] + colnames 81 | if n_str > 0: 82 | colnames = ["strain"] + colnames 83 | print "\t".join(colnames) 84 | 85 | # search every set of query sequences against the reference database 86 | i = 0 # the counter of FASTA files 87 | for fasta in args.input: 88 | cmd = ["blastn", "-task", "megablast", "-db", args.db, "-query", fasta] + \ 89 | args.opt.split(" ") + ["-outfmt", args.outfmt] # Each pair of the option and its argument must be separated as elements of a list. 
90 | proc = subprocess.Popen(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE) 91 | out = proc.communicate() # obtain the output of BLAST from the standard output 92 | hits = out[0].splitlines() # stderr: out[1] 93 | 94 | # print all lines in the current output 95 | for line in hits: 96 | if n_gen > 0: 97 | line = genomes[i] + "\t" + line # add the genome name to each line 98 | if n_str > 0: 99 | line = strains[i] + "\t" + line # add the strain name to each line 100 | print line 101 | i += 1 102 | 103 | if __name__ == "__main__": 104 | main() 105 | -------------------------------------------------------------------------------- /linkPEreadsets.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Creating symbolic links according to a table of five columns: sample name, input read directory, R1 read file, R2 read file, action (Copy or Link), 4 | separated by tab characters. 5 | Warning: existing symbolic links and files will be replaced by new links or files if --update/-u is flagged. 6 | Dependencies: Python 3, bash environment 7 | 8 | Copyright (C) 2021 Yu Wan 9 | Licensed under the GNU General Public Licence version 3 (GPLv3) . 
10 | Creation: 6 Oct 2021; the latest update: 15 Oct 2021 11 | """ 12 | 13 | import os 14 | import sys 15 | import subprocess 16 | from argparse import ArgumentParser 17 | 18 | 19 | def parse_arguments(): 20 | parser = ArgumentParser(description = "Create symbolic links for paired-end read sets") 21 | parser.add_argument("--tsv", "-t", dest = "tsv", type = str, required = True, help = "A tab delimited, headerless file of five columns: sample name, input read directory, R1 read file, R2 read file, action (Copy or Link)") 22 | parser.add_argument("--outdir", "-o", dest = "outdir", type = str, required = "True", help = "Path to the output directory (without the forward slash at the end) in which symbolic links will be created or files will be copied into") 23 | parser.add_argument("--update", "-u", dest = "update", action = "store_true", help = "Update existing links or override existing files using the new read files. Default: skip existing links or files.") 24 | parser.add_argument("--R", "-R", dest = "R", action = "store_true", help = "Create symbolic links with suffices _R[1,2].fastq.gz rather than the default _[1,2].fastq.gz") 25 | return parser.parse_args() 26 | 27 | 28 | def main(): 29 | params = parse_arguments() 30 | 31 | with open(params.tsv, "r") as f: 32 | readsets = f.read().splitlines() 33 | 34 | outdir = params.outdir 35 | update = params.update 36 | 37 | if params.R: 38 | suffix_r1 = "_R1.fastq.gz" 39 | suffix_r2 = "_R2.fastq.gz" 40 | else: 41 | suffix_r1 = "_1.fastq.gz" 42 | suffix_r2 = "_2.fastq.gz" 43 | 44 | if not os.path.exists(outdir): 45 | os.mkdir(outdir) 46 | 47 | line_num = 0 48 | readset_num = 0 49 | for r in readsets: 50 | line_num += 1 51 | try: 52 | i, in_dir, r1, r2, action = r.split("\t") 53 | proceed = True 54 | except ValueError: 55 | print(f"Warning: Line {line_num} '{r}' does not contain the five values required. 
Skip this line.", file = sys.stderr) 56 | proceed = False 57 | if proceed: 58 | r1 = os.path.join(in_dir, r1) 59 | r2 = os.path.join(in_dir, r2) 60 | t1 = os.path.join(outdir, i + suffix_r1) 61 | t2 = os.path.join(outdir, i + suffix_r2) 62 | if action == "Link": 63 | readset_num += create_link(t1, r1, update) + create_link(t2, r2, update) 64 | else: 65 | readset_num += copy_file(t1, r1, update) + copy_file(t2, r2, update) 66 | print(f"Symbolic links or files were created for {readset_num} read files.") 67 | return 68 | 69 | 70 | def create_link(t, r, u): 71 | if os.path.exists(r): 72 | if os.path.exists(t): 73 | if u: # Replace existing symbolic links and files with new symbolic links 74 | if os.path.islink(t): 75 | print(f"Updating link {t}") 76 | subprocess.run(["unlink", t]) 77 | else: 78 | print(f"Warning: {t} is an existing file. It is deleted and a symbolic link is created with {r}.", file = sys.stderr) 79 | subprocess.run(["rm", t]) 80 | subprocess.run(["ln", "-s", r, t]) 81 | c = 1 82 | else: 83 | print(f"Warning: skip existing link/file {t}.", file = sys.stderr) 84 | c = 0 85 | else: 86 | subprocess.run(["ln", "-s", r, t]) 87 | c = 1 88 | else: 89 | print(f"Error: read file {r} does not exist. So link {t} was not created.", file = sys.stderr) 90 | c = 0 91 | return c 92 | 93 | 94 | def copy_file(t, r, u): 95 | if os.path.exists(r): 96 | if os.path.exists(t): 97 | if u: # Replace existing links and files with copied files 98 | if os.path.islink(t): 99 | print(f"Warning: target {t} is a symbolic link. 
Remove this link and copy file {r}.", file = sys.stderr) 100 | subprocess.run(["unlink", t]) 101 | else: 102 | print(f"Warning: overrode existing file {t} with file {r}.", file = sys.stderr) 103 | subprocess.run(["rm", t]) 104 | subprocess.run(["cp", r, t]) 105 | c = 1 106 | else: 107 | print(f"Warning: skip existing link/file {t}.", file = sys.stderr) 108 | c = 0 109 | else: 110 | subprocess.run(["cp", r, t]) 111 | c = 1 112 | else: 113 | print(f"Error: read file {r} does not exist, so it is not copied.", file = sys.stderr) 114 | c = 0 115 | return c 116 | 117 | 118 | if __name__ == "__main__": 119 | main() -------------------------------------------------------------------------------- /gbk2tsv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | """ 4 | Convert GenBank files to tab-delimited text files (*.tsv). Every GenBank file must contain the locus_tag qualifier and may 5 | contain multiple contigs (LOCUS). 6 | 7 | Usage: 8 | python gbk2tsv.py --gbk 1.gbk --outdir . --features "CDS,rRNA,tRNA" --nucl_seq --prot_seq 9 | python gbk2tsv.py --gbk 1.gbk 2.gbk 3.gbk --outdir . --features "CDS,rRNA,tRNA" --nucl_seq --prot_seq 10 | python gbk2tsv.py --gbk $(ls *.gbk) --outdir . --features "CDS,rRNA,tRNA" --nucl_seq --prot_seq 11 | 12 | An example showing columns in every output file: 13 | Contig Locus Feature Start End Strand Pseudo Product Gene Nucl_seq Prot_seq 14 | Contig_1 locus_tag_1 CDS 1 2200 + N dehydrogenase I unknown ... ... 15 | Contig_1 locus_tag_2 CDS 2230 3100 - Y homoserine kinase unknown ... ... 16 | ... 
17 | 18 | Dependency: BioPython 19 | Python versions 2 and 3 compatible 20 | Copyright 2019 Yu Wan (wanyuac@sina.cn) 21 | Licensed under the Apache License, Version 2.0 22 | First version: 13 Sep 2019 (Happy Mid-Autumn Festival) 23 | Latest update: 3 Oct 2025 24 | """ 25 | 26 | 27 | from __future__ import print_function 28 | from __future__ import division 29 | import os 30 | import sys 31 | import glob 32 | from Bio import SeqIO, SeqFeature 33 | from argparse import ArgumentParser 34 | 35 | 36 | def parse_args(): 37 | parser = ArgumentParser(description = "Convert GenBank files to tab-delimited text files") 38 | parser.add_argument("-g", "--gbk", nargs = "+", type = str, required = True, dest = "gbks", default = "", help = "Input GenBank files") 39 | parser.add_argument("-o", "--outdir", type = str, required = False, dest = "outdir", default = ".", help = "Output directory (no backslash or forward slash)") 40 | parser.add_argument("-f", "--features", type = str, required = False, dest = "features", default = "CDS,tRNA,rRNA", help = "Comma-separated features to store (default CDS,tRNA,rRNA)") 41 | parser.add_argument("-n", "--nucl_seq", action = "store_true", required = False, dest = "nucl_seq", help = "Turn on this option to print nucleotide sequences of features") 42 | parser.add_argument("-p", "--prot_seq", action = "store_true", required = False, dest = "prot_seq", help = "Turn on this option to print protein sequences of CDS") 43 | 44 | return parser.parse_args() # An instance of the class ArgumentParser 45 | 46 | 47 | def main(): 48 | args = parse_args() 49 | gbk_list = get_input_filenames(args.gbks) 50 | if args.outdir and not os.path.exists(args.outdir): # The first logical condition becomes "false" if args.outdir = "". 
51 | os.makedirs(args.outdir) 52 | 53 | if (len(gbk_list) == 0): 54 | sys.exit("Invalid --gbk argument: no GenBank file is found.") 55 | 56 | header = ["Contig", "Locus", "Feature", "Start", "End", "Strand", "Pseudo", "Product", "Gene"] 57 | if args.nucl_seq: 58 | header += ["Nucl_seq"] 59 | if args.prot_seq: 60 | header += ["Prot_seq"] 61 | 62 | features = args.features.split(",") # Features of interest 63 | if len(features) == 0: 64 | sys.exit("Invalid --features argument: there is no feature to be extracted.") 65 | 66 | for gbk in gbk_list: 67 | tsv_name = os.path.join(args.outdir, os.path.splitext(os.path.basename(gbk))[0] + ".tsv") # Define the current output filename: pwd/1.gbk -> pwd/1.tsv 68 | tsv = open(tsv_name, "w") 69 | tsv.write("\t".join(header) + "\n") # Write the header line 70 | records = list(SeqIO.parse(gbk, "genbank")) # Read a GenBank file from the standard input and convert it into a list of SeqRecord objects 71 | for r in records: # Each record (r) is a contig with a unique LOCUS name in the GenBank file. 72 | contig = r.name # LOCUS name 73 | for f in r.features: # Iterate through every feature of the current contig. 
74 | feature_type = f.type 75 | if feature_type in features: 76 | # Fetch the locus_tag 77 | if "locus_tag" in f.qualifiers: 78 | locus_tag = f.qualifiers["locus_tag"][0] 79 | else: 80 | locus_tag = "unnamed" 81 | 82 | # Determine which DNA strand the current feature is located in 83 | if f.location.strand == 1: 84 | strand = "+" 85 | else: 86 | strand = "-" 87 | 88 | # Determine whether the current gene is pesudo 89 | if "pseudo" in f.qualifiers or "pseudogene" in f.qualifiers: 90 | is_pesudo = "Y" # Yes 91 | else: 92 | is_pesudo = "N" # No 93 | 94 | # Determine the product name 95 | if "product" in f.qualifiers: 96 | product = f.qualifiers["product"][0] 97 | else: 98 | product = "unknown" 99 | 100 | if "gene" in f.qualifiers: 101 | gene = f.qualifiers["gene"][0] 102 | else: 103 | gene = "unknown" 104 | 105 | # Construct the line to be written into the output file 106 | line = [contig, locus_tag, f.type, str(f.location.start + 1), str(f.location.end), strand, is_pesudo, product, gene] 107 | if args.nucl_seq: 108 | line += [str(f.extract(r.seq))] 109 | if feature_type == "CDS" and args.prot_seq: 110 | if "translation" in f.qualifiers.keys(): 111 | line += [f.qualifiers["translation"][0]] 112 | else: # Pseudo genes may not have any translation. 
113 | line += ["unknown"] 114 | 115 | tsv.write("\t".join(line) + "\n") 116 | tsv.close() 117 | 118 | return 119 | 120 | 121 | def get_input_filenames(gbks): 122 | gbk_list = list(gbks) 123 | if len(gbk_list) == 1 and gbk_list[0].startswith("*"): # *.gbk 124 | gbk_list = glob.glob(os.path.join(".", gbk_list[0])) # Get names of all GenBank files under the current working directory 125 | 126 | return(gbk_list) 127 | 128 | 129 | if __name__ == "__main__": 130 | main() 131 | -------------------------------------------------------------------------------- /gbk2tbl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | """ 4 | This script converts a GenBank file (.gbk or .gb) from Stdin into a Sequin feature table (.tbl), which is an input file of tbl2asn used for creating an ASN.1 file (.sqn). 5 | 6 | Package requirement: BioPython and argparse 7 | 8 | Usage: 9 | Simple command: 10 | python gbk2tbl.py --mincontigsize 200 --prefix any_prefix --modifiers modifier_file.txt < annotation.gbk 11 | cat annotation.gbk | python gbk2tbl.py --mincontigsize 200 --prefix any_prefix --modifiers modifier_file.txt # integrate gbk2tbl into a pipeline 12 | Redirecting error messages to a text file (optional): 13 | python gbk2tbl.py --mincontigsize 200 --prefix any_prefix --modifiers modifier_file.txt < annotation.gbk 2> stderr.txt 14 | cat annotation.gbk | python gbk2tbl.py --mincontigsize 200 --prefix any_prefix --modifiers modifier_file.txt 2> stderr.txt 15 | Note that this script reads the GenBank file through the stdin ("< annotation.gbk") and you may want to redirect the stderr to a file via "> stderr.txt" (redirection). 16 | 17 | Inputs: 18 | A GenBank file, which ought to be passed to the script through the standard input (stdin). 19 | A modifier file: a plain text file containing modifiers for every FASTA definition line. 
20 | All FASTA header modifiers must be written in a single line and are separated by a space character. This line will 21 | be copied and directly printed along with the record name as the definition line of every contig sequence. 22 | No space should be placed besides the '=' sign. Check http://www.ncbi.nlm.nih.gov/Sequin/modifiers.html for choosing a proper format for modifiers. 23 | For example, the content of a modifier file can be (no tab character): 24 | [organism=Serratia marcescens subsp. marcescens] [sub-species=marcescens] [strain=AH0650_Sm1] [topology=linear] [moltype=DNA] [tech=wgs] [gcode=11] [country=Australia] [isolation-source=sputum] 25 | Furthermore, regarding the modifier 'topology': 26 | [topology=?]: the molecular topology (circular/linear) of the sequence if this information is not contained in records 27 | For contigs: linear (the default value) 28 | For finished genomes of plasmids and bacterial chromosomes: circular 29 | 30 | Outputs: 31 | any_prefix.tbl: the Sequin feature table 32 | any_prefix.fsa: the corresponding fasta file 33 | These files are inputs for tbl2asn which generates ASN.1 files (*.sqn). 34 | 35 | Arguments: 36 | --mincontigsize: the minimum contig size, default = 200 in accordance with NCBI's regulation 37 | --prefix: the prefix of output filenames, default = 'seq' 38 | --modifiers: the filename of the modifier file, default = 'modifiers.txt' 39 | 40 | Development notes: 41 | This script is derived from the one developed by SEQanswers users nickloman (https://gist.github.com/nickloman/2660685/genbank_to_tbl.py) and ErinL who modified nickloman's script and put it 42 | on the forum post (http://seqanswers.com/forums/showthread.php?t=19975). 43 | 44 | Author of this version: Yu Wan (wanyuac@gmail.com, github.com/wanyuac) 45 | Creation: 20 June 2015 - 11 July 2015; the latest edition: 21 October 2019 46 | 47 | Dependency: Python versions 2 and 3 compatible. 
48 | 49 | Licence: GNU GPL 2.1 50 | """ 51 | 52 | from __future__ import print_function 53 | import sys 54 | from Bio import SeqIO 55 | from argparse import ArgumentParser 56 | 57 | def parse_args(): 58 | # Extract arguments from the command line 59 | parser = ArgumentParser(description= 'Read arguments: species, strain, BioProject, prefix') 60 | parser.add_argument('--mincontigsize', type = int, required = False, default = 200, help = 'The minimum contig length') 61 | parser.add_argument('--prefix', type = str, required = False, default = 'seq', help = 'The prefix of output filenames') 62 | parser.add_argument('--modifiers', type = str, required = True, default = 'modifiers.txt', help = 'The text file containing a single line of FASTA head modifiers') 63 | return parser.parse_args() 64 | 65 | def read_modifiers(file): 66 | # This function only reads the first line of the modifier file. So please ensure that all modifiers are put in the first line. 67 | with open(file, 'rU') as f: 68 | s = f.readline() # only read once 69 | return s 70 | 71 | allowed_qualifiers = ['locus_tag', 'gene', 'product', 'pseudo', 'protein_id', 'gene_desc', 'old_locus_tag', 'note', 'inference', \ 72 | 'organism', 'mol_type', 'strain', 'sub_species', 'isolation-source', 'country', \ 73 | 'collection_date'] # In GenBank files, the qualifier 'collection-date' is written as 'collection_date'. 74 | ''' 75 | These are selected qualifiers because we do not want to see qualifiers such as 'translation', 'transl_table', or 'codon_start' in the feature table. 76 | Qualifiers 'organism', 'mol_type', 'strain', 'sub_species', 'isolation-source', 'country' belong to the feature 'source'. 
77 | ''' 78 | 79 | def main(): 80 | args = parse_args() # read arguments 81 | contig_num = 0 82 | fasta_fh = open(args.prefix + '.fsa', 'w') # the file handle for the fasta file 83 | feature_fh = open(args.prefix + '.tbl', 'w') # the file handle for the feature table 84 | modifiers = read_modifiers(args.modifiers) # read the modifiers from a text file 85 | records = list(SeqIO.parse(sys.stdin, 'genbank')) # read a GenBank file from the standard input and convert it into a list of SeqRecord objects 86 | 87 | for rec in records: # for every SeqRecord object in the list 'records' 88 | if len(rec) <= args.mincontigsize: # filter out small contigs 89 | print('skipping small contig %s' % (rec.id), file=sys.stderr) 90 | continue # start a new 'for' loop 91 | contig_num += 1 92 | print(rec.name) # print the contig name to STDOUT 93 | 94 | # write the fasta file 95 | rec.description = modifiers 96 | SeqIO.write([rec], fasta_fh, 'fasta') # Prints this contig's sequence to the fasta file. The sequence header will be rec.description. 97 | 98 | # write the feature table 99 | print('>Feature %s' % (rec.name), file = feature_fh) # write the first line of this record in the feature table: the LOCUS name 100 | for f in rec.features: 101 | # print the coordinates 102 | if f.strand == 1: 103 | print('%d\t%d\t%s' % (f.location.nofuzzy_start + 1, f.location.nofuzzy_end, f.type), file = feature_fh) 104 | else: 105 | print('%d\t%d\t%s' % (f.location.nofuzzy_end, f.location.nofuzzy_start + 1, f.type), file = feature_fh) 106 | 107 | if (f.type == 'CDS') and ('product' not in f.qualifiers): 108 | f.qualifiers['product'] = 'hypothetical protein' 109 | # print qualifiers (keys and values) 110 | for (key, values) in f.qualifiers.items(): 111 | ''' 112 | Apply the iteritems() method of the dictionary f.qualifiers for (key, values) pairs 113 | iteritems() is a generator that yields 2-tuples for a dictionary. It saves time and memory but is slower than the items() method. 
114 | ''' 115 | if key not in allowed_qualifiers: 116 | continue # start a new 'for' loop of f, skipping the following 'for' statement of v 117 | for v in values: # else, write all values under this key (qualifier's name) 118 | print('\t\t\t%s\t%s' % (key, v), file = feature_fh) 119 | fasta_fh.close() # finish the generation of the FASTA file 120 | feature_fh.close() # finish the generation of the feature table 121 | print(str(contig_num) + ' records have been converted.') 122 | 123 | # call the main function 124 | if __name__ == '__main__': 125 | main() 126 | -------------------------------------------------------------------------------- /parse_biosample.py: -------------------------------------------------------------------------------- 1 | """ 2 | Download records from the NCBI BioSample database and extract values of certain attributes. The main 3 | output file is a tab-delimited file, which can be readily imported to Excel. The column order of this 4 | file is determined by that of attribute names in the parameter '-a'. 5 | 6 | Usage: 7 | samples='SAMN0001,SAMN0002,SAMN0003,SAMN0004,SAMN0005' 8 | attr='strain,collection_date,geo_loc_name,host,host_disease,isolation_source' 9 | python parse_biosample.py -i $samples -a $attr -e 'xx@xx.xx' 10 | python parse_biosample.py -i 'file:accessions.txt' -a $attr -e 'xx@xx.xx' # One accession number a line in accession.txt 11 | 12 | Copyright 2020 Yu Wan 13 | Publication: 15 March 2020 14 | Licenced under GNU GPL 2.1. 
15 | """ 16 | 17 | from __future__ import print_function 18 | from Bio import Entrez 19 | from argparse import ArgumentParser 20 | import os 21 | import sys 22 | import time 23 | import xml.etree.ElementTree as xmlTree 24 | 25 | 26 | def parse_arguments(): 27 | parser = ArgumentParser(description = 'Download BioSample records and extract specific attributes') 28 | parser.add_argument('-i', type = str, required = True, help = 'Comma-delimited accessions or a file of a list of acceessions') 29 | parser.add_argument('-a', type = str, required = True, help = 'Comma-delimited names of attributes whose values will be extracted') 30 | parser.add_argument('-e', type = str, required = True, help = 'User\'s email address required for accessing the NCBI database') 31 | parser.add_argument('-o', type = str, required = False, default = 'metadata.tsv', help = 'Filename for extracted attribute values') 32 | parser.add_argument('-d', type = str, required = False, default = './record', help = 'Directory name for downloaded BioSample records') 33 | parser.add_argument('-r', action = 'store_true', required = False, help = 'Flag it to override existing XML files') 34 | parser.add_argument('-n', action = 'store_true', required = False, help = 'Flag it to not replace missing values with NAs') 35 | 36 | return parser.parse_args() 37 | 38 | 39 | def main(): 40 | args = parse_arguments() 41 | check_outdir(args.d) 42 | attributes = args.a.split(',') 43 | 44 | # Download BioSample records 45 | print('Start to download BioSample records.') 46 | records = download_records(accessions = get_accession_numbers(args.i), email = args.e, \ 47 | out_dir = args.d, override = args.r) # Create an XML file [acc].xml in the record directory 48 | 49 | # Parse the records 50 | print('\nStart to parse BioSample records.') 51 | f = open(args.o, 'w') # Create the output file 52 | f.write('\t'.join(['BioSample'] + attributes) + '\n') # Print the header 53 | for a, p in records.items(): # Accession number and XML 
path 54 | if p == None: 55 | continue 56 | else: 57 | extract_attributes(accession = a, xml_path = p, out_file = f, attrs = attributes, \ 58 | fill_null = not args.n) # By default, fill missing values with NAs. 59 | f.close() 60 | 61 | return 62 | 63 | 64 | def extract_attributes(accession, xml_path, out_file, attrs, fill_null): 65 | print("Parsing record %s:" % accession) 66 | xml = xmlTree.parse(xml_path).getroot() # Read and parse the XML file 67 | parental_domain = xml[0] # Tag: BioSample 68 | 69 | for d in parental_domain: 70 | if d.tag == 'Attributes': 71 | attr_dict = parse_attribute_domain(d) # Write attributes into the output file 72 | parsed_attrs = attr_dict.keys() # Note that not all attrs can always be found in parsed_attrs. 73 | new_line = [accession] 74 | for a in attrs: 75 | if a in parsed_attrs: 76 | v = attr_dict[a] 77 | if v == '' and fill_null: 78 | new_line.append('NA') 79 | else: 80 | new_line.append(v) 81 | else: 82 | print(' Warning: attribute %s is not found in record %s.' % (a, accession)) 83 | if fill_null: 84 | new_line.append('NA') 85 | else: 86 | new_line.append('') # A NULL space holder. 87 | out_file.write('\t'.join(new_line) + '\n') 88 | print(' Record %s has been successfully parsed.' % accession) 89 | 90 | return 91 | 92 | 93 | def parse_attribute_domain(d): 94 | """ 95 | Converts the 'Attributes' domain into a dictionary. 96 | """ 97 | attrs = {} 98 | for a in d: 99 | # Compared to display_name and attribute_name, harmonized_name is expected to be conserved across records. 100 | attrs[a.attrib['harmonized_name']] = a.text # For example: {isolation_source: urine} 101 | 102 | return attrs 103 | 104 | 105 | def download_records(accessions, email, out_dir, override): 106 | """ 107 | Download and save BioSample records as XML files, and return a dictionary {accession: file path}. 108 | Existing XML files are skipped by default, which saves lots of time. 
(Particularly when users re-run this script) 109 | """ 110 | paths = {} 111 | Entrez.email = email 112 | 113 | for a in accessions: 114 | xml_path = os.path.join(out_dir, a + '.xml') 115 | if os.path.exists(xml_path): 116 | if override: 117 | print('Download and override existing record %s.' % a) 118 | try: 119 | handle = Entrez.efetch(db = 'biosample', id = a, rettype = 'xml', retmode = 'text') # In XML format 120 | paths[a] = xml_path 121 | f = open(xml_path, 'w') 122 | f.write(handle.read()) # Save the current record as an XML file 123 | f.close() 124 | except: 125 | print(' Warning: record %s is not found. Skip this record.' % a) 126 | paths[a] = None 127 | time.sleep(1) 128 | else: 129 | print('Skip existing record %s.' % a) 130 | paths[a] = xml_path 131 | else: 132 | print('Download record %s.' % a) 133 | try: 134 | handle = Entrez.efetch(db = 'biosample', id = a, rettype = 'xml', retmode = 'text') # In XML format 135 | paths[a] = xml_path 136 | f = open(xml_path, 'w') 137 | f.write(handle.read()) # Save the current record as an XML file 138 | f.close() 139 | except: 140 | print(' Warning: record %s is not found. Skip this record.' % a) 141 | paths[a] = None 142 | time.sleep(1) 143 | 144 | return paths 145 | 146 | 147 | def get_accession_numbers(s): 148 | """ 149 | Returns a list of BioSample accession numbers 150 | """ 151 | PREFIX = 'file:' 152 | if s.startswith(PREFIX): 153 | s = s[len(PREFIX) : ] # Drop the prefix 154 | try: 155 | with open(s, 'r') as f: 156 | accs = f.read().splitlines() 157 | except: 158 | sys.exit('Error: accession file %s is not accessible.' % s) 159 | else: 160 | accs = s.split(',') 161 | print(' Altogether %i accession numbers have been imported.' % len(accs)) 162 | 163 | return accs 164 | 165 | 166 | def check_outdir(d): 167 | if os.path.exists(d): 168 | print('Output directory %s exists.' 
% d) 169 | else: 170 | os.system('mkdir ' + d) 171 | 172 | return 173 | 174 | 175 | if __name__ == '__main__': 176 | main() 177 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # BINF_toolkit 2 | This directory consists of scripts developed by Yu Wan for routine bioinformatic analysis. 3 | 4 | ## A list of scripts 5 | * [add_sample_name_FASTA.py](#add_sample\_name\_FASTA) 6 | * [downloadSeqFromNCBI.py](#download\_NCBI\_records) 7 | * [extractNuclRegionFromFASTA.py](#extract\_nucl\_region) 8 | * [gbk2tbl.py](#gbk2tbl) 9 | * [gbk2tsv.py](#gbk2tsv) 10 | * [gc.py](#gc) 11 | * [extractSeqFromGBK.py](#get\_gene\_seq) 12 | * [parse_ENA_sampleInfo_XML.py](#parse\_ENA\_sampleInfo\_XML) 13 | * [run_CutAdapt.py](#run_CutAdapt) 14 | * [filename_generator.py](#filename_generator) 15 | 16 | ## Manual 17 | ### add\_sample\_name\_FASTA.py 18 | This script appends a sample name at the beginning of each sequence in a FASTA file. For example, the header "\>g1 description" becomes "\>sample1__g1 description" after running this script. 19 | 20 | Command example: ```python add_sample_name_FASTA.py -i filename.txt (or filename.fna) -o output_dir -n``` 21 |
22 | 23 | ### downloadSeqFromNCBI.py 24 | This script takes as input a list of NCBI accession numbers (one for each line) from the STDIN and downloads corresponding entries (either GenBank files or FASTA files) under the target directory. 25 | 26 | **Examples** 27 | 28 | ```shell 29 | python downloadSeqFromNCBI.py --records "file:objects.txt" --format fasta --email xxx@xxx.com --suffix fna --outdir ./ref --skip > download.log 30 | 31 | python downloadSeqFromNCBI.py --records "NC_0001,NC_0002" --format genbank --email xxx@xxx.com --suffix gbk --outdir ./ref --skip > download.log 32 | ``` 33 | 34 | Type ```python downloadSeqFromNCBI.py -h``` or ```--help``` for help information. 35 | 36 | **Notes about options and option arguments** 37 | * --records: can be either a file (must contain a suffix of ".txt") listing targets to be downloaded, or a string of accession IDs separated by commas (no space is allowed). 38 | * --format or -f: the format of files to be downloaded 39 | * --suffix or -s: the file extension, can be "fasta" (default), "fna", "gb", or "gbk". No dot preceding the extension is needed. 40 | * --outdir or -o: output directory, no backslash at the end. 41 | 42 | An example of the input list: seq_list.txt. Note that accession IDs may not include version numbers such as ".1" (HG326223.1, CP011642). 43 | 44 | **References** 45 | 1. This script is inspired by Mark Schultz's (dr.mark.schultz@gmail.com, GitHub: schultzm) script "downloadGenbankByAccessions.py". 46 | 47 | 2. [A post on the BioStars forum](https://www.biostars.org/p/63506/). 48 | 49 | 3. Damien Farrell's blog post: [Retrieving genome assemblies via Entrez with Python](https://dmnfarrell.github.io/bioinformatics/assemblies-genbank-python). 50 | 51 |
52 | 53 | ### extractNuclRegionFromFASTA.py 54 | This script extracts a region of nucleotides by positions from a fasta file. 55 | 56 | **Arguments** 57 | -i: the path of the input file 58 | -n: the name of your selected contig 59 | -f: feature name specified by the user 60 | -s: the first nucleotide to be selected 61 | -e: the last nucleotide to be selected 62 | -o: the filename of the output 63 | 64 | **Requirements** 65 | * Only one genomic region should be selected; 66 | * the start and end positions should not spill out. 67 |
68 | 69 | ### gbk2tbl.py 70 | 71 | This script converts a GenBank file (.gbk or .gb) from Stdin into a Sequin feature table (.tbl), which is an input file of tbl2asn used for creating an ASN.1 file (.sqn). 72 | 73 | Package requirement: BioPython and argparse 74 | 75 | **Usage** 76 | ```shell 77 | python gbk2tbl.py --mincontigsize 200 --prefix any_prefix --modifiers modifier_file.txt < annotation.gbk 2> stderr.txt 78 | ``` 79 | 80 | Note that this script reads the GenBank file through the stdin ("\< annotation.gbk") and you may want to redirect the stderr to a file via "\> stderr.txt" (redirection). 81 | 82 | **Inputs** 83 | A GenBank file, which ought to be passed to the script through the standard input (stdin). 84 | 85 | A modifier file: a plain text file containing modifiers for every FASTA definition line. 86 | * All modifiers must be written in a single line and are separated by a single space character. 87 | * No space should be placed besides the '=' sign. Check [NCBI help](http://www.ncbi.nlm.nih.gov/Sequin/modifiers.html) for choosing a proper format for modifiers. 88 | * For example: a line "[organism=Serratia marcescens subsp. marcescens] [sub-species=marcescens] [strain=AH0650_Sm1] [topology=linear] [moltype=DNA] [tech=wgs] [gcode=11] [country=Australia] [isolation-source=sputum]" will be copied and printed along with the record name as the definition line of every contig sequence. 89 | 90 | **Outputs** 91 | * any_prefix.tbl: the Sequin feature table 92 | * any_prefix.fsa: the corresponding fasta file 93 | These files are inputs for tbl2asn which generates ASN.1 files (*.sqn). 94 | 95 | **Arguments** 96 | * --mincontigsize: the minimum contig size, default = 200 in accordance with NCBI's regulation 97 | * --prefix: the prefix of output filenames, default = 'seq' 98 | * --modifiers: the filename of the modifier file, default = 'modifiers.txt' 99 | 100 | **Demonstration** 101 | A test data set for this script is provided in the directory _example_. 
This data set is composed of a compressed GenBank file *NJST258\_1\_\_CP006923.gbk.gz* and a modifier file *gbk2tbl\_modifiers.txt*. Users can run the following command line to produce a TBL file as well as a FASTA file: 102 | 103 | ```shell 104 | zcat ./example/NJST258_1__CP006923.gbk.gz | python gbk2tbl.py --mincontigsize 200 --prefix Kp --modifiers gbk2tbl_modifiers.txt 105 | ``` 106 |
107 | 108 | ### gbk2tsv.py 109 | This script converts one or multiple GenBank files into tab-delimited feature tables (plain text), which can be imported to Excel or R afterwards. 110 | 111 | Relevant blog [post](https://microbialsystems.cn/en/post/gbk2tsv/). 112 |
113 | 114 | ### gc.py 115 | This program calculates the length, GC content, and entropy for each record in a multi-fasta file. 116 | 117 | Input: a fasta file which contains multiple sequences from the standard input 118 | 119 | Output: for each sequence, the script prints: the header line, total sequence length, (G+C)% and entropy of the input sequence. 120 | 121 | Command line: 122 | ```bash 123 | python gc.py < filename.fasta 124 | ``` 125 | 126 | Treatment of the extended alphabet in this script: 127 | 1. consider all of 15 characters 128 | 2. construct a weighted-count table using dictionary 129 | 3. for each character in the table, take the probability of being A, G, C or T as effective counts 130 | 4. counts for A, G, C and T is computed by adding up the vectors for every character read from the sequence. 131 |
132 | 133 | ### extractSeqFromGBK.py 134 | This script extracts gene sequences from a GenBank file, in accordance with a list of (locus_tag, feature type) tuples. 135 | 136 | Required module: Bio, argparse, csv 137 | 138 | **Usage** 139 | ```bash 140 | python extractSeqFromGBK.py --tags locus_tag.tsv --gb demo.gbk > genes.fna 141 | ``` 142 | 143 | **Inputs** 144 | 1. A GenBank file. 145 | 2. A text file listing selected locus_tags in the following format: locus_tag"\t"feature_type. This file MUST use ASCII codes because [the module csv/2.3 does not support Unicode inputs](https://docs.python.org/2/library/csv.html). 146 | 3. Allowed feature types are: CDS, tRNA, rRNA and tmRNA. For example: 147 | SMDB11_RS00910 rRNA
148 | SMDB11_RS21915 rRNA
149 | SMDB11_RS00015 CDS
150 | 151 | **Output** 152 | Nucleotide sequences in FASTA format with the header in the format: \>feature type|contig name|locus_tag|position|length|product 153 | 154 | **Warnings** 155 | 1. Although it is unlikely in a GenBank file, but please always ensure that there is no duplication of locus_tags in the table because this script treats locus_tag"s as keys for retrieving feature types. 156 | 2. An "IndexError: list index out of range" will arise if the tag list uses Unicode codes. 157 |
158 | 159 | ### parse_ENA_sampleInfo_XML.py 160 | This script parses an ENA metadata file in XML format and prints a subset of information. 161 | 162 | **Usage** 163 | 164 | ```bash 165 | python parse_ENA_sampleInfo_XML.py ERP000909.xml > samples.txt 166 | ``` 167 | 168 | Input: an XML file exported for a list of ERS accession numbers from ENA using the REST URLs API. For example, one can download an XML file for sample ERS086023 using the link [http://www.ebi.ac.uk/ena/data/view/ERS086023&display=xml](http://www.ebi.ac.uk/ena/data/view/ERS086023&display=xml). 169 | 170 | **Outputs** 171 | 172 | * tab-delimited text file containing information retrieved from the XML file. 173 | * study_accession, sample_accession, secondary_sample_accession, experiment_accession, run_accession, Isolate_ID, Host, Place_of_isolation, Year_of_isolation 174 |
175 | 176 | ### run_CutAdapt.py 177 | 178 | This script runs [CutAdapt](https://github.com/marcelm/cutadapt) for a list of paired-end readsets. 179 | 180 | Dependency: [slurm](http://slurm.schedmd.com) on a computational cluster (Linux OS) 181 |
182 | 183 | ### filename_generator.py 184 | This script generates a list of file names based on a list of strings. It is useful if you want to generate a list of file names for read sets from a list of bacterial strain names. 185 | 186 | Usage 187 | ```shell 188 | python filename_generator.py -i input_file -o output_file -p prefix -s suffix -f from -l to -pe 189 | ``` 190 | 191 | Input: a plain-text file consists of a list of filenames 192 | 193 | Example input files: (inlist.txt) 194 |     sample1\_\_genes\_\_results.txt 195 |     sample2\_\_genes\_\_results.txt 196 | 197 | Command 198 | ```shell 199 | python filename_generator.py -i inlist.txt -o outlist.txt -p /reads/ -s .fastq.gz -f 0 -l 7 -pe 200 | ``` 201 | 202 | Output: a list of new file names generated on the basis of strings in inlist.txt 203 | 204 | Example output items: (outlist.txt) 205 |     /reads/sample1\_1.fastq.gz 206 |     /reads/sample1\_2.fastq.gz 207 |     /reads/sample2\_1.fastq.gz 208 |     /reads/sample2\_2.fastq.gz 209 | -------------------------------------------------------------------------------- /extractSeqFromGBK.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | This script extracts nucleotide or protein sequences from a GenBank file in accordance with a list of 5 | (locus_tag/gene, feature type) tuples. 6 | 7 | Required modules: Python 3, BioPython, argparse, csv. 8 | 9 | Usage: 10 | python extractSeqFromGBK.py --targets [target file] --gbk [GenBank file(s)] > [output file name] 11 | python extractSeqFromGBK.py --targets [target file] --gbk [GenBank file(s)] --usegene > [output file name] 12 | python extractSeqFromGBK.py --targets [target file] --gbk [GenBank file(s)] --usegene --aa --extname > [output file name] 13 | 14 | Inputs 15 | 1. --gbk: A list of GenBank files. Each filename is a genome name, which will be parsed and put into output sequence headers. 16 | 2. 
--targets: 17 | (1) A tab-delimited text file (.tsv) listing selected locus_tags/gene names in the format: [tag]'\t'[feature_type]. 18 | (2) Single-target mode: an ID, preceded by a '^' sign and followed by a feature type with a colon as the delimiter. For example, '^repB:CDS'. 19 | 20 | Allowed feature types are: CDS, tRNA, rRNA, tmRNA. 21 | For example 22 | SMDB11_RS00910 rRNA 23 | SMDB11_RS21915 rRNA 24 | SMDB11_RS00015 CDS 25 | Or: 26 | gene1 CDS 27 | gene2 CDS 28 | gene3 CDS 29 | 30 | Output (to stdout) 31 | Nucleotide sequences in FASTA format with the header in the format: 32 | >[sequence ID] [gene name]|[Genome name]|[NCBI nucleotide accession or contig name]|[Coding strand (+/-)]|[Coordinates]|[Coordinate strand (+/-)]|[Locus tag]|[NCBI protein accession/NA]|[Product name] 33 | 34 | Example commands 35 | python extractSeqFromGBK.py --targets loci.tsv --gbk genome1.gb > genome1_genes.fna 36 | python extractSeqFromGBK.py --targets ^geneA:CDS --gbk *.gbk --usegene > genes.fna 37 | python extractSeqFromGBK.py --targets genes.tsv --gbk *.gbk --usegene --aa --extname > proteins.faa 38 | 39 | Notes 40 | 1. Locus tags are recommended when users want to extract sequences of exact features, since some features in a record 41 | may share the same gene name. (Check GenBank files before using this script). Nonetheless, users may want to use gene 42 | names rather than locus tags to include sequences of the same gene name. 43 | 2. Multiple sequences of any target that is shared by different loci will be extracted. For example, two sequences are 44 | printed for a gene when both sequences share the same gene name. 45 | 3. Note that some features, such as 'source', do not contain a qualifier "locus_tag". A KeyError will arise if one calls 46 | qualifiers["locus_tag"] for those features. 
Moreover, the 'gene' feature, although it shares the same locus_tag with 47 | its CDS, it does not contain a nucleotide sequence and hence should not be used as a legit type of target features. 48 | 4. Coordinate strand (+/-) always equals '+' when the GenBank file is created by Prokka or NCBI's Prokaryotic Genome 49 | Annotation Pipeline (PGAP). This script presumes that this is the case. 50 | 51 | Explanation of warning(s) 52 | An "IndexError: list index out of range" will arise if the tag list uses Unicode codes. 53 | 54 | Copyright (C) Yu Wan 2020 55 | Publication: 19 June 2015; latest update: 26 May 2020 56 | Licence: GNU General Public License v3.0 57 | Previous filename: get_gene_seq.py 58 | 59 | References 60 | Mark Schultz, https://github.com/schultzm/parseGenbank_extractGenes.py 61 | martineau, http://stackoverflow.com/questions/14734604/python-dictionary-of-lists-from-tab-delimited-file 62 | """ 63 | 64 | from Bio import SeqIO 65 | import os 66 | import sys 67 | import csv # User"s Python script name should not be the module name, otherwise, the former will be loaded and cause an error of no attribute loaded. 
68 | from argparse import ArgumentParser 69 | 70 | 71 | def parse_args(): 72 | parser = ArgumentParser(description= "Extract nucleotide/protein sequences from GenBank files") 73 | parser.add_argument("--targets", "-t", dest = "targets", type = str, required = True, help = "A tab-delimited file listing (locus_tag/gene name, feature type) tuples, or ^tag:type") 74 | parser.add_argument("--gbk", "-g", dest = "gbk", nargs = "+", type = str, required = True, help = "One or multiple GenBank files") 75 | parser.add_argument("--usegene", "-u", dest = "usegene", action = "store_true", required = False, help = "A flag enabling the use of gene names rather than locus tags for feature match") 76 | parser.add_argument("--aa", "-a", dest = "aa", action = "store_true", required = False, help = "Set to print amino acid sequences instead of nucleotide sequences") 77 | parser.add_argument("--extname", "-x", dest = "extname", action = "store_true", required = False, help = "Set to attach genome names to sequence names, making an extended sequence name") 78 | 79 | return parser.parse_args() 80 | 81 | 82 | def main(): 83 | args = parse_args() 84 | 85 | # Read targets 86 | targets_def = args.targets 87 | if targets_def.startswith("^"): 88 | targets_def = targets_def[1 : ] 89 | tag, feature_type = targets_def.split(":") 90 | tags = {tag : feature_type} 91 | else: 92 | tags = read_table(args.targets) # read the table and store it as a dictionary 93 | 94 | # Validity check of tags 95 | if len(tags) == 0: # if the dictionary 'tags' is empty, then terminate the loop 96 | print("Warning: no target is read.") 97 | sys.exit(0) 98 | 99 | search_key = "gene" if args.usegene else "locus_tag" 100 | 101 | for gbk in args.gbk: 102 | print("Processing %s" % gbk, file = sys.stderr) 103 | process_gbk(gbk = gbk, tags = tags, search_key = search_key, usegene = args.usegene, get_protein = args.aa,\ 104 | att_name = args.extname, tag_num = len(tags)) 105 | 106 | return 107 | 108 | 109 | def process_gbk(gbk, 
tags, search_key, usegene, get_protein, att_name, tag_num): 110 | """ 111 | This function processes a single GenBank file. 112 | """ 113 | target_feature_types = set(tags.values()) # Creates a set of feature types (CDS, tRNA, tmRNA, rRNA, etc.) from the dictionary of targets 114 | if "gene" in target_feature_types: 115 | print("Error: 'gene' is not a legit feature type.") 116 | sys.exit(0) 117 | targets = list(tags.keys()) # Names of targets. For instance, a list of gene names or locus tags. 118 | g = os.path.splitext(os.path.basename(gbk))[0] # Remove path and filename extension from the path of the input GenBank file. 119 | loci_found = 0 # Number of target loci encountered in this GenBank file. This variable is useful when usegene = False. 120 | continue_search = True 121 | 122 | for contig in SeqIO.parse(gbk, "genbank"): 123 | """ 124 | Do not use list(SeqIO.parse(gbk, "genbank")) in order to save memory. 125 | Object 'contig' belongs to class SeqRecord and corresponds to a LOCUS feature in the GenBank file. 126 | A GenBank file may be comprised of multiple contigs. The following loop goes through every feature of the contig. 127 | """ 128 | if continue_search: # Go through features of the current contig. 129 | for f in contig.features: 130 | if f.type in target_feature_types: # Skipping unwanted feature types saves time. 131 | f_qualifier_keys = list(f.qualifiers.keys()) 132 | if search_key in f_qualifier_keys: # type(f.qualifiers): collections.OrderedDict 133 | tag_name = f.qualifiers[search_key][0] # Equals gene name when search_key is 'gene' or locus tag when search_key is 'locus_tag'. 134 | if tag_name in targets and f.type == tags[tag_name]: # If the true feature type matches the anticipated type, then it is a true discovery. 135 | strand = "+" if f.strand == 1 else "-" 136 | start = int(f.location.start) + 1 # An alias for f.location.nofuzzy_start. Conventional start position is 1 bp greater than the Python-style coordinate. 
137 | end = int(f.location.end) # An alias for f.location.nofuzzy_end 138 | 139 | # Get the sequence 140 | if get_protein and f.type == "CDS": 141 | if "translation" in f_qualifier_keys: 142 | seq = f.qualifiers["translation"][0] # Type: str 143 | else: # It happens when the CDS is a pseudo gene. 144 | print("Warning: CDS of feature %s in %s does not have a translated sequence." % (tag_name, gbk),\ 145 | file = sys.stderr) 146 | continue # Skip the current feature and move to the next one. 147 | else: 148 | seq = str(f.extract(contig.seq)) 149 | 150 | # Determine the output sequence ID 151 | if att_name: 152 | seq_id = "%s.%s" % (tag_name, g) 153 | else: 154 | seq_id = tag_name 155 | 156 | # Determine the gene name where available 157 | if "gene" in f_qualifier_keys: 158 | gene_name = f.qualifiers["gene"][0] 159 | else: 160 | gene_name = "NA" 161 | 162 | # Determine protein accession number when available 163 | if f.type == "CDS" and "protein_id" in f_qualifier_keys: 164 | protein_accession = f.qualifiers["protein_id"][0] 165 | else: 166 | protein_accession = "NA" 167 | 168 | # Get the locus tag 169 | if usegene and "locus_tag" in f_qualifier_keys: 170 | locus_tag = f.qualifiers['locus_tag'][0] 171 | else: 172 | locus_tag = tag_name 173 | 174 | # Get the product name 175 | if "product" in f_qualifier_keys: 176 | product = f.qualifiers["product"][0] 177 | else: 178 | product = "NA" 179 | 180 | # Print the target sequence 181 | print(">%s %s|%s|%s|%s|%i-%i|+|%s|%s|%s" % (seq_id, gene_name, g, contig.id, strand, start, end, locus_tag,\ 182 | protein_accession, product)) # print the header 183 | print(seq) # extract nucleotide sequence of this feature 184 | 185 | """ 186 | In order to save time, the for loop is terminated when locus tags (which are unique in every GenBank file) 187 | are used as search keys and all target locus tags have been found. 188 | """ 189 | if not usegene: 190 | loci_found += 1 191 | if loci_found == tag_num: # Do not need to do further search. 
192 | continue_search = False # To break the outer for loop 193 | break # Terminate the current for loop 194 | else: 195 | break # This termination happens when usegene = False and all target locus tags have been found. 196 | 197 | return 198 | 199 | 200 | def read_table(f): 201 | """ 202 | This function reads a tab-delimited file and saves it as a dictionary through using the first column as keys. 203 | """ 204 | d = {} # create an empty dictionary 205 | with open(f, "r") as csv_file: # a wrapper for reading a file instead of using open() and f.close() 206 | csv_reader = csv.reader(csv_file, delimiter = "\t") 207 | try: 208 | for row in csv_reader: 209 | d[row[0]] = row[1] # use the first column as keys and the second column as values 210 | except: 211 | print("Error: cannot read tags values. Your tag file should use ASCII or utf-8 encoding system.") 212 | sys.exit(1) 213 | return d 214 | 215 | 216 | if __name__ == "__main__": 217 | main() 218 | -------------------------------------------------------------------------------- /other_licence/Apache Licence-2.0.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. 
For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /shell/download_reads_from_sra.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (C) 2020-2025 Yu Wan 3 | # Licensed under the GNU General Public Licence version 3 (GPLv3) . 4 | # Publication: 11 March 2020; last modification: 5 May 2025 5 | # Important update on 16/4/2025: added fastq-dump arguments "--skip-technical --clip --dumpbase --read-filter pass" 6 | # according to https://edwards.flinders.edu.au/fastq-dump/. (Thanks to Sophie Mannix for pointing this out) 7 | 8 | # Help information ######################### 9 | show_help() { 10 | echo " 11 | Download files of sequencing reads from the NCBI SRA database. 12 | Arguments: 13 | -d: (optional) Directory that contains the program fastq-dump (does not need to end by a slash character '/'). 14 | -o: Output directory (no forward slash). Default: ${HOME}/SRA_reads. 
15 | -a: A comma-delimited string of target accession numbers (SRR*) 16 | -f: A single-column text file of SRR numbers or a two-column CSV file of genome names (1st column) 17 | and SRR numbers (2nd column). 18 | -r: A logical flag turning on replacement of SRR numbers with genome names for read files. 19 | -s: A logical flag notifying this script that the reads to be downloaded are single-end. 20 | -u: Skip the dos2unix step if your input file is known to follow the Unix-style line ending. 21 | -p: Prefix of the log file (Markdown format) in the output directory (default: download_reads_from_sra_[date (YYYY-MM-DD)]_[HH-MM-SS]) 22 | Example command: 23 | ./download_reads_from_sra.sh -d=\"\$HOME/bin/sra_toolkit/bin\" -o=\"\$PWD\" -f=readsets.csv -r -p=download_reads_from_sra 24 | Note that: 25 | 1. The -a argument is ignored when the -f argument is set. 26 | 2. Newline characters in the input file must be '\n' rather than '\r\n'. 27 | 3. Fastq-dump sometimes fails in downloading or parsing read files. Remember to check the log and error files after each run. 28 | 4. Please ensure genome names are unique throughout your dataset. Otherwise, files of the same names may be overridden at the renaming step. 29 | 5. Dependency: SRA Toolkit v3.0.6 and later versions, dos2unix. 30 | 6. This script was called download_sra_reads.sh. 31 | " 32 | } 33 | 34 | if [[ $# -eq 0 ]]; then # When $1 does not exist (Error "$1: unbound variable" arises when use `if [[ $# -eq 0 ]] || [ "$1" = "-h" ]`). Do not use `-z "$1"`. 35 | show_help 36 | exit 0 37 | elif [ "$1" = "-h" ]; then 38 | show_help 39 | exit 0 40 | fi 41 | 42 | # Remove leading and trailing whitespace (spaces, tabs, etc.) 
while handling carriage returns (\r) and newlines (\n) ######################### 43 | function trim_whitespace() { 44 | echo -e "$1" | sed 's/^[[:space:]]*//;s/[[:space:]]*$//' 45 | } 46 | 47 | function time_stamp() { 48 | echo "[$(date +"%Y-%m-%d %H:%M:%S")]" 49 | } 50 | 51 | function write_log() { 52 | echo "$1" >> "$log" # The global variable log will be defined later. 53 | } 54 | 55 | # Main function ######################### 56 | out_dir="${HOME}/SRA_reads" # The default output directory 57 | replace_names=false # By default, do not replace accession numbers with genome names. 58 | paired_end=true # Assumes all read files are paired-end. 59 | read_file=false # Assumes that by default accessions are not provided in a file. 60 | unix_format=false # Assumes the input file has non-Unix-style line endings ('\n\r' etc). 61 | prefix=download_reads_from_sra 62 | suffix=$(date +"%Y-%m-%d_%H-%M-%S") # Name suffix of log and error files 63 | wait_time=2 # In seconds. Time to pause before the start of a new download iteration 64 | 65 | # Read arguments 66 | for i in "$@"; do 67 | case $i in 68 | -a=*) 69 | accessions="${i#*=}" 70 | accessions=( $(echo $accessions | tr ',' '\n') ) 71 | ;; 72 | -f=*) 73 | acc_list="${i#*=}" # A file listing accession numbers and probably genome names as well 74 | read_file=true 75 | ;; 76 | -r) 77 | replace_names=true 78 | ;; 79 | -s) 80 | paired_end=false # Single-end reads 81 | ;; 82 | -d=*) 83 | program_dir="${i#*=}" 84 | ;; 85 | -o=*) 86 | out_dir="${i#*=}" 87 | ;; 88 | -u) 89 | unix_format=true 90 | ;; 91 | -p=*) 92 | prefix="${i#*=}" 93 | ;; 94 | -w=*) 95 | wait_time="${i#*=}" 96 | ;; 97 | *) # Do nothing otherwise. 98 | ;; 99 | esac 100 | done 101 | 102 | log="${out_dir}/${prefix}_${suffix}.md" # A plain-text Markdown file 103 | 104 | # Check the output directory 105 | if [ ! 
-d "$out_dir" ]; then 106 | mkdir -p "$out_dir" 107 | write_log "# Global configurations" 108 | write_log "$(time_stamp) Created the output directory: ${out_dir}." 109 | fi 110 | 111 | # Load module 112 | if [ ! -z "$program_dir" ]; then 113 | export PATH="${program_dir}:$PATH" 114 | fi 115 | 116 | # Download and parse read files 117 | write_log "Waiting time between consecutive download iterations: $wait_time seconds." 118 | accession_count=0 # Assign a default value to accession_count in case neither -f nor -a is set. 119 | successes=0 120 | failures=0 121 | 122 | if [ "$read_file" = true ]; then 123 | if [ ! -f "$acc_list" ]; then 124 | write_log "$(time_stamp) Error: $acc_list was not found." 125 | exit 1 126 | fi 127 | if [ "$unix_format" = false ]; then 128 | dos2unix "$acc_list" >> "$log" 2>&1 129 | fi 130 | 131 | # Read lines of the input file into an array, skipping empty lines 132 | # https://stackoverflow.com/questions/15685736/how-to-extract-a-particular-element-from-an-array-in-bash 133 | mapfile -t lines_array < "$acc_list" 134 | accession_count="${#lines_array[@]}" # Number of accessions 135 | write_log "$(time_stamp) Imported $accession_count entries from file ${acc_list}." 136 | if [ "$replace_names" = true ]; then 137 | write_log "Run accessions in FASTQ filenames will be replaced by genome names." 138 | echo >> "$log" # Create an empty line in the log file 139 | write_log "# Task records" 140 | for line in "${lines_array[@]}"; do 141 | # write_log "$(time_stamp) Parse line '${line}'" # This command is used for debugging 142 | IFS=',' read -r -a line_fields <<< "${line}" 143 | genome="${line_fields[0]}" # Genome or isolate name 144 | accession="${line_fields[1]}" # Run accession 145 | genome="$(trim_whitespace "$genome")" # Sometimes people include whitespaces or tab characters in some genome names or accessions by accident. 
146 | accession="$(trim_whitespace "$accession")" 147 | if [[ -z "$genome" || -z "$accession" ]]; then 148 | write_log "$(time_stamp) Warning: Skipping malformed line: '$line'" 149 | continue 150 | fi 151 | fastq_original_prefix="${out_dir}/${accession}" 152 | fastq_renamed_prefix="${out_dir}/${genome}" 153 | # write_log "$(time_stamp) genome name ${genome} and its Run accession ${accession}" # For debugging 154 | if [ "$paired_end" = true ]; then # Paired-end mode 155 | write_log "## Download paired-end reads under Run accession $accession of genome ${genome}" 156 | write_log "$(time_stamp) Run fastq-dump to download the reads." 157 | if fastq-dump --readids --skip-technical --clip --dumpbase --read-filter pass --outdir "$out_dir" --split-3 "$accession" >> "$log" 2>&1; then # Download and split the read file, and create the output directory if necessary 158 | write_log "$(time_stamp) fastq-dump command finished for $accession of genome ${genome}." 159 | else 160 | write_log "$(time_stamp) fastq-dump failed for $accession." 
161 | ((failures++)) 162 | continue 163 | fi 164 | f1="${fastq_original_prefix}_pass_1.fastq" 165 | f2="${fastq_original_prefix}_pass_2.fastq" 166 | f1_renamed="${fastq_renamed_prefix}_1.fastq" 167 | f2_renamed="${fastq_renamed_prefix}_2.fastq" 168 | if [ -f "$f1" ] && [ -f "$f2" ]; then 169 | #write_log "$(time_stamp) Compress FASTQ files $f1 and $f2" 170 | #gzip "$f1" # Removed the gzip step because occasionally it fails to execute on my university's HPC 171 | #gzip "$f2" 172 | #if [ -f "${f1}.gz" ] && [ -f "${f2}.gz"]; then 173 | #mv "${f1}.gz" "$f1_renamed" 174 | #mv "${f2}.gz" "$f2_renamed" 175 | #write_log "$(time_stamp) Successfully compressed and renamed FASTQ files: ${f1}.gz -> ${f1_renamed}; ${f2}.gz -> ${f2_renamed}" 176 | mv "$f1" "$f1_renamed" 177 | mv "$f2" "$f2_renamed" 178 | if [ -f "$f1_renamed" ] && [ -f "$f2_renamed" ]; then 179 | write_log "$(time_stamp) Successfully renamed FASTQ files: $f1 -> ${f1_renamed}; $f2 -> ${f2_renamed}." 180 | ((successes++)) 181 | else 182 | write_log "$(time_stamp) Error: FASTQ files $f1 and/or $f2 could not be renamed." 183 | ((failures++)) 184 | fi 185 | else 186 | write_log "$(time_stamp) Error: $f1 and/or $f2 could not be created by fastq-dump. Skip the step of renaming FASTQ files." 187 | ((failures++)) 188 | fi 189 | else # Single-end mode (for instance, PacBio or Nanopore reads) 190 | write_log "## Download single-end read set $accession of genome ${genome}" 191 | write_log "$(time_stamp) Run fastq-dump to download the reads." 192 | if fastq-dump --readids --skip-technical --clip --dumpbase --read-filter pass --outdir "$out_dir" --split-3 "$accession" >> "$log" 2>&1; then 193 | write_log "$(time_stamp) fastq-dump command finished for $accession of genome ${genome}." 194 | else 195 | write_log "$(time_stamp) fastq-dump failed for $accession." 
196 | ((failures++)) 197 | continue 198 | fi 199 | f1="${fastq_original_prefix}_pass.fastq" 200 | f1_renamed="${fastq_renamed_prefix}.fastq" 201 | if [ -f "$f1" ]; then 202 | #gzip "$f1" 203 | mv "$f1" "$f1_renamed" 204 | if [ -f "$f1_renamed" ]; then 205 | write_log "$(time_stamp) Successfully renamed FASTQ file: $f1 -> ${f1_renamed}." 206 | ((successes++)) 207 | else 208 | write_log "$(time_stamp) Error: FASTQ file $f1 could not be renamed." 209 | ((failures++)) 210 | fi 211 | else 212 | write_log "$(time_stamp) Error: $f1 could not be created by fastq-dump." 213 | ((failures++)) 214 | fi 215 | fi 216 | echo >> "$log" # Add an empty line before the start of a new download iteration 217 | sleep "$wait_time" # Pause, to avoid too many connection requests to NCBI's server. 218 | done 219 | else # A single-column input file of Run accessions 220 | write_log "Use Run accessions as filenames of downloaded read sets." 221 | echo >> "$log" 222 | write_log "# Task records" 223 | for line in "${lines_array[@]}"; do 224 | accession="$(trim_whitespace "$line")" 225 | if [ -z "$accession" ]; then 226 | write_log "$(time_stamp) Warning: Skipping malformed line: '$line'" 227 | continue 228 | fi 229 | fastq_original_prefix="${out_dir}/${accession}" 230 | write_log "## Download reads under Run accession $accession" 231 | write_log "$(time_stamp) Run fastq-dump to download reads under ${accession}." 232 | if fastq-dump --readids --skip-technical --clip --dumpbase --read-filter pass --outdir "$out_dir" --split-3 "$accession" >> "$log" 2>&1; then 233 | write_log "$(time_stamp) fastq-dump command finished for ${accession}." 234 | else 235 | write_log "$(time_stamp) fastq-dump failed for $accession." 
236 | ((failures++)) 237 | continue 238 | fi 239 | if [ "$paired_end" = true ]; then # Paired-end reads 240 | f1="${fastq_original_prefix}_pass_1.fastq" 241 | f2="${fastq_original_prefix}_pass_2.fastq" 242 | if [ -f "$f1" ] && [ -f "$f2" ]; then 243 | #gzip "$f1" 244 | #gzip "$f2" 245 | write_log "$(time_stamp) Successfully created FASTQ files $f1 and ${f2}." 246 | ((successes++)) 247 | else 248 | write_log "$(time_stamp) Error: $f1 and/or $f2 could not be created by fastq-dump." 249 | ((failures++)) 250 | fi 251 | else # Single-end reads 252 | f1="${fastq_original_prefix}_pass.fastq" 253 | if [ -f "$f1" ]; then 254 | #gzip "$f1" 255 | write_log "$(time_stamp) Successfully created FASTQ file ${f1}." 256 | ((successes++)) 257 | else 258 | write_log "$(time_stamp) Error: $f1 could not be created by fastq-dump." 259 | ((failures++)) 260 | fi 261 | fi 262 | echo >> "$log" 263 | sleep "$wait_time" 264 | done 265 | fi 266 | else # When accession numbers come from the -a parameter 267 | accession_count="${#accessions[@]}" 268 | write_log "$(time_stamp) Imported $accession_count entries from the -a argument." 269 | echo >> "$log" 270 | write_log "# Task records" 271 | for entry in "${accessions[@]}"; do # Filename replacement is not supported under this mode. 272 | accession="$(trim_whitespace "$entry")" 273 | if [ -z "$accession" ]; then 274 | write_log "$(time_stamp) Warning: Skipping malformed entry: '$entry'" 275 | continue 276 | fi 277 | fastq_original_prefix="${out_dir}/${accession}" 278 | write_log "## Download reads under Run accession $accession" 279 | if fastq-dump --readids --skip-technical --clip --dumpbase --read-filter pass --outdir "$out_dir" --split-3 "$accession" >> "$log" 2>&1; then 280 | write_log "$(time_stamp) fastq-dump command finished for ${accession}." 281 | else 282 | write_log "$(time_stamp) fastq-dump failed for $accession." 
283 | ((failures++)) 284 | continue 285 | fi 286 | if [ "${paired_end}" = true ]; then 287 | f1="${fastq_original_prefix}_1.fastq" 288 | f2="${fastq_original_prefix}_2.fastq" 289 | if [ -f "$f1" ] && [ -f "$f2" ]; then 290 | #gzip "$f1" 291 | #gzip "$f2" 292 | write_log "$(time_stamp) Successfully created FASTQ files $f1 and ${f2}." 293 | ((successes++)) 294 | else 295 | echo "$(time_stamp) Error: $f1 and/or $f2 could not be downloaded." >&2 296 | ((failures++)) 297 | fi 298 | else 299 | f1="${fastq_original_prefix}.fastq" 300 | if [ -f "$f1" ]; then 301 | #gzip "$f1" 302 | write_log "$(time_stamp) Successfully created FASTQ file ${f1}." 303 | ((successes++)) 304 | else 305 | write_log "$(time_stamp) Error: $f1 could not be created by fastq-dump." 306 | ((failures++)) 307 | fi 308 | fi 309 | echo >> "$log" 310 | sleep "$wait_time" 311 | done 312 | fi 313 | 314 | write_log '# Conclusion' 315 | write_log "$(time_stamp) Finished all $accession_count tasks. Downloaded $successes readsets and failed to download $failures readsets." 316 | exit 0 317 | -------------------------------------------------------------------------------- /downloadSeqFromNCBI.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | This script takes a list of NCBI accession numbers (one for each line) from the STDIN and downloads corresponding entries (either GenBank files or FASTA files) under the target directory. 
5 | 6 | Usage: python downloadSeqFromNCBI.py --records "file:objects.txt" --with_prefix --format fasta --email xxx@xxx.com --ext fna --outdir ./ref --skip > download.log 7 | python downloadSeqFromNCBI.py --records "NC_0001,NC_0002" --format genbank --email xxx@xxx.com --ext gbk --outdir ./ref --skip > download.log 8 | python downloadSeqFromNCBI.py --records "NC_0001,NC_0002" --format genbank --email xxx@xxx.com --prefix K12 --ext gbk --outdir ./ref --skip > download.log 9 | python downloadSeqFromNCBI.py --records "file:objects.tsv" --with_prefix --format fasta --email xxx@xxx.com --ext fna --outdir ./ref --skip > download.log 10 | Type python downloadSeqFromNCBI.py -h or --help for help information. 11 | 12 | Important options and arguments: 13 | --records or -r: Can be either a file (must contain a suffix of ".txt") listing targets to be downloaded, or a string of accession IDs separated by commas (no space is allowed). 14 | --with_prefix: A logical option specifying that the record file is a tab-delimited file of two columns (without a header line) for accession numbers and prefixes. 15 | --no_accession: Set this flag to not attach an NCBI accession number after the genome name in each file name. Only applicable when --prefix != None. This option may cause overwriting output files when multiple NCBI accessions share the same prefix. 16 | --format or -f: The format of files to be downloaded. This option is not used when --db = assembly. 17 | --db: Customised specification of an NCBI database to retrieve records from 18 | --ext or -x: The file extension, can be "fasta" (default), "fna", "fna.gz", "gb", "gb.gz" or "gbk". No dot preceding the extension is needed. 19 | --outdir or -o: Output directory, no backslash at the end. 20 | 21 | An example of the input list: seq_list.txt. Note that accession IDs may not include version numbers, such as ".1". 
22 | HG326223.1\n 23 | CP011642\n 24 | 25 | The input file may be composed of two columns: accession number, prefix (genome name), sepearated by a tab character. For instance, 26 | HG326223.1\tDb11\n 27 | CP011642\tCAV1492\n 28 | 29 | References: 30 | 1. This script is inspired by Mark Schultz's (dr.mark.schultz@gmail.com, GitHub: schultzm) script "downloadGenbankByAccessions.py" stored under the master branch of github.com/katholt/holtlab. 31 | 2. Forum post: www.biostars.org/p/63506/ 32 | 3. Damien Farrell's blog post: Retrieving genome assemblies via Entrez with Python (dmnfarrell.github.io/bioinformatics/assemblies-genbank-python) 33 | 34 | Copyright (C) 2015-2020 Yu Wan 35 | First publication: 27 June 2015 - 14 July 2015; the latest modification: 28 June 2020 36 | Python version 2 and 3 compatible 37 | Licensed under the GNU General Public Licence version 3 (GPLv3) . 38 | Previous names: download_gbk.py, download_NCBI_records.py 39 | """ 40 | 41 | from __future__ import print_function 42 | import os 43 | import sys 44 | import time 45 | import xml.etree.ElementTree as xmlTree 46 | from Bio import Entrez 47 | from ftplib import FTP 48 | from collections import namedtuple 49 | from argparse import ArgumentParser 50 | 51 | 52 | def parse_arguments(): 53 | parser = ArgumentParser(description = "Read options and arguments") 54 | parser.add_argument("--records", "-r", dest = "records", type = str, required = True, \ 55 | help = "Items you want to fetch from the NCBI database") 56 | parser.add_argument("--with_prefix", "-w", dest = "with_prefix", action = "store_true", required = False, \ 57 | help = "Set when the accession file contains two columns for accessions and prefixes, respectively") 58 | parser.add_argument("--db", "-d", dest = "db", type = str, default = "nucleotide", required = False, \ 59 | help = "NCBI database to be retrieved from. 
Options: nucleotide (default), assembly") 60 | parser.add_argument("--format", "-f", dest = "format", type = str, default = "fasta", required = False, \ 61 | help = "Format: fasta(default)/genbank") 62 | parser.add_argument("--refseq", "-q", dest = "refseq", action = "store_true", required = False, \ 63 | help = "Set it to specify the RefSeq database for downloading assemblies") 64 | parser.add_argument("--email", "-e", dest = "email", type = str, required = True, \ 65 | help = "User email address") 66 | parser.add_argument("--prefix", "-p", dest="prefix", type = str, default = None, required = False, \ 67 | help = "Common prefix adding to all files") 68 | parser.add_argument("--no_accession", "-n", dest = "no_accession", action = "store_true", required = False, \ 69 | help = "Set this flag to not attach an NCBI accession number after the genome name in each file name. Only applicable when --prefix != None.") 70 | parser.add_argument("--ext", "-x", dest = "ext", type = str, default = "fasta", required = False, \ 71 | help = "File extension: fasta (default), fna, gb, gbk, fna.gz, gbff.gz (For assemblies, usually with .gz)") 72 | parser.add_argument("--outdir", "-o", dest = "outdir", type = str, default = ".", required = False, \ 73 | help = "Destination directory, no backslash at the end") 74 | parser.add_argument("--skip", "-sk", dest = "skip", action = "store_true", required = False, \ 75 | help = "Set to skip downloaded files") 76 | parser.add_argument("--ftp", "-t", dest = "ftp", type = str, default = "ftp.ncbi.nlm.nih.gov", required = False, \ 77 | help = "Address of the NCBI FTP site from which assemblies are downloaded. 
Default: ftp.ncbi.nlm.nih.gov.") 78 | return parser.parse_args() 79 | 80 | 81 | def main(): 82 | args = parse_arguments() 83 | Entrez.email = args.email 84 | 85 | # Read input and set up output file names 86 | check_output_dir(args.outdir) 87 | accessions = extract_accessions(targets = args.records, with_prefix = args.with_prefix) 88 | new_files = create_output_filenames(accessions = accessions, with_prefix = args.with_prefix, \ 89 | no_accession = args.no_accession, outdir = args.outdir, \ 90 | out_prefix = args.prefix, extension = "." + args.ext) 91 | 92 | # Iteratively download files 93 | if args.db == "nucleotide": 94 | """ 95 | Download nucleotide records (FASTA or GenBank files) through Entrez.efetch utility. 96 | """ 97 | if args.format == "fasta": 98 | download_records(new_files = new_files, skip_existing = args.skip, record_type = "fasta", \ 99 | outdir = args.outdir) 100 | else: 101 | """ 102 | Download nucleotide records as GenBank files. Do not use "gb" for rettype, as it only includes 103 | contig locations if the entry is built from contigs. 104 | """ 105 | download_records(new_files = new_files, skip_existing = args.skip, record_type = "gbwithparts", \ 106 | outdir = args.outdir) 107 | elif args.db == "assembly": 108 | """ 109 | Download assemblies (FASTA or GenBank files) from NCBI's FTP server. 110 | """ 111 | if args.format == "fasta": 112 | download_assemblies(new_files = new_files, skip_existing = args.skip, outdir = args.outdir, \ 113 | use_refseq = args.refseq, site = args.ftp, fasta = True) 114 | else: 115 | download_assemblies(new_files = new_files, skip_existing = args.skip, outdir = args.outdir, \ 116 | use_refseq = args.refseq, site = args.ftp, fasta = False) 117 | else: 118 | print("Error: only databases 'nucleotide' and 'assembly' are supported by far. No download task will be launched.") 119 | 120 | return 121 | 122 | 123 | def extract_accessions(targets, with_prefix): 124 | """ 125 | Parsing the argument for '--records'. 
def create_output_filenames(accessions, with_prefix, no_accession, outdir, out_prefix, extension):
    """
    Compose output file paths for every accession to be downloaded.

    Parameters:
        accessions: dict {accession: prefix} when with_prefix is True, otherwise a list of accessions.
        with_prefix: True when each accession carries its own genome-name prefix.
        no_accession: when True, omit the accession number from the output file name.
        outdir: output directory.
        out_prefix: a single prefix shared by all outputs (used only when with_prefix is False).
        extension: file-name extension (including any leading dot the caller wants).

    Returns: dict {accession: output file path}.
    """
    new_files = {}
    if with_prefix:  # "accessions" is a dictionary of {accession: prefix}
        for entry, prefix in accessions.items():
            if no_accession:  # do not attach the accession number after the genome name
                new_files[entry] = os.path.join(outdir, prefix + extension)
            else:
                new_files[entry] = os.path.join(outdir, prefix + "__" + entry + extension)
    else:  # "accessions" is a list
        for entry in accessions:
            if out_prefix is not None:  # Fixed: compare to None with "is not", not "!="
                if no_accession:
                    new_files[entry] = os.path.join(outdir, out_prefix + extension)
                else:
                    new_files[entry] = os.path.join(outdir, out_prefix + "__" + entry + extension)
            else:
                new_files[entry] = os.path.join(outdir, entry + extension)

    return new_files


def download_records(new_files, skip_existing, record_type, outdir):
    """
    Download records from the NCBI Nucleotide database (via Entrez.efetch) and save
    each one as a FASTA or GenBank file.

    Parameters:
        new_files: dict {accession: output file path} from create_output_filenames.
        skip_existing: when True, do not re-download files that already exist.
        record_type: Entrez "rettype" value (e.g. "fasta" or "gb").
        outdir: output directory (used only for the final status message).
    """
    print("Start to download records from the NCBI Nucleotide database.")
    n = 0  # counter of successfully downloaded files
    for entry, new_file in new_files.items():
        if os.path.exists(new_file) and skip_existing:
            print(new_file + " already exists, skipped.")
            continue  # go to the next entry
        try:
            handle = Entrez.efetch(db="nucleotide", id=entry, rettype=record_type, retmode="text")
            try:
                with open(new_file, "w") as output_file:
                    print("Downloading " + entry + " to " + new_file)
                    output_file.write(handle.read())
                n += 1
            finally:
                handle.close()  # Fixed: close the handle even when the write fails
        except Exception:  # Fixed: bare "except" also swallowed SystemExit/KeyboardInterrupt
            print("The record " + entry + " is not found.")
            continue
        time.sleep(1)  # pause to avoid submitting too many concurrent requests to NCBI

    if outdir == ".":
        outdir = "the current working directory"  # Fixed typo: "workding"
    print("Done. Altogether %d files were downloaded and stored in %s successfully." % (n, outdir))

    return


def download_assemblies(new_files, skip_existing, outdir, use_refseq, site, fasta):
    """
    Download assembly sequence files from the NCBI Assembly database over anonymous FTP.

    Parameters:
        new_files: dict {accession: output file path}.
        skip_existing: when True, do not re-download files that already exist.
        outdir: output directory (used only for the final status message).
        use_refseq: take the RefSeq FTP path instead of the GenBank one.
        site: FTP host name, e.g. "ftp.ncbi.nlm.nih.gov".
        fasta: True for FASTA output, False for GenBank output.
    """
    print("Start to download records from the NCBI Assembly database.")
    urls = get_urls(new_files, use_refseq, skip_existing, fasta)  # list of Assembly named tuples

    print("Connecting to site " + site + ".")
    try:
        ftp = FTP(site, timeout=30)  # an explicit timeout keeps a dead server from hanging the script
        ftp.login()  # '230 Anonymous access granted, restrictions apply'
        print("Successfully logged in.")
    except Exception:  # Fixed: bare "except" also swallowed SystemExit/KeyboardInterrupt
        sys.exit("Error: could not log in to the site.")

    n = 0
    prefix_len = len("ftp://" + site)  # e.g. len("ftp://ftp.ncbi.nlm.nih.gov") = 26
    output_format = "FASTA" if fasta else "GenBank"

    for assembly in urls:
        try:
            # retrbinary does not accept a full FTP URL such as
            # ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/.../file.fna.gz; it only takes the
            # server-relative path, so strip the "ftp://<site>" prefix before issuing RETR.
            with open(assembly.local, "wb") as f:  # create a binary file
                ftp.retrbinary("RETR " + assembly.url[prefix_len:], f.write)
            print("Saved %s file %s as %s." % (output_format, assembly.url, assembly.local))
            n += 1
            time.sleep(1)
        except Exception:
            print("Warning: remote file " + assembly.url + " is not accessible. Skip.")
            # Fixed: remove the partial file with os.remove instead of os.system("rm ..."),
            # which is non-portable and unsafe for paths containing spaces or shell metacharacters.
            if os.path.exists(assembly.local):
                os.remove(assembly.local)
    ftp.quit()
    print("Done. Altogether %d files were downloaded and stored in %s successfully." % (n, outdir))

    return


def get_urls(new_files, use_refseq, skip_existing, fasta):
    """
    Subordinate function of download_assemblies: resolve each accession to the FTP URL of
    its assembly file on the NCBI server. Returns a list of Assembly named tuples
    (accession, url, local); sequence files are downloaded after all addresses are gathered.
    """
    Assembly = namedtuple("Assembly", ["accession", "url", "local"])
    urls = []
    filename_suffix = "_genomic.fna.gz" if fasta else "_genomic.gbff.gz"  # FASTA or GenBank file

    for entry, new_file in new_files.items():
        if os.path.exists(new_file) and skip_existing:
            print(new_file + " already exists, skipped.")
            continue  # go to the next entry

        try:
            handle = Entrez.esearch(db="assembly", term=entry, retmax="1")  # one record per accession
            record = Entrez.read(handle)
            uid = record["IdList"][0]  # convert this single-element list into a string
        except Exception:  # Fixed: bare "except" also swallowed SystemExit/KeyboardInterrupt
            print("The record " + entry + " was not found in the database.")
            continue

        summary = get_assembly_summary(uid)
        if summary is not None:  # Fixed: compare to None with "is not", not "!="
            if use_refseq:
                url = summary["FtpPath_RefSeq"]  # field in the summary XML file
            else:
                url = summary["FtpPath_GenBank"]  # field in the summary XML file

            if url == "":
                print("Warning: URL of assembly %s (sequence ID: %s) was not found in the record summary." % (entry, uid))
                continue
            # Fixed: build the remote URL with explicit "/" separators. os.path.join would
            # insert "\" on Windows and corrupt the FTP address.
            base = url.rstrip("/")
            remote = base + "/" + base.rsplit("/", 1)[-1] + filename_suffix
            urls.append(Assembly(accession=entry, url=remote, local=new_file))

        time.sleep(1)

    return urls


def get_assembly_summary(uid):
    """
    Retrieve the document summary of an assembly given its database UID.
    Subordinate function of get_urls; returns either a dictionary or None.

    The call is equivalent to visiting
    https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=assembly&id=<uid>
    in a web browser, which returns a DocSum document for the UID.
    (Reference: https://www.ncbi.nlm.nih.gov/books/NBK25499/)
    """
    summary_handle = Entrez.esummary(db="assembly", id=uid, report="full", retmax=1, retmode="xml")
    try:
        summary = Entrez.read(summary_handle)  # may fail; see fallback below
        summary = summary["DocumentSummarySet"]["DocumentSummary"][0]  # returns a dictionary
    except (ValueError, TypeError):
        # Note (28 June 2020): Entrez.read can raise an internal ValidationError here:
        # "Failed to find tag 'AssemblyStatusSort' in the DTD. To skip all tags that are not
        # represented in the DTD, please call Bio.Entrez.read or Bio.Entrez.parse with
        # validate=False." The fallback parses the raw summary XML from the NCBI server
        # directly to extract the FTP addresses (requires the standard xml module).
        print("Document summary could not be parsed for assembly %s. Trying an alternative approach to extract the FTP address..." % uid)
        try:
            # Regenerate the handle: it appears to be consumed/damaged by Entrez.read above.
            summary_handle = Entrez.esummary(db="assembly", id=uid, report="full", retmax=1, retmode="xml")
            xml = xmlTree.parse(summary_handle).getroot()
            summary = {}

            # Per the original layout notes, xml[0] is the DocumentSummarySet and xml[0][1]
            # holds the DocumentSummary fields — assumed stable; TODO confirm against a live
            # eSummary response if NCBI changes the schema.
            for field in xml[0][1]:
                if field.tag in ["FtpPath_GenBank", "FtpPath_RefSeq"]:
                    summary[field.tag] = field.text

            if len(summary) > 0:
                print("    FTP address(es) has/have been successfully extracted for assembly %s." % uid)
            else:
                print("    FTP address still could not be obtained for assembly %s." % uid)
                summary = None
        except (ValueError, TypeError):  # the worst scenario: no FTP path is found
            print("    FTP address still could not be obtained for assembly %s." % uid)
            summary = None

    return summary


def check_output_dir(outdir):
    """
    Prepare the output directory, creating it when it does not exist yet.
    """
    if outdir != ".":
        if not os.path.exists(outdir):
            # Fixed: os.makedirs replaces os.system("mkdir " + outdir) — portable,
            # safe for paths with spaces, and able to create nested directories.
            os.makedirs(outdir)
        else:
            print("Output directory " + outdir + " exists.")
    else:
        print("Skipped checking the output directory as it is the current working directory.")

    return


if __name__ == "__main__":
    main()
By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. 
If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. 
You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 
113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. 
You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 
165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. 
If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. 
If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 
292 | 293 | 294 | Copyright (C) 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 
331 | 332 | , 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 340 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU General Public License is a free, copyleft license for 11 | software and other kinds of works. 12 | 13 | The licenses for most software and other practical works are designed 14 | to take away your freedom to share and change the works. By contrast, 15 | the GNU General Public License is intended to guarantee your freedom to 16 | share and change all versions of a program--to make sure it remains free 17 | software for all its users. We, the Free Software Foundation, use the 18 | GNU General Public License for most of our software; it applies also to 19 | any other work released this way by its authors. You can apply it to 20 | your programs, too. 21 | 22 | When we speak of free software, we are referring to freedom, not 23 | price. Our General Public Licenses are designed to make sure that you 24 | have the freedom to distribute copies of free software (and charge for 25 | them if you wish), that you receive source code or can get it if you 26 | want it, that you can change the software or use pieces of it in new 27 | free programs, and that you know you can do these things. 
28 | 29 | To protect your rights, we need to prevent others from denying you 30 | these rights or asking you to surrender the rights. Therefore, you have 31 | certain responsibilities if you distribute copies of the software, or if 32 | you modify it: responsibilities to respect the freedom of others. 33 | 34 | For example, if you distribute copies of such a program, whether 35 | gratis or for a fee, you must pass on to the recipients the same 36 | freedoms that you received. You must make sure that they, too, receive 37 | or can get the source code. And you must show them these terms so they 38 | know their rights. 39 | 40 | Developers that use the GNU GPL protect your rights with two steps: 41 | (1) assert copyright on the software, and (2) offer you this License 42 | giving you legal permission to copy, distribute and/or modify it. 43 | 44 | For the developers' and authors' protection, the GPL clearly explains 45 | that there is no warranty for this free software. For both users' and 46 | authors' sake, the GPL requires that modified versions be marked as 47 | changed, so that their problems will not be attributed erroneously to 48 | authors of previous versions. 49 | 50 | Some devices are designed to deny users access to install or run 51 | modified versions of the software inside them, although the manufacturer 52 | can do so. This is fundamentally incompatible with the aim of 53 | protecting users' freedom to change the software. The systematic 54 | pattern of such abuse occurs in the area of products for individuals to 55 | use, which is precisely where it is most unacceptable. Therefore, we 56 | have designed this version of the GPL to prohibit the practice for those 57 | products. If such problems arise substantially in other domains, we 58 | stand ready to extend this provision to those domains in future versions 59 | of the GPL, as needed to protect the freedom of users. 60 | 61 | Finally, every program is threatened constantly by software patents. 
62 | States should not allow patents to restrict development and use of 63 | software on general-purpose computers, but in those that do, we wish to 64 | avoid the special danger that patents applied to a free program could 65 | make it effectively proprietary. To prevent this, the GPL assures that 66 | patents cannot be used to render the program non-free. 67 | 68 | The precise terms and conditions for copying, distribution and 69 | modification follow. 70 | 71 | TERMS AND CONDITIONS 72 | 73 | 0. Definitions. 74 | 75 | "This License" refers to version 3 of the GNU General Public License. 76 | 77 | "Copyright" also means copyright-like laws that apply to other kinds of 78 | works, such as semiconductor masks. 79 | 80 | "The Program" refers to any copyrightable work licensed under this 81 | License. Each licensee is addressed as "you". "Licensees" and 82 | "recipients" may be individuals or organizations. 83 | 84 | To "modify" a work means to copy from or adapt all or part of the work 85 | in a fashion requiring copyright permission, other than the making of an 86 | exact copy. The resulting work is called a "modified version" of the 87 | earlier work or a work "based on" the earlier work. 88 | 89 | A "covered work" means either the unmodified Program or a work based 90 | on the Program. 91 | 92 | To "propagate" a work means to do anything with it that, without 93 | permission, would make you directly or secondarily liable for 94 | infringement under applicable copyright law, except executing it on a 95 | computer or modifying a private copy. Propagation includes copying, 96 | distribution (with or without modification), making available to the 97 | public, and in some countries other activities as well. 98 | 99 | To "convey" a work means any kind of propagation that enables other 100 | parties to make or receive copies. Mere interaction with a user through 101 | a computer network, with no transfer of a copy, is not conveying. 
102 | 103 | An interactive user interface displays "Appropriate Legal Notices" 104 | to the extent that it includes a convenient and prominently visible 105 | feature that (1) displays an appropriate copyright notice, and (2) 106 | tells the user that there is no warranty for the work (except to the 107 | extent that warranties are provided), that licensees may convey the 108 | work under this License, and how to view a copy of this License. If 109 | the interface presents a list of user commands or options, such as a 110 | menu, a prominent item in the list meets this criterion. 111 | 112 | 1. Source Code. 113 | 114 | The "source code" for a work means the preferred form of the work 115 | for making modifications to it. "Object code" means any non-source 116 | form of a work. 117 | 118 | A "Standard Interface" means an interface that either is an official 119 | standard defined by a recognized standards body, or, in the case of 120 | interfaces specified for a particular programming language, one that 121 | is widely used among developers working in that language. 122 | 123 | The "System Libraries" of an executable work include anything, other 124 | than the work as a whole, that (a) is included in the normal form of 125 | packaging a Major Component, but which is not part of that Major 126 | Component, and (b) serves only to enable use of the work with that 127 | Major Component, or to implement a Standard Interface for which an 128 | implementation is available to the public in source code form. A 129 | "Major Component", in this context, means a major essential component 130 | (kernel, window system, and so on) of the specific operating system 131 | (if any) on which the executable work runs, or a compiler used to 132 | produce the work, or an object code interpreter used to run it. 
133 | 134 | The "Corresponding Source" for a work in object code form means all 135 | the source code needed to generate, install, and (for an executable 136 | work) run the object code and to modify the work, including scripts to 137 | control those activities. However, it does not include the work's 138 | System Libraries, or general-purpose tools or generally available free 139 | programs which are used unmodified in performing those activities but 140 | which are not part of the work. For example, Corresponding Source 141 | includes interface definition files associated with source files for 142 | the work, and the source code for shared libraries and dynamically 143 | linked subprograms that the work is specifically designed to require, 144 | such as by intimate data communication or control flow between those 145 | subprograms and other parts of the work. 146 | 147 | The Corresponding Source need not include anything that users 148 | can regenerate automatically from other parts of the Corresponding 149 | Source. 150 | 151 | The Corresponding Source for a work in source code form is that 152 | same work. 153 | 154 | 2. Basic Permissions. 155 | 156 | All rights granted under this License are granted for the term of 157 | copyright on the Program, and are irrevocable provided the stated 158 | conditions are met. This License explicitly affirms your unlimited 159 | permission to run the unmodified Program. The output from running a 160 | covered work is covered by this License only if the output, given its 161 | content, constitutes a covered work. This License acknowledges your 162 | rights of fair use or other equivalent, as provided by copyright law. 163 | 164 | You may make, run and propagate covered works that you do not 165 | convey, without conditions so long as your license otherwise remains 166 | in force. 
You may convey covered works to others for the sole purpose 167 | of having them make modifications exclusively for you, or provide you 168 | with facilities for running those works, provided that you comply with 169 | the terms of this License in conveying all material for which you do 170 | not control copyright. Those thus making or running the covered works 171 | for you must do so exclusively on your behalf, under your direction 172 | and control, on terms that prohibit them from making any copies of 173 | your copyrighted material outside their relationship with you. 174 | 175 | Conveying under any other circumstances is permitted solely under 176 | the conditions stated below. Sublicensing is not allowed; section 10 177 | makes it unnecessary. 178 | 179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 180 | 181 | No covered work shall be deemed part of an effective technological 182 | measure under any applicable law fulfilling obligations under article 183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 184 | similar laws prohibiting or restricting circumvention of such 185 | measures. 186 | 187 | When you convey a covered work, you waive any legal power to forbid 188 | circumvention of technological measures to the extent such circumvention 189 | is effected by exercising rights under this License with respect to 190 | the covered work, and you disclaim any intention to limit operation or 191 | modification of the work as a means of enforcing, against the work's 192 | users, your or third parties' legal rights to forbid circumvention of 193 | technological measures. 194 | 195 | 4. Conveying Verbatim Copies. 
196 | 197 | You may convey verbatim copies of the Program's source code as you 198 | receive it, in any medium, provided that you conspicuously and 199 | appropriately publish on each copy an appropriate copyright notice; 200 | keep intact all notices stating that this License and any 201 | non-permissive terms added in accord with section 7 apply to the code; 202 | keep intact all notices of the absence of any warranty; and give all 203 | recipients a copy of this License along with the Program. 204 | 205 | You may charge any price or no price for each copy that you convey, 206 | and you may offer support or warranty protection for a fee. 207 | 208 | 5. Conveying Modified Source Versions. 209 | 210 | You may convey a work based on the Program, or the modifications to 211 | produce it from the Program, in the form of source code under the 212 | terms of section 4, provided that you also meet all of these conditions: 213 | 214 | a) The work must carry prominent notices stating that you modified 215 | it, and giving a relevant date. 216 | 217 | b) The work must carry prominent notices stating that it is 218 | released under this License and any conditions added under section 219 | 7. This requirement modifies the requirement in section 4 to 220 | "keep intact all notices". 221 | 222 | c) You must license the entire work, as a whole, under this 223 | License to anyone who comes into possession of a copy. This 224 | License will therefore apply, along with any applicable section 7 225 | additional terms, to the whole of the work, and all its parts, 226 | regardless of how they are packaged. This License gives no 227 | permission to license the work in any other way, but it does not 228 | invalidate such permission if you have separately received it. 
229 | 230 | d) If the work has interactive user interfaces, each must display 231 | Appropriate Legal Notices; however, if the Program has interactive 232 | interfaces that do not display Appropriate Legal Notices, your 233 | work need not make them do so. 234 | 235 | A compilation of a covered work with other separate and independent 236 | works, which are not by their nature extensions of the covered work, 237 | and which are not combined with it such as to form a larger program, 238 | in or on a volume of a storage or distribution medium, is called an 239 | "aggregate" if the compilation and its resulting copyright are not 240 | used to limit the access or legal rights of the compilation's users 241 | beyond what the individual works permit. Inclusion of a covered work 242 | in an aggregate does not cause this License to apply to the other 243 | parts of the aggregate. 244 | 245 | 6. Conveying Non-Source Forms. 246 | 247 | You may convey a covered work in object code form under the terms 248 | of sections 4 and 5, provided that you also convey the 249 | machine-readable Corresponding Source under the terms of this License, 250 | in one of these ways: 251 | 252 | a) Convey the object code in, or embodied in, a physical product 253 | (including a physical distribution medium), accompanied by the 254 | Corresponding Source fixed on a durable physical medium 255 | customarily used for software interchange. 
256 | 257 | b) Convey the object code in, or embodied in, a physical product 258 | (including a physical distribution medium), accompanied by a 259 | written offer, valid for at least three years and valid for as 260 | long as you offer spare parts or customer support for that product 261 | model, to give anyone who possesses the object code either (1) a 262 | copy of the Corresponding Source for all the software in the 263 | product that is covered by this License, on a durable physical 264 | medium customarily used for software interchange, for a price no 265 | more than your reasonable cost of physically performing this 266 | conveying of source, or (2) access to copy the 267 | Corresponding Source from a network server at no charge. 268 | 269 | c) Convey individual copies of the object code with a copy of the 270 | written offer to provide the Corresponding Source. This 271 | alternative is allowed only occasionally and noncommercially, and 272 | only if you received the object code with such an offer, in accord 273 | with subsection 6b. 274 | 275 | d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. You need not require recipients to copy the 279 | Corresponding Source along with the object code. If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 
287 | 288 | e) Convey the object code using peer-to-peer transmission, provided 289 | you inform other peers where the object code and Corresponding 290 | Source of the work are being offered to the general public at no 291 | charge under subsection 6d. 292 | 293 | A separable portion of the object code, whose source code is excluded 294 | from the Corresponding Source as a System Library, need not be 295 | included in conveying the object code work. 296 | 297 | A "User Product" is either (1) a "consumer product", which means any 298 | tangible personal property which is normally used for personal, family, 299 | or household purposes, or (2) anything designed or sold for incorporation 300 | into a dwelling. In determining whether a product is a consumer product, 301 | doubtful cases shall be resolved in favor of coverage. For a particular 302 | product received by a particular user, "normally used" refers to a 303 | typical or common use of that class of product, regardless of the status 304 | of the particular user or of the way in which the particular user 305 | actually uses, or expects or is expected to use, the product. A product 306 | is a consumer product regardless of whether the product has substantial 307 | commercial, industrial or non-consumer uses, unless such uses represent 308 | the only significant mode of use of the product. 309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to install 312 | and execute modified versions of a covered work in that User Product from 313 | a modified version of its Corresponding Source. The information must 314 | suffice to ensure that the continued functioning of the modified object 315 | code is in no case prevented or interfered with solely because 316 | modification has been made. 
317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or updates 331 | for a work that has been modified or installed by the recipient, or for 332 | the User Product in which it has been modified or installed. Access to a 333 | network may be denied when the modification itself materially and 334 | adversely affects the operation of the network or violates the rules and 335 | protocols for communication across the network. 336 | 337 | Corresponding Source conveyed, and Installation Information provided, 338 | in accord with this section must be in a format that is publicly 339 | documented (and with an implementation available to the public in 340 | source code form), and must require no special password or key for 341 | unpacking, reading or copying. 342 | 343 | 7. Additional Terms. 344 | 345 | "Additional permissions" are terms that supplement the terms of this 346 | License by making exceptions from one or more of its conditions. 347 | Additional permissions that are applicable to the entire Program shall 348 | be treated as though they were included in this License, to the extent 349 | that they are valid under applicable law. 
If additional permissions 350 | apply only to part of the Program, that part may be used separately 351 | under those permissions, but the entire Program remains governed by 352 | this License without regard to the additional permissions. 353 | 354 | When you convey a copy of a covered work, you may at your option 355 | remove any additional permissions from that copy, or from any part of 356 | it. (Additional permissions may be written to require their own 357 | removal in certain cases when you modify the work.) You may place 358 | additional permissions on material, added by you to a covered work, 359 | for which you have or can give appropriate copyright permission. 360 | 361 | Notwithstanding any other provision of this License, for material you 362 | add to a covered work, you may (if authorized by the copyright holders of 363 | that material) supplement the terms of this License with terms: 364 | 365 | a) Disclaiming warranty or limiting liability differently from the 366 | terms of sections 15 and 16 of this License; or 367 | 368 | b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | 372 | c) Prohibiting misrepresentation of the origin of that material, or 373 | requiring that modified versions of such material be marked in 374 | reasonable ways as different from the original version; or 375 | 376 | d) Limiting the use for publicity purposes of names of licensors or 377 | authors of the material; or 378 | 379 | e) Declining to grant rights under trademark law for use of some 380 | trade names, trademarks, or service marks; or 381 | 382 | f) Requiring indemnification of licensors and authors of that 383 | material by anyone who conveys the material (or modified versions of 384 | it) with contractual assumptions of liability to the recipient, for 385 | any liability that these contractual assumptions directly impose on 
386 | those licensors and authors. 387 | 388 | All other non-permissive additional terms are considered "further 389 | restrictions" within the meaning of section 10. If the Program as you 390 | received it, or any part of it, contains a notice stating that it is 391 | governed by this License along with a term that is a further 392 | restriction, you may remove that term. If a license document contains 393 | a further restriction but permits relicensing or conveying under this 394 | License, you may add to a covered work material governed by the terms 395 | of that license document, provided that the further restriction does 396 | not survive such relicensing or conveying. 397 | 398 | If you add terms to a covered work in accord with this section, you 399 | must place, in the relevant source files, a statement of the 400 | additional terms that apply to those files, or a notice indicating 401 | where to find the applicable terms. 402 | 403 | Additional terms, permissive or non-permissive, may be stated in the 404 | form of a separately written license, or stated as exceptions; 405 | the above requirements apply either way. 406 | 407 | 8. Termination. 408 | 409 | You may not propagate or modify a covered work except as expressly 410 | provided under this License. Any attempt otherwise to propagate or 411 | modify it is void, and will automatically terminate your rights under 412 | this License (including any patent licenses granted under the third 413 | paragraph of section 11). 414 | 415 | However, if you cease all violation of this License, then your 416 | license from a particular copyright holder is reinstated (a) 417 | provisionally, unless and until the copyright holder explicitly and 418 | finally terminates your license, and (b) permanently, if the copyright 419 | holder fails to notify you of the violation by some reasonable means 420 | prior to 60 days after the cessation. 
421 | 422 | Moreover, your license from a particular copyright holder is 423 | reinstated permanently if the copyright holder notifies you of the 424 | violation by some reasonable means, this is the first time you have 425 | received notice of violation of this License (for any work) from that 426 | copyright holder, and you cure the violation prior to 30 days after 427 | your receipt of the notice. 428 | 429 | Termination of your rights under this section does not terminate the 430 | licenses of parties who have received copies or rights from you under 431 | this License. If your rights have been terminated and not permanently 432 | reinstated, you do not qualify to receive new licenses for the same 433 | material under section 10. 434 | 435 | 9. Acceptance Not Required for Having Copies. 436 | 437 | You are not required to accept this License in order to receive or 438 | run a copy of the Program. Ancillary propagation of a covered work 439 | occurring solely as a consequence of using peer-to-peer transmission 440 | to receive a copy likewise does not require acceptance. However, 441 | nothing other than this License grants you permission to propagate or 442 | modify any covered work. These actions infringe copyright if you do 443 | not accept this License. Therefore, by modifying or propagating a 444 | covered work, you indicate your acceptance of this License to do so. 445 | 446 | 10. Automatic Licensing of Downstream Recipients. 447 | 448 | Each time you convey a covered work, the recipient automatically 449 | receives a license from the original licensors, to run, modify and 450 | propagate that work, subject to this License. You are not responsible 451 | for enforcing compliance by third parties with this License. 452 | 453 | An "entity transaction" is a transaction transferring control of an 454 | organization, or substantially all assets of one, or subdividing an 455 | organization, or merging organizations. 
If propagation of a covered 456 | work results from an entity transaction, each party to that 457 | transaction who receives a copy of the work also receives whatever 458 | licenses to the work the party's predecessor in interest had or could 459 | give under the previous paragraph, plus a right to possession of the 460 | Corresponding Source of the work from the predecessor in interest, if 461 | the predecessor has it or can get it with reasonable efforts. 462 | 463 | You may not impose any further restrictions on the exercise of the 464 | rights granted or affirmed under this License. For example, you may 465 | not impose a license fee, royalty, or other charge for exercise of 466 | rights granted under this License, and you may not initiate litigation 467 | (including a cross-claim or counterclaim in a lawsuit) alleging that 468 | any patent claim is infringed by making, using, selling, offering for 469 | sale, or importing the Program or any portion of it. 470 | 471 | 11. Patents. 472 | 473 | A "contributor" is a copyright holder who authorizes use under this 474 | License of the Program or a work on which the Program is based. The 475 | work thus licensed is called the contributor's "contributor version". 476 | 477 | A contributor's "essential patent claims" are all patent claims 478 | owned or controlled by the contributor, whether already acquired or 479 | hereafter acquired, that would be infringed by some manner, permitted 480 | by this License, of making, using, or selling its contributor version, 481 | but do not include claims that would be infringed only as a 482 | consequence of further modification of the contributor version. For 483 | purposes of this definition, "control" includes the right to grant 484 | patent sublicenses in a manner consistent with the requirements of 485 | this License. 
486 | 487 | Each contributor grants you a non-exclusive, worldwide, royalty-free 488 | patent license under the contributor's essential patent claims, to 489 | make, use, sell, offer for sale, import and otherwise run, modify and 490 | propagate the contents of its contributor version. 491 | 492 | In the following three paragraphs, a "patent license" is any express 493 | agreement or commitment, however denominated, not to enforce a patent 494 | (such as an express permission to practice a patent or covenant not to 495 | sue for patent infringement). To "grant" such a patent license to a 496 | party means to make such an agreement or commitment not to enforce a 497 | patent against the party. 498 | 499 | If you convey a covered work, knowingly relying on a patent license, 500 | and the Corresponding Source of the work is not available for anyone 501 | to copy, free of charge and under the terms of this License, through a 502 | publicly available network server or other readily accessible means, 503 | then you must either (1) cause the Corresponding Source to be so 504 | available, or (2) arrange to deprive yourself of the benefit of the 505 | patent license for this particular work, or (3) arrange, in a manner 506 | consistent with the requirements of this License, to extend the patent 507 | license to downstream recipients. "Knowingly relying" means you have 508 | actual knowledge that, but for the patent license, your conveying the 509 | covered work in a country, or your recipient's use of the covered work 510 | in a country, would infringe one or more identifiable patents in that 511 | country that you have reason to believe are valid. 
512 | 513 | If, pursuant to or in connection with a single transaction or 514 | arrangement, you convey, or propagate by procuring conveyance of, a 515 | covered work, and grant a patent license to some of the parties 516 | receiving the covered work authorizing them to use, propagate, modify 517 | or convey a specific copy of the covered work, then the patent license 518 | you grant is automatically extended to all recipients of the covered 519 | work and works based on it. 520 | 521 | A patent license is "discriminatory" if it does not include within 522 | the scope of its coverage, prohibits the exercise of, or is 523 | conditioned on the non-exercise of one or more of the rights that are 524 | specifically granted under this License. You may not convey a covered 525 | work if you are a party to an arrangement with a third party that is 526 | in the business of distributing software, under which you make payment 527 | to the third party based on the extent of your activity of conveying 528 | the work, and under which the third party grants, to any of the 529 | parties who would receive the covered work from you, a discriminatory 530 | patent license (a) in connection with copies of the covered work 531 | conveyed by you (or copies made from those copies), or (b) primarily 532 | for and in connection with specific products or compilations that 533 | contain the covered work, unless you entered into that arrangement, 534 | or that patent license was granted, prior to 28 March 2007. 535 | 536 | Nothing in this License shall be construed as excluding or limiting 537 | any implied license or other defenses to infringement that may 538 | otherwise be available to you under applicable patent law. 539 | 540 | 12. No Surrender of Others' Freedom. 541 | 542 | If conditions are imposed on you (whether by court order, agreement or 543 | otherwise) that contradict the conditions of this License, they do not 544 | excuse you from the conditions of this License. 
If you cannot convey a 545 | covered work so as to satisfy simultaneously your obligations under this 546 | License and any other pertinent obligations, then as a consequence you may 547 | not convey it at all. For example, if you agree to terms that obligate you 548 | to collect a royalty for further conveying from those to whom you convey 549 | the Program, the only way you could satisfy both those terms and this 550 | License would be to refrain entirely from conveying the Program. 551 | 552 | 13. Use with the GNU Affero General Public License. 553 | 554 | Notwithstanding any other provision of this License, you have 555 | permission to link or combine any covered work with a work licensed 556 | under version 3 of the GNU Affero General Public License into a single 557 | combined work, and to convey the resulting work. The terms of this 558 | License will continue to apply to the part which is the covered work, 559 | but the special requirements of the GNU Affero General Public License, 560 | section 13, concerning interaction through a network will apply to the 561 | combination as such. 562 | 563 | 14. Revised Versions of this License. 564 | 565 | The Free Software Foundation may publish revised and/or new versions of 566 | the GNU General Public License from time to time. Such new versions will 567 | be similar in spirit to the present version, but may differ in detail to 568 | address new problems or concerns. 569 | 570 | Each version is given a distinguishing version number. If the 571 | Program specifies that a certain numbered version of the GNU General 572 | Public License "or any later version" applies to it, you have the 573 | option of following the terms and conditions either of that numbered 574 | version or of any later version published by the Free Software 575 | Foundation. If the Program does not specify a version number of the 576 | GNU General Public License, you may choose any version ever published 577 | by the Free Software Foundation. 
578 | 579 | If the Program specifies that a proxy can decide which future 580 | versions of the GNU General Public License can be used, that proxy's 581 | public statement of acceptance of a version permanently authorizes you 582 | to choose that version for the Program. 583 | 584 | Later license versions may give you additional or different 585 | permissions. However, no additional obligations are imposed on any 586 | author or copyright holder as a result of your choosing to follow a 587 | later version. 588 | 589 | 15. Disclaimer of Warranty. 590 | 591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 599 | 600 | 16. Limitation of Liability. 601 | 602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 610 | SUCH DAMAGES. 611 | 612 | 17. Interpretation of Sections 15 and 16. 
613 | 614 | If the disclaimer of warranty and limitation of liability provided 615 | above cannot be given local legal effect according to their terms, 616 | reviewing courts shall apply local law that most closely approximates 617 | an absolute waiver of all civil liability in connection with the 618 | Program, unless a warranty or assumption of liability accompanies a 619 | copy of the Program in return for a fee. 620 | 621 | END OF TERMS AND CONDITIONS 622 | 623 | How to Apply These Terms to Your New Programs 624 | 625 | If you develop a new program, and you want it to be of the greatest 626 | possible use to the public, the best way to achieve this is to make it 627 | free software which everyone can redistribute and change under these terms. 628 | 629 | To do so, attach the following notices to the program. It is safest 630 | to attach them to the start of each source file to most effectively 631 | state the exclusion of warranty; and each file should have at least 632 | the "copyright" line and a pointer to where the full notice is found. 633 | 634 | <one line to give the program's name and a brief idea of what it does.> 635 | Copyright (C) <year>  <name of author> 636 | 637 | This program is free software: you can redistribute it and/or modify 638 | it under the terms of the GNU General Public License as published by 639 | the Free Software Foundation, either version 3 of the License, or 640 | (at your option) any later version. 641 | 642 | This program is distributed in the hope that it will be useful, 643 | but WITHOUT ANY WARRANTY; without even the implied warranty of 644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 645 | GNU General Public License for more details. 646 | 647 | You should have received a copy of the GNU General Public License 648 | along with this program. If not, see <https://www.gnu.org/licenses/>. 649 | 650 | Also add information on how to contact you by electronic and paper mail. 
651 | 652 | If the program does terminal interaction, make it output a short 653 | notice like this when it starts in an interactive mode: 654 | 655 | <program>  Copyright (C) <year>  <name of author> 656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 657 | This is free software, and you are welcome to redistribute it 658 | under certain conditions; type `show c' for details. 659 | 660 | The hypothetical commands `show w' and `show c' should show the appropriate 661 | parts of the General Public License. Of course, your program's commands 662 | might be different; for a GUI interface, you would use an "about box". 663 | 664 | You should also get your employer (if you work as a programmer) or school, 665 | if any, to sign a "copyright disclaimer" for the program, if necessary. 666 | For more information on this, and how to apply and follow the GNU GPL, see 667 | <https://www.gnu.org/licenses/>. 668 | 669 | The GNU General Public License does not permit incorporating your program 670 | into proprietary programs. If your program is a subroutine library, you 671 | may consider it more useful to permit linking proprietary applications with 672 | the library. If this is what you want to do, use the GNU Lesser General 673 | Public License instead of this License. But first, please read 674 | <https://www.gnu.org/philosophy/why-not-lgpl.html>. 675 | --------------------------------------------------------------------------------