├── .gitignore ├── README.md ├── LICENSE ├── ExampleList.txt ├── GetProtein.py ├── GetOrganism.py ├── GetNucleotide.py ├── GetCDS.py ├── SeqExtract.py └── Get16S.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # C extensions 4 | *.so 5 | 6 | # Packages 7 | *.egg 8 | *.egg-info 9 | dist 10 | build 11 | eggs 12 | parts 13 | bin 14 | var 15 | sdist 16 | develop-eggs 17 | .installed.cfg 18 | lib 19 | lib64 20 | __pycache__ 21 | 22 | # Installer logs 23 | pip-log.txt 24 | 25 | # Unit test / coverage reports 26 | .coverage 27 | .tox 28 | nosetests.xml 29 | 30 | # Translations 31 | *.mo 32 | 33 | # Mr Developer 34 | .mr.developer.cfg 35 | .project 36 | .pydevproject 37 | .idea 38 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Genbank-Downloaders 2 | =================== 3 | 4 | A series of small Biopython scripts for downloading sequence data off NCBI's Genbank. 5 | 6 | Here is a short description of each script: 7 | 8 | * **GetCDS.py** - Takes a list of Genbank accessions and downloads the coding sequences for each accession. 9 | * **GetNucleotide.py** - Takes a list of Genbank accessions and downloads a nucleotide sequence for each accession. 10 | * **Get16s.py** - Takes a list of Genbank accessions and downloads a 16S gene for each accession. 11 | * **GetOrganism.py** - Finds the taxa of each accession number in given list. 12 | 13 | For more thorough descriptions and information on usage please check the [**wiki!**] (https://github.com/LeeBergstrand/Genbank-Downloaders/wiki) 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Lee Bergstrand 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /ExampleList.txt: -------------------------------------------------------------------------------- 1 | CP001220 2 | AL123456 3 | AE000516 4 | CP000611 5 | HE608151 6 | CP001658 7 | CU458896 8 | CP004374 9 | CP004376 10 | AE016958 11 | CP000479 12 | CP005928 13 | FR878060 14 | CP006936 15 | CP002329 16 | CP000580 17 | CP000518 18 | CP000384 19 | CP003491 20 | CP003347 21 | CP003900 22 | CP002095 23 | AM408590 24 | AP010918 25 | HE572590 26 | FO203507 27 | FO203508 28 | FO203509 29 | FO203510 30 | CP003322 31 | CP003323 32 | CP003324 33 | CP006835 34 | FM211192 35 | CP000854 36 | CP003078 37 | CP001663 38 | CP003169 39 | CP003053 40 | CP002275 41 | CP000656 42 | CP002385 43 | CP003899 44 | CP000511 45 | CP000325 46 | CP003761 47 | AP008957 48 | FN563149 49 | AP011115 50 | CP006996 51 | CP000431 52 | CP002993 53 | CP003990 54 | CP002994 55 | AP009493 56 | CP003275 57 | CP003720 58 | CP006567 59 | DSM40593 60 | CP006259 61 | HE971709 62 | CU458745 63 | CP004375 64 | CP000385 65 | CP006997 66 | CP006998 67 | EF079106 68 | AP011118 69 | AP011119 70 | AP011120 71 | CP000657 72 | CP000658 73 | CP000659 74 | CP006836 75 | CP000519 76 | CP000520 77 | CP000895 78 | CP002386 79 | CP002387 80 | BX649209 81 | CP003054 82 | CP003055 83 | CP003079 84 | CP003080 85 | CP003081 86 | JQ657805 87 | JQ657806 88 | AP008932 89 | AP008933 90 | CP003762 91 | AP008931 92 | CP000432 93 | CP000433 94 | CP000434 95 | AP011116 96 | AP011117 97 | CP006260 98 | CP006261 99 | HE971710 100 | CP003276 101 | CP003277 102 | CP003721 103 | CP003722 104 | CP003991 105 | CP002995 106 | CP002996 107 | AWQX01000000 108 | AWQW01000000 109 | AOCJ00000000 110 | ADNW02000000 111 | ASHF00000000 112 | ASHE00000000 113 | ASHE01000000 114 | AFHJ00000000 115 | AWQW00000000 116 | AWQX00000000 117 | AYNP00000000 118 | AFVW00000000 119 | CAVJ000000000 120 | AOEX00000000 121 | AHBW00000000 122 | APMY00000000 123 | AROY00000000 124 | AMLP00000000 125 | AOPY00000000 126 | AOPZ00000000 127 | AOHP00000000 128 | AHIL00000000 129 | ADVQ00000000 130 | AKUX00000000 131 | AKUY00000000 132 | AYLW00000000 133 | ANPM00000000 134 | ANPL00000000 135 | AGIQ00000000 136 | CBMO000000000 137 | APKD00000000 138 | CBMJ000000000 139 | ALQB00000000 140 | ALQA00000000 141 | AJFJ00000000 142 | AJFI00000000 143 | AGVE00000000 144 | ADNV00000000 145 | CAPS00000000 146 | AKKP00000000 147 | AVCO00000000 148 | AUZK00000000 149 | ACNO00000000 150 | APJC00000000 151 | AJYC00000000 152 | AODN00000000 153 | AODO00000000 154 | ANIU00000000 155 | AJJH00000000 156 | ANSJ00000000 157 | AORZ00000000 158 | -------------------------------------------------------------------------------- /GetProtein.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Created by: Lee Bergstrand 3 | # Description: A simple program that takes a list of genbank protein accession numbers and downloads 4 | # their associated sequences. It then stores the genes or complete genomes as a single 5 | # peptide fasta file. 6 | # 7 | # Requirements: - This script requires the Biopython module: http://biopython.org/wiki/Download 8 | # - This script requires the SeqExtract module (included in the Bio-Scripts repository) 9 | # - All accessions must link to regular nucleotide genbank records (gene or genome), 10 | # however if a genome is shotgun sequenced you must provide the accession to its Whole 11 | # Genome Shotgun Sequence Project record. 12 | # - Before using this script to access the NCBI's online resources please read the NCBI's 13 | # Entrez User Requirements. If the NCBI finds you are abusing their systems, they can 14 | # and will ban your access! Use the optional email parameter so the NCBI can contact 15 | # you if there is a problem. 16 | # 17 | # Usage: GetProtein.py [email@mail.com] 18 | # Example: GetProtein.py mySeqs.txt JBro@YOLO.com 19 | # ---------------------------------------------------------------------------------------- 20 | # =========================================================================================================== 21 | # Imports: 22 | 23 | import sys 24 | 25 | from SeqExtract import entrezEmail 26 | from SeqExtract import getSeqRecords 27 | 28 | 29 | # =========================================================================================================== 30 | # Functions: 31 | 32 | 33 | # 1: Checks if in proper number of arguments are passed gives instructions on proper use. 34 | def argsCheck(numArgs): 35 | if len(sys.argv) < numArgs or len(sys.argv) > numArgs: 36 | print("Sequence Downloader") 37 | print("By Lee Bergstrand\n") 38 | print("Usage: " + sys.argv[0] + " [email@mail.com]") 39 | print("Examples: " + sys.argv[0] + " mySeqs.txt JBro@YOLO.com\n") 40 | print("Please Note:") 41 | print("Before using this script to access the NCBI's online resources please read the NCBI's") 42 | print("Entrez User Requirements. If the NCBI finds you are abusing their systems, they can") 43 | print("and will ban your access! Use the optional email parameter so the NCBI can contact") 44 | print("you if there is a problem.") 45 | sys.exit(1) # Aborts program. (exit(1) indicates that an error occurred) 46 | 47 | 48 | # =========================================================================================================== 49 | # Main program code: 50 | 51 | # House keeping... 52 | argsCheck(3) # Checks if the number of arguments are correct. 53 | entrezEmail(sys.argv[2]) # Sets up arguments email require for genbank file extraction. 54 | 55 | # Stores file one for input checking. 56 | print(">> Opening sequence list...") 57 | inFile = sys.argv[1] 58 | 59 | # File extension check 60 | if not inFile.endswith(".txt"): 61 | print("[Warning] " + inFile + " may not be a txt file!") 62 | 63 | # Reads sequence file list and stores it as a string object. Safely closes file: 64 | try: 65 | with open(inFile, "r") as newFile: 66 | sequences = newFile.read() 67 | newFile.close() 68 | except IOError: 69 | print("Failed to open " + inFile) 70 | sys.exit(1) 71 | 72 | seqList = sequences.splitlines() # Splits string into a list. Each element is a single line from the string. 73 | 74 | print("You have listed", len(seqList), "sequences. They are:") 75 | print(sequences + "\n\n") 76 | 77 | seqRecords = getSeqRecords(seqList, database_type="protein") # Gets sequence record objects from NCBI using the sequence list as reference. 78 | 79 | outFile = inFile + ".faa" 80 | try: 81 | # Attempted to create to output file. 82 | writeFile = open(outFile, "w") 83 | print("Writing " + outFile + " to file...") 84 | for sequence in seqRecords: 85 | writeFile.write(sequence.format("fasta")) # Write genome as fasta to file. 86 | writeFile.close() 87 | except IOError: 88 | print("Failed to create " + outFile) 89 | sys.exit(1) 90 | 91 | print("Done!") 92 | -------------------------------------------------------------------------------- /GetOrganism.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Created by: Lee Bergstrand 3 | # Description: A simple program that takes a list of genbank geneome accession numbers and finds the name 4 | # of the taxa that these accession numbers represent. 5 | # 6 | # Requirements: - This script requires the Biopython module: http://biopython.org/wiki/Download 7 | # - This script requires the SeqExtract module (included in the Bio-Scripts repository) 8 | # - All accessions must link to regular nucleotide genome records however if a genome 9 | # is shotgun sequenced you must provide the accession to its Whole Genome Shotgun 10 | # Sequence Project record. 11 | # - Before using this script to access the NCBI's online resources please read the NCBI's 12 | # Entrez User Requirements. If the NCBI finds you are abusing their systems, they can 13 | # and will ban your access! Use the optional email parameter so the NCBI can contact 14 | # you if there is a problem. 15 | # 16 | # Usage: GetOrganism.py [email@mail.com] 17 | # Example: GetOrganism.py mySeqs.txt JBro@YOLO.com 18 | # ---------------------------------------------------------------------------------------- 19 | # =========================================================================================================== 20 | # Imports: 21 | 22 | import sys 23 | 24 | from Bio import Entrez 25 | 26 | from SeqExtract import entrezEmail 27 | 28 | 29 | # =========================================================================================================== 30 | # Functions: 31 | 32 | 33 | # 1: Checks if in proper number of arguments are passed gives instructions on proper use. 34 | def argsCheck(numArgs): 35 | if len(sys.argv) < numArgs or len(sys.argv) > numArgs: 36 | print("Sequence Downloader") 37 | print("By Lee Bergstrand\n") 38 | print("Usage: " + sys.argv[0] + " [email@mail.com]") 39 | print("Examples: " + sys.argv[0] + " mySeqs.txt JBro@YOLO.com\n") 40 | print("Please Note:") 41 | print("Before using this script to access the NCBI's online resources please read the NCBI's") 42 | print("Entrez User Requirements. If the NCBI finds you are abusing their systems, they can") 43 | print("and will ban your access! Use the optional email parameter so the NCBI can contact") 44 | print("you if there is a problem.") 45 | sys.exit(1) # Aborts program. (exit(1) indicates that an error occurred) 46 | 47 | 48 | # =========================================================================================================== 49 | # Main program code: 50 | 51 | # House keeping... 52 | argsCheck(3) # Checks if the number of arguments are correct. 53 | entrezEmail(sys.argv[2]) # Sets up arguments email require for genbank file extraction. 54 | 55 | # Stores file one for input checking. 56 | print("Opening sequence list...") 57 | inFile = sys.argv[1] 58 | 59 | # File extension check 60 | if not inFile.endswith(".txt"): 61 | print("[Warning] " + inFile + " may not be a txt file!") 62 | 63 | # Reads sequence file list and stores it as a string object. Safely closes file: 64 | try: 65 | with open(inFile, "r") as newFile: 66 | sequences = newFile.read() 67 | newFile.close() 68 | except IOError: 69 | print("Failed to open " + inFile) 70 | sys.exit(1) 71 | 72 | seqList = sequences.splitlines() # Splits string into a list. Each element is a single line from the string. 73 | 74 | print("You have listed", len(seqList), "sequences. They are:") 75 | 76 | for seq in seqList: 77 | GenbankAccession = seq 78 | 79 | handle = Entrez.esearch(db="nuccore", term=GenbankAccession) 80 | GenbankSearchResults = Entrez.read(handle) 81 | if not GenbankSearchResults["IdList"]: 82 | print(seq + ": Taxa's name not found.") 83 | continue 84 | else: 85 | AccessionGenId = GenbankSearchResults["IdList"][0] 86 | 87 | GenbankInfo = Entrez.esummary(db="nuccore", id=AccessionGenId) 88 | GenbankSummery = Entrez.read(GenbankInfo) 89 | TaxaID = GenbankSummery[0]["TaxId"] 90 | 91 | TaxonomyInfo = Entrez.esummary(db="taxonomy", id=TaxaID) 92 | TaxonomySummary = Entrez.read(TaxonomyInfo) 93 | TaxaName = TaxonomySummary[0]["ScientificName"] 94 | 95 | print(seq + ": " + TaxaName) 96 | -------------------------------------------------------------------------------- /GetNucleotide.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Created by: Lee Bergstrand 3 | # Description: A simple program that takes a list of genbank nucleotide accession numbers and downloads 4 | # their associated sequences. It then stores the genes or complete genomes as a nucleotide 5 | # fasta file. For shotgun sequenced data it stores the contigs in a multiple sequence 6 | # nucleotide fasta file. 7 | # 8 | # Requirements: - This script requires the Biopython module: http://biopython.org/wiki/Download 9 | # - This script requires the SeqExtract module (included in the Bio-Scripts repository) 10 | # - All accessions must link to regular nucleotide genbank records (gene or genome), 11 | # however if a genome is shotgun sequenced you must provide the accession to its Whole 12 | # Genome Shotgun Sequence Project record. 13 | # - Before using this script to access the NCBI's online resources please read the NCBI's 14 | # Entrez User Requirements. If the NCBI finds you are abusing their systems, they can 15 | # and will ban your access! Use the optional email parameter so the NCBI can contact 16 | # you if there is a problem. 17 | # 18 | # Usage: GetNucleotide.py [email@mail.com] 19 | # Example: GetNucleotide.py mySeqs.txt JBro@YOLO.com 20 | # ---------------------------------------------------------------------------------------- 21 | # =========================================================================================================== 22 | # Imports: 23 | 24 | import sys 25 | 26 | from SeqExtract import entrezEmail 27 | from SeqExtract import extractContigs 28 | from SeqExtract import getSeqRecords 29 | from SeqExtract import isSSProject 30 | 31 | 32 | # =========================================================================================================== 33 | # Functions: 34 | 35 | 36 | # 1: Checks if in proper number of arguments are passed gives instructions on proper use. 37 | def argsCheck(numArgs): 38 | if len(sys.argv) < numArgs or len(sys.argv) > numArgs: 39 | print("Sequence Downloader") 40 | print("By Lee Bergstrand\n") 41 | print("Usage: " + sys.argv[0] + " [email@mail.com]") 42 | print("Examples: " + sys.argv[0] + " mySeqs.txt JBro@YOLO.com\n") 43 | print("Please Note:") 44 | print("Before using this script to access the NCBI's online resources please read the NCBI's") 45 | print("Entrez User Requirements. If the NCBI finds you are abusing their systems, they can") 46 | print("and will ban your access! Use the optional email parameter so the NCBI can contact") 47 | print("you if there is a problem.") 48 | sys.exit(1) # Aborts program. (exit(1) indicates that an error occurred) 49 | 50 | 51 | # =========================================================================================================== 52 | # Main program code: 53 | 54 | # House keeping... 55 | argsCheck(3) # Checks if the number of arguments are correct. 56 | entrezEmail(sys.argv[2]) # Sets up arguments email require for genbank file extraction. 57 | 58 | # Stores file one for input checking. 59 | print(">> Opening sequence list...") 60 | inFile = sys.argv[1] 61 | 62 | # File extension check 63 | if not inFile.endswith(".txt"): 64 | print("[Warning] " + inFile + " may not be a txt file!") 65 | 66 | # Reads sequence file list and stores it as a string object. Safely closes file: 67 | try: 68 | with open(inFile, "r") as newFile: 69 | sequences = newFile.read() 70 | newFile.close() 71 | except IOError: 72 | print("Failed to open " + inFile) 73 | sys.exit(1) 74 | 75 | seqList = sequences.splitlines() # Splits string into a list. Each element is a single line from the string. 76 | 77 | print("You have listed", len(seqList), "sequences. They are:") 78 | print(sequences + "\n\n") 79 | 80 | seqRecords = getSeqRecords(seqList) # Gets sequence record objects from NCBI using the sequence list as reference. 81 | 82 | for sequence in seqRecords: 83 | outFile = sequence.id + ".fna" 84 | try: 85 | # Attempted to create to output file. 86 | writeFile = open(outFile, "w") 87 | print("Writing " + outFile + " to file...") 88 | 89 | # Checks if the accession leads to a WGSS project. 90 | # If accession is a WGSS project... 91 | if isSSProject(sequence): 92 | contigList = extractContigs(sequence.id) # Extract all contig accessions. 93 | contigRecords = getSeqRecords(contigList) # Extract sequence record object for each contig. 94 | for contig in contigRecords: 95 | writeFile.write(contig.format("fasta")) # Write each contig to the same file in fasta format. 96 | # If accession is a regular genome... 97 | else: 98 | writeFile.write(sequence.format("fasta")) # Write genome as fasta to file 99 | writeFile.close() 100 | except IOError: 101 | print("Failed to create " + outFile) 102 | sys.exit(1) 103 | 104 | print("Done!") 105 | -------------------------------------------------------------------------------- /GetCDS.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Created by: Lee Bergstrand 3 | # Description: A simple program that takes a list of nucleotide genbank accession numbers and 4 | # downloads the Coding Sequences (CDS) contained within the sequences linked by 5 | # those accessions. It then stores these CDSs within a multi-sequence protein fasta 6 | # file. The script also creates a CSV file containing some essential info about each CDS. 7 | # 8 | # Requirements: - This script requires the Biopython module: http://biopython.org/wiki/Download 9 | # - This script requires the SeqExtract module (included in the Genbank-Downloaders repository) 10 | # - All accessions must link to regular nucleotide genbank records (gene or genome), 11 | # however if a genome is shotgun sequenced you must provide the accession to its Whole 12 | # Genome Shotgun Sequence Project record. 13 | # - Before using this script to access the NCBI's online resources please read the NCBI's 14 | # Entrez User Requirements. If the NCBI finds you are abusing their systems, they can 15 | # and will ban your access! Use the optional email parameter so the NCBI can contact 16 | # you if there is a problem. 17 | # 18 | # Usage: Get16S.py [email@mail.com] 19 | # Example:Get16S.py mySeqs.txt JBro@YOLO.com 20 | # ---------------------------------------------------------------------------------------- 21 | # =========================================================================================================== 22 | # Imports: 23 | 24 | import csv 25 | import sys 26 | 27 | from SeqExtract import entrezEmail 28 | from SeqExtract import extractContigs 29 | from SeqExtract import getProteinAnnotationCSV 30 | from SeqExtract import getProteinAnnotationFasta 31 | from SeqExtract import getSeqRecords 32 | from SeqExtract import isSSProject 33 | 34 | 35 | # =========================================================================================================== 36 | # Functions: 37 | 38 | 39 | # 1: Checks if in proper number of arguments are passed gives instructions on proper use. 40 | def argsCheck(numArgs): 41 | if len(sys.argv) < numArgs or len(sys.argv) > numArgs: 42 | print("Coding Sequence Downloader") 43 | print("By Lee Bergstrand\n") 44 | print("Usage: " + sys.argv[0] + " [email@mail.com]") 45 | print("Examples: " + sys.argv[0] + " mySeqs.txt JBro@YOLO.com\n") 46 | print("Please Note:") 47 | print("Before using this script to access the NCBI's online resources please read the NCBI's") 48 | print("Entrez User Requirements. If the NCBI finds you are abusing their systems, they can") 49 | print("and will ban your access! Use the optional email parameter so the NCBI can contact") 50 | print("you if there is a problem.") 51 | sys.exit(1) # Aborts program. (exit(1) indicates that an error occurred) 52 | 53 | 54 | # =========================================================================================================== 55 | # Main program code: 56 | 57 | # House keeping... 58 | argsCheck(3) # Checks if the number of arguments are correct. 59 | entrezEmail(sys.argv[2]) # Sets up arguments email require for genbank file extraction. 60 | 61 | # Stores file one for input checking. 62 | print(">> Opening sequence list...") 63 | inFile = sys.argv[1] 64 | 65 | # File extension check 66 | if not inFile.endswith(".txt"): 67 | print("[Warning] " + inFile + " may not be a txt file!") 68 | 69 | # Reads sequence file list and stores it as a string object. Safely closes file.try: 70 | try: 71 | with open(inFile, "r") as newFile: 72 | sequences = newFile.read() 73 | newFile.close() 74 | except IOError: 75 | print("Failed to open " + inFile) 76 | sys.exit(1) 77 | 78 | seqList = sequences.splitlines() # Splits string into a list. Each element is a single line from the string. 79 | 80 | print("You have listed", len(seqList), "sequences. They are:") 81 | print(sequences + "\n\n") 82 | 83 | seqRecords = getSeqRecords(seqList) # Gets sequence record objects from NCBI using the sequence list as reference. 84 | 85 | for sequence in seqRecords: 86 | outFile = sequence.id + ".faa" 87 | outCSV = sequence.id + ".csv" 88 | try: 89 | # Attempted to create to output file. 90 | writeFile = open(outFile, "w") 91 | print("Writing " + outFile + " to file...") 92 | csvFile = open(outCSV, "w") 93 | CSVWriter = csv.writer(csvFile) 94 | print("Writing " + outCSV + " to file...") 95 | 96 | # Checks if the accession leads to a WGSS project. 97 | # If accession is a WGSS project... 98 | if isSSProject(sequence): 99 | contigList = extractContigs(sequence.id) # Extract all contig accessions. 100 | contigRecords = getSeqRecords(contigList) # Extract sequence record object for each contig. 101 | for contig in contigRecords: 102 | fasta = getProteinAnnotationFasta(contig) # Builds list fasta files. 103 | csvRows = getProteinAnnotationCSV(contig) # Builds list of csv rows. 104 | for annotation in fasta: 105 | writeFile.write(annotation) 106 | for row in csvRows: 107 | CSVWriter.writerow(row) 108 | # If accession is a regular genome... 109 | else: 110 | OrganismGenomeLength = len(sequence.seq) # Gets Genome Length 111 | fasta = getProteinAnnotationFasta(sequence) # Builds list fasta files. 112 | csvRows = getProteinAnnotationCSV(sequence) # Builds list of csv rows. 113 | for annotation in fasta: 114 | writeFile.write(annotation) 115 | for row in csvRows: 116 | row.append(OrganismGenomeLength) # Appends genome length to the rest of the csv file. 117 | CSVWriter.writerow(row) 118 | writeFile.close() 119 | csvFile.close() 120 | except IOError: 121 | print("Failed to create " + outFile) 122 | sys.exit(1) 123 | print("Done!") 124 | -------------------------------------------------------------------------------- /SeqExtract.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Created by: Lee Bergstrand 3 | # Description: A module that contains functions for extraction of genbank records. 4 | # 5 | # Requirements: - This module requires the Biopython module: http://biopython.org/wiki/Download 6 | # ------------------------------------------------------------------------------------------------------------ 7 | # ============================================================================================================ 8 | # Imports and Setup: 9 | 10 | import re 11 | import sys 12 | 13 | from Bio import Entrez 14 | from Bio import SeqIO 15 | 16 | AccessionBaseRegex = re.compile("^[a-zA-Z]{4}\d{2}") 17 | WGSSProjectRegex = re.compile("[a-zA-Z_]{4,7}\d{8,10}") 18 | 19 | 20 | # ============================================================================================================ 21 | # Functions: 22 | 23 | 24 | # 1: Sets up user email. 25 | def entrezEmail(email): 26 | Entrez.email = email 27 | 28 | 29 | # ------------------------------------------------------------------------------------------------------------ 30 | # 2: When passed an array of accessions from NCBI it returns a list of sequence objects matching those accessions. 31 | def getSeqRecords(seqList, database_type="nucleotide"): 32 | try: 33 | print("Requesting sequence data from genbank...") 34 | handle = Entrez.efetch(db=database_type, id=seqList, rettype="gb", 35 | retmode="genbank") # Gets records and stores them. 36 | print("Starting download...") 37 | SeqRecords = list(SeqIO.parse(handle, "genbank")) # Creates a list of SeqRecord objects from genbank files e. 38 | print("Download Complete.") 39 | handle.close() # Closes handle since it is no longer needed. 40 | except IOError: 41 | print("Failed to connect to NCBI server. ") 42 | sys.exit(1) 43 | return SeqRecords 44 | 45 | 46 | # ------------------------------------------------------------------------------------------------------------ 47 | # 3: When passed a sequence record object returns an array of fasta strings for each annotation. 48 | def getProteinAnnotationFasta(seqRecord): 49 | fasta = [] 50 | features = seqRecord.features # Each sequence has a list (called features) that stores seqFeature objects. 51 | for feature in features: # For each feature on the sequence 52 | if feature.type == "CDS": # CDS means coding sequence (These are the only feature we're interested in) 53 | featQualifers = feature.qualifiers # Each feature contains a dictionary called qualifiers which contains 54 | # data about the sequence feature (for example the translation) 55 | 56 | # Gets the required qualifier. Uses featQualifers.get to return the qualifier or a default value if the quantifier 57 | # is not found. Calls strip to remove unwanted brackets and ' from qualifier before storing it as a string. 58 | protein_id = str(featQualifers.get('protein_id', 'no_protein_id')).strip('\'[]') 59 | if protein_id == 'no_protein_id': 60 | continue # Skips the iteration if protein has no id. 61 | gene = str(featQualifers.get('gene', 'no_gene_name')).strip('\'[]') 62 | product = str(featQualifers.get('product', 'no_product_name')).strip('\'[]') 63 | translated_protein = str(featQualifers.get('translation', 'no_translation')).strip('\'[]') 64 | fasta.append((">" + protein_id + " " + gene + "-" + product + "\n" + translated_protein + "\n")) 65 | return fasta 66 | 67 | 68 | # ------------------------------------------------------------------------------------------------------------ 69 | # 4: Passed a sequence record object returns a list of csv rows. Each row is list containing info for each annotation. 70 | def getProteinAnnotationCSV(seqRecord): 71 | csvRowSet = [] # Master list of all rows. 72 | for feature in seqRecord.features: 73 | if feature.type == "CDS": 74 | csvRow = [] # Created new list for each row. (This is for compatibility with the csv module's row write) 75 | CDSLocal = feature.location # Gets the feature location 76 | featQualifers = feature.qualifiers # Gets sequence quantifiers. 77 | 78 | # Gets sequence quantifiers. 79 | gene = str(featQualifers.get('gene', 'no_gene_name')).strip('\'[]') 80 | product = str(featQualifers.get('product', 'no_product_name')).strip('\'[]') 81 | proteinID = str(featQualifers.get('protein_id', 'no_protein_id')).strip('\'[]') 82 | locus = str(featQualifers.get('locus_tag', 'no_locus_tag')).strip('\'[]') 83 | if proteinID == 'no_protein_id': 84 | continue # Skips the iteration if protein has no id. 85 | # Append quantifers and other information to the csv row list. 86 | csvRow.append(seqRecord.annotations["organism"]) 87 | csvRow.append(seqRecord.id) 88 | csvRow.append(proteinID) 89 | csvRow.append(gene) 90 | csvRow.append(str(CDSLocal.start)) 91 | csvRow.append(str(CDSLocal.end)) 92 | csvRow.append(str(CDSLocal.strand)) 93 | csvRow.append(locus) 94 | csvRow.append(product) 95 | 96 | csvRowSet.append(csvRow) # Appends the csvRow list to the master list of rows. 97 | 98 | return csvRowSet 99 | 100 | 101 | # ------------------------------------------------------------------------------------------------------------ 102 | # 5: Checks if genome is a WGSS project. 103 | def isSSProject(sequence): 104 | is_ssp = False 105 | 106 | if sequence.annotations.get('wgs'): 107 | is_ssp = True 108 | 109 | return is_ssp 110 | 111 | 112 | # ------------------------------------------------------------------------------------------------------------ 113 | # 6: When passed a list of sequence record objects returns an list of fasta strings for each annotation. 114 | # This implementation is "quick and dirty" shall be replaced in later versions. 115 | def extractContigs(seqList): 116 | SeqRecords = getSeqRecords(seqList) 117 | contigList = [] 118 | 119 | for WGSS in SeqRecords: 120 | 121 | WGSSRange = WGSS.annotations["wgs"] # Extracts WGSS contig accession range. 122 | 123 | if len(WGSSRange) == 2: 124 | # Extracts the accession base from first contig accession number. 125 | AccessionBase = (AccessionBaseRegex.findall(WGSSRange[0])[0]) 126 | 127 | # Takes both the the min and max accession and slices off (using python's string slice syntax s[start:end:step]) 128 | # the accession base code leaving the numerical difference between the contigs. 129 | # Converts these differences to integers. 130 | WGSSRangeMin = int(WGSSRange[0][6:]) 131 | WGSSRangeMax = int(WGSSRange[1][6:]) 132 | 133 | # WGSS accession number length actually varies. Its normally 12 characters but I have seen 13 before. 134 | # The code block below accounts for this. 135 | zeroOffset = 6 136 | accessionLength = len(WGSSRange[0]) 137 | if accessionLength != 12: 138 | zeroOffset = accessionLength - 6 # 6 is the length of the standard accession base. 139 | 140 | # Creates accession list 141 | for x in range(WGSSRangeMin, (WGSSRangeMax + 1)): 142 | contigAccession = AccessionBase 143 | contigAccession += ("{0:0" + str(zeroOffset) + "d}").format( 144 | x) # Uses zero offset to make accessions proper length. 145 | contigList.append(contigAccession) 146 | else: 147 | contigList.append(WGSSRange[0]) # If one contig, simply append it to the list. 148 | 149 | return contigList 150 | -------------------------------------------------------------------------------- /Get16S.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Created by: Lee Bergstrand 3 | # Descript: A simple program that takes a list of nucleotide genbank accession numbers and 4 | # downloads the 16S ribosomal RNA contained within the sequences linked to those 5 | # accessions. Its then stores these 16S genes within a multi-sequence fasta file. The 6 | # script also creates a CSV file containing the genomes for which no 16S annotation was found. 7 | # 8 | # Requirements: - This script requires the Biopython module: http://biopython.org/wiki/Download 9 | # - This script requires the SeqExtract module (included in the Genbank-Downloaders repository) 10 | # - All accessions must link to regular nucleotide genbank records (genome), 11 | # however if a genome is shotgun sequenced you must provide the accession to its Whole 12 | # Genome Shotgun Sequence Project record. 13 | # - Before using this script to access the NCBI's online resources please read the NCBI's 14 | # Entrez User Requirements. If the NCBI finds you are abusing their systems, they can 15 | # and will ban your access! Use the optional email parameter so the NCBI can contact 16 | # you if there is a problem. 17 | # 18 | # Notes: - This script only extracts the first 16S gene found not all the 16S genes. 19 | # 20 | # Usage: Get16S.py [email@mail.com] 21 | # Example: Get16S.py mySeqs.txt JBro@YOLO.com 22 | # ---------------------------------------------------------------------------------------- 23 | # =========================================================================================================== 24 | # Imports: 25 | 26 | import csv 27 | import sys 28 | 29 | from SeqExtract import entrezEmail 30 | from SeqExtract import extractContigs 31 | from SeqExtract import getSeqRecords 32 | from SeqExtract import isSSProject 33 | 34 | 35 | # =========================================================================================================== 36 | # Functions: 37 | 38 | 39 | # 1: Checks if in proper number of arguments are passed gives instructions on proper use. 40 | def argsCheck(numArgs): 41 | if len(sys.argv) < numArgs or len(sys.argv) > numArgs: 42 | print("Coding Sequence Downloader") 43 | print("By Lee Bergstrand\n") 44 | print("Usage: " + sys.argv[0] + " [email@mail.com]") 45 | print("Examples: " + sys.argv[0] + " mySeqs.txt JBro@YOLO.com\n") 46 | print("Please Note:") 47 | print("Before using this script to access the NCBI's online resources please read the NCBI's") 48 | print("Entrez User Requirements. If the NCBI finds you are abusing their systems, they can") 49 | print("and will ban your access! Use the optional email parameter so the NCBI can contact") 50 | print("you if there is a problem.") 51 | sys.exit(1) # Aborts program. (exit(1) indicates that an error occurred) 52 | 53 | 54 | # --------------------------------------------------------------------------------------------------------- 55 | # 2: Gets reverse complement of DNA string. 56 | def reverseCompliment(sequence): 57 | sequence.upper() 58 | 59 | basecomplement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'} 60 | letters = list(sequence) 61 | letters = [basecomplement[base] for base in letters] 62 | sequence = "".join(letters) 63 | 64 | sequence = sequence[::-1] # Reverses sequence using pythons string slice syntax. 65 | return sequence 66 | 67 | 68 | # --------------------------------------------------------------------------------------------------------- 69 | # 3: Gets 16S DNA as a fasta. 70 | def extract16sFasta(organismID, feature, record): 71 | start = feature.location.nofuzzy_start 72 | end = feature.location.nofuzzy_end 73 | strand = feature.location.strand 74 | sequence = str(record.seq[start:end]) # Extracts subsequence from the genome according to location of the feature. 75 | 76 | if strand == -1: # Converts subsequence to reverse complement if on negative strand. 77 | sequence = reverseCompliment(sequence) 78 | 79 | fasta = ">%s\n%s" % (organismID, sequence) 80 | return fasta 81 | 82 | 83 | # --------------------------------------------------------------------------------------------------------- 84 | # 4: Gets a list of 16s FASTAs from a genome. 85 | def get16sFasta(sequenceID, record): 86 | FASTAS = [] 87 | organismID = sequenceID 88 | features = record.features 89 | for feature in features: 90 | if feature.type == "rRNA": 91 | if "16S" in feature.qualifiers["product"][0]: 92 | fasta = extract16sFasta(organismID, feature, record) 93 | if 1000 < len(fasta) < 2000: # Removes partial sequences and really large sequence do to genbank. 94 | FASTAS.append(fasta) 95 | return FASTAS 96 | 97 | 98 | # =========================================================================================================== 99 | # Main program code: 100 | 101 | # House keeping... 102 | argsCheck(3) # Checks if the number of arguments are correct. 103 | entrezEmail(sys.argv[2]) # Sets up arguments email require for genbank file extraction. 104 | 105 | # Stores file one for input checking. 106 | print() 107 | ">> Opening sequence list..." 108 | inFile = sys.argv[1] 109 | 110 | # File extension check 111 | if not inFile.endswith(".txt"): 112 | print("[Warning] " + inFile + " may not be a txt file!") 113 | 114 | # Reads sequence file list and stores it as a string object. Safely closes file.try: 115 | try: 116 | with open(inFile, "r") as newFile: 117 | sequences = newFile.read() 118 | newFile.close() 119 | except IOError: 120 | print("Failed to open " + inFile) 121 | sys.exit(1) 122 | 123 | seqList = sequences.splitlines() # Splits string into a list. Each element is a single line from the string. 124 | 125 | print() 126 | "You have listed", len(seqList), "sequences. They are:" 127 | print(sequences + "\n") 128 | 129 | seqRecords = getSeqRecords( 130 | seqList) # Acquires list of sequence record objects from NCBI using the sequence list as reference. 131 | 132 | No16sGenomes = [] 133 | SixTeens = [] 134 | for sequence in seqRecords: 135 | sequenceID = sequence.id 136 | No16s = True 137 | if "plasmid" in sequence.description.lower(): # If sequence is from a plasmid skip the iteration. 138 | continue 139 | if isSSProject(sequence): # If accession is a WGSS project... 140 | contigList = extractContigs(sequence.id) # Extract all contig accessions. 141 | contigRecords = getSeqRecords(contigList) # Extract sequence record object for each contig. 142 | for contig in contigRecords: 143 | fasta = get16sFasta(sequenceID, contig) # Builds list fasta files. 144 | if fasta: # If 16S is found. 145 | No16s = False 146 | SixTeens.append(fasta[0]) 147 | break # If 16S is found in one contig break out and skip all the other contigs 148 | else: # If accession is a regular genome... 149 | fasta = get16sFasta(sequenceID, sequence) # Builds list fasta files. 150 | if fasta: 151 | No16s = False 152 | SixTeens.append(fasta[0]) 153 | if No16s: # If not 16S is found add genome to the no 16s found list. 154 | No16sGenomeInfo = [sequenceID, sequence.annotations["organism"]] 155 | No16sGenomes.append(No16sGenomeInfo) 156 | OutSixTeens = "\n".join(SixTeens) 157 | 158 | try: 159 | # Attempted to create to output files. 160 | outFile = "SixTeenSSFromGenbank.fna" 161 | outCSVFile = "NoSixTeenGenomes.csv" 162 | 163 | print("Writing " + outFile + " to file...") 164 | writeFile = open(outFile, "w") 165 | writeFile.write(OutSixTeens) 166 | writeFile.close() 167 | 168 | if No16sGenomes: 169 | print("Writing " + outCSVFile + " to file...") 170 | csvFile = open(outCSVFile, "w") 171 | CSVWriter = csv.writer(csvFile) 172 | for genome in No16sGenomes: 173 | CSVWriter.writerow(genome) 174 | csvFile.close() 175 | except IOError: 176 | print("Failed to create " + outFile) 177 | sys.exit(1) 178 | 179 | print("Done!") 180 | --------------------------------------------------------------------------------