├── .gitignore
├── README.md
├── LICENSE
├── ExampleList.txt
├── GetProtein.py
├── GetOrganism.py
├── GetNucleotide.py
├── GetCDS.py
├── SeqExtract.py
└── Get16S.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | *.py[cod]
 2 | 
 3 | # C extensions
 4 | *.so
 5 | 
 6 | # Packages
 7 | *.egg
 8 | *.egg-info
 9 | dist
10 | build
11 | eggs
12 | parts
13 | bin
14 | var
15 | sdist
16 | develop-eggs
17 | .installed.cfg
18 | lib
19 | lib64
20 | __pycache__
21 | 
22 | # Installer logs
23 | pip-log.txt
24 | 
25 | # Unit test / coverage reports
26 | .coverage
27 | .tox
28 | nosetests.xml
29 | 
30 | # Translations
31 | *.mo
32 | 
33 | # Mr Developer
34 | .mr.developer.cfg
35 | .project
36 | .pydevproject
37 | .idea
38 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | Genbank-Downloaders
 2 | ===================
 3 | 
 4 | A series of small Biopython scripts for downloading sequence data off NCBI's Genbank.
 5 | 
 6 | Here is a short description of each script:
 7 | 
 8 | * **GetCDS.py** - Takes a list of Genbank accessions and downloads the coding sequences for each accession.
 9 | * **GetNucleotide.py** - Takes a list of Genbank accessions and downloads a nucleotide sequence for each accession.
10 | * **Get16S.py** - Takes a list of Genbank accessions and downloads a 16S gene for each accession.
11 | * **GetOrganism.py** - Finds the taxa of each accession number in given list.
12 | 
13 | For more thorough descriptions and information on usage please check the [**wiki!**](https://github.com/LeeBergstrand/Genbank-Downloaders/wiki)
14 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2014 Lee Bergstrand
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of
 6 | this software and associated documentation files (the "Software"), to deal in
 7 | the Software without restriction, including without limitation the rights to
 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
 9 | the Software, and to permit persons to whom the Software is furnished to do so,
10 | subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 | 


--------------------------------------------------------------------------------
/ExampleList.txt:
--------------------------------------------------------------------------------
  1 | CP001220
  2 | AL123456
  3 | AE000516
  4 | CP000611
  5 | HE608151
  6 | CP001658
  7 | CU458896
  8 | CP004374
  9 | CP004376
 10 | AE016958
 11 | CP000479
 12 | CP005928
 13 | FR878060
 14 | CP006936
 15 | CP002329
 16 | CP000580
 17 | CP000518
 18 | CP000384
 19 | CP003491
 20 | CP003347
 21 | CP003900
 22 | CP002095
 23 | AM408590
 24 | AP010918
 25 | HE572590
 26 | FO203507
 27 | FO203508
 28 | FO203509
 29 | FO203510
 30 | CP003322
 31 | CP003323
 32 | CP003324
 33 | CP006835
 34 | FM211192
 35 | CP000854
 36 | CP003078
 37 | CP001663
 38 | CP003169
 39 | CP003053
 40 | CP002275
 41 | CP000656
 42 | CP002385
 43 | CP003899
 44 | CP000511
 45 | CP000325
 46 | CP003761
 47 | AP008957
 48 | FN563149
 49 | AP011115
 50 | CP006996
 51 | CP000431
 52 | CP002993
 53 | CP003990
 54 | CP002994
 55 | AP009493
 56 | CP003275
 57 | CP003720
 58 | CP006567
 59 | DSM40593
 60 | CP006259
 61 | HE971709
 62 | CU458745
 63 | CP004375
 64 | CP000385
 65 | CP006997
 66 | CP006998
 67 | EF079106
 68 | AP011118
 69 | AP011119
 70 | AP011120
 71 | CP000657
 72 | CP000658
 73 | CP000659
 74 | CP006836
 75 | CP000519
 76 | CP000520
 77 | CP000895
 78 | CP002386
 79 | CP002387
 80 | BX649209
 81 | CP003054
 82 | CP003055
 83 | CP003079
 84 | CP003080
 85 | CP003081
 86 | JQ657805
 87 | JQ657806
 88 | AP008932
 89 | AP008933
 90 | CP003762
 91 | AP008931
 92 | CP000432
 93 | CP000433
 94 | CP000434
 95 | AP011116
 96 | AP011117
 97 | CP006260
 98 | CP006261
 99 | HE971710
100 | CP003276
101 | CP003277
102 | CP003721
103 | CP003722
104 | CP003991
105 | CP002995
106 | CP002996
107 | AWQX01000000
108 | AWQW01000000
109 | AOCJ00000000
110 | ADNW02000000
111 | ASHF00000000
112 | ASHE00000000
113 | ASHE01000000
114 | AFHJ00000000
115 | AWQW00000000
116 | AWQX00000000
117 | AYNP00000000
118 | AFVW00000000
119 | CAVJ000000000
120 | AOEX00000000
121 | AHBW00000000
122 | APMY00000000
123 | AROY00000000
124 | AMLP00000000
125 | AOPY00000000
126 | AOPZ00000000
127 | AOHP00000000
128 | AHIL00000000
129 | ADVQ00000000
130 | AKUX00000000
131 | AKUY00000000
132 | AYLW00000000
133 | ANPM00000000
134 | ANPL00000000
135 | AGIQ00000000
136 | CBMO000000000
137 | APKD00000000
138 | CBMJ000000000
139 | ALQB00000000
140 | ALQA00000000
141 | AJFJ00000000
142 | AJFI00000000
143 | AGVE00000000
144 | ADNV00000000
145 | CAPS00000000
146 | AKKP00000000
147 | AVCO00000000
148 | AUZK00000000
149 | ACNO00000000
150 | APJC00000000
151 | AJYC00000000
152 | AODN00000000
153 | AODO00000000
154 | ANIU00000000
155 | AJJH00000000
156 | ANSJ00000000
157 | AORZ00000000
158 | 


--------------------------------------------------------------------------------
/GetProtein.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python 
 2 | # Created by: Lee Bergstrand 
 3 | # Description: A simple program that takes a list of genbank protein accession numbers and downloads
 4 | #           their associated sequences. It then stores the genes or complete genomes as a single 
 5 | #           peptide fasta file. 
 6 | #
 7 | # Requirements: - This script requires the Biopython module: http://biopython.org/wiki/Download
 8 | #               - This script requires the SeqExtract module (included in the Genbank-Downloaders repository)
 9 | #               - All accessions must link to genbank protein records. The sequences are requested
10 | #                 from the NCBI protein database, so nucleotide (gene, genome or Whole Genome
11 | #                 Shotgun Sequence Project) accessions are not appropriate input for this script.
12 | #               - Before using this script to access the NCBI's online resources please read the NCBI's 
13 | #                 Entrez User Requirements. If the NCBI finds you are abusing their systems, they can 
14 | #                 and will ban your access! Use the optional email parameter so the NCBI can contact 
15 | #                 you if there is a problem.
16 | #  
17 | # Usage: GetProtein.py <sequences.txt> [email@mail.com]
18 | # Example: GetProtein.py mySeqs.txt JBro@YOLO.com
19 | # ----------------------------------------------------------------------------------------
20 | # ===========================================================================================================
21 | # Imports:
22 | 
23 | import sys
24 | 
25 | from SeqExtract import entrezEmail
26 | from SeqExtract import getSeqRecords
27 | 
28 | 
29 | # ===========================================================================================================
30 | # Functions:
31 | 
32 | 
33 | # 1: Checks if in proper number of arguments are passed gives instructions on proper use.
def argsCheck(numArgs):
    """Abort with usage instructions unless exactly ``numArgs`` arguments were given.

    ``numArgs`` is compared against ``len(sys.argv)``, so it counts the script
    name as well as the user-supplied arguments. When the count matches, the
    function simply returns. Otherwise it prints the usage text to standard
    output and terminates the program with exit status 1.
    """
    if len(sys.argv) == numArgs:
        return

    script = sys.argv[0]
    usage_lines = (
        "Sequence Downloader",
        "By Lee Bergstrand\n",
        "Usage: " + script + " <sequences.txt> [email@mail.com]",
        "Examples: " + script + " mySeqs.txt JBro@YOLO.com\n",
        "Please Note:",
        "Before using this script to access the NCBI's online resources please read the NCBI's",
        "Entrez User Requirements. If the NCBI finds you are abusing their systems, they can",
        "and will ban your access! Use the optional email parameter so the NCBI can contact",
        "you if there is a problem.",
    )
    for line in usage_lines:
        print(line)

    sys.exit(1)  # A non-zero status tells the caller that an error occurred.
46 | 
47 | 
48 | # ===========================================================================================================
49 | # Main program code:
50 | 
# House keeping...
# NOTE: the usage text shows the email as optional, but argsCheck(3) makes it mandatory
# (script name + sequence list + email).
argsCheck(3)  # Exits with usage instructions unless the argument count is correct.
entrezEmail(sys.argv[2])  # Registers the user's email with Entrez before any NCBI request is made.

# Stores the path of the sequence list for input checking.
print(">> Opening sequence list...")
inFile = sys.argv[1]

# File extension check. This is only a warning; the file is still read.
if not inFile.endswith(".txt"):
    print("[Warning] " + inFile + " may not be a txt file!")

# Reads the sequence list and stores it as a single string.
# The with-statement closes the file, so no explicit close() is needed.
try:
    with open(inFile, "r") as newFile:
        sequences = newFile.read()
except IOError:
    print("Failed to open " + inFile)
    sys.exit(1)

seqList = sequences.splitlines()  # One list element per line of the input file.

print("You have listed", len(seqList), "sequences. They are:")
print(sequences + "\n\n")

# Gets sequence record objects from the NCBI protein database using the sequence list as reference.
seqRecords = getSeqRecords(seqList, database_type="protein")

# All records go into one multi-sequence fasta file named after the input list.
outFile = inFile + ".faa"
try:
    # The context manager guarantees the output file is closed even if a write fails.
    with open(outFile, "w") as writeFile:
        print("Writing " + outFile + " to file...")
        for sequence in seqRecords:
            writeFile.write(sequence.format("fasta"))  # Write each record as fasta to the file.
except IOError:
    print("Failed to create " + outFile)
    sys.exit(1)

print("Done!")
92 | 


--------------------------------------------------------------------------------
/GetOrganism.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python 
 2 | # Created by: Lee Bergstrand 
 3 | # Description: A simple program that takes a list of genbank genome accession numbers and finds the name
 4 | #			of the taxa that these accession numbers represent.
 5 | #
 6 | # Requirements: - This script requires the Biopython module: http://biopython.org/wiki/Download
 7 | #               - This script requires the SeqExtract module (included in the Genbank-Downloaders repository)
 8 | #               - All accessions must link to regular nucleotide genome records however if a genome
 9 | #                 is shotgun sequenced you must provide the accession to its Whole Genome Shotgun 
10 | #                 Sequence Project record.
11 | #               - Before using this script to access the NCBI's online resources please read the NCBI's 
12 | #                 Entrez User Requirements. If the NCBI finds you are abusing their systems, they can 
13 | #                 and will ban your access! Use the optional email parameter so the NCBI can contact 
14 | #                 you if there is a problem.
15 | #  
16 | # Usage: GetOrganism.py <sequences.txt> [email@mail.com]
17 | # Example: GetOrganism.py mySeqs.txt JBro@YOLO.com
18 | # ----------------------------------------------------------------------------------------
19 | # ===========================================================================================================
20 | # Imports:
21 | 
22 | import sys
23 | 
24 | from Bio import Entrez
25 | 
26 | from SeqExtract import entrezEmail
27 | 
28 | 
29 | # ===========================================================================================================
30 | # Functions:
31 | 
32 | 
33 | # 1: Checks if in proper number of arguments are passed gives instructions on proper use.
def argsCheck(numArgs):
    """Verify the command-line argument count, exiting with usage help on mismatch.

    The check is ``len(sys.argv) == numArgs``; the script name is therefore
    part of the count. On a mismatch the usage text is printed to standard
    output and the program exits with status 1.
    """
    if len(sys.argv) != numArgs:
        program = sys.argv[0]
        help_text = "\n".join([
            "Sequence Downloader",
            "By Lee Bergstrand\n",
            "Usage: " + program + " <sequences.txt> [email@mail.com]",
            "Examples: " + program + " mySeqs.txt JBro@YOLO.com\n",
            "Please Note:",
            "Before using this script to access the NCBI's online resources please read the NCBI's",
            "Entrez User Requirements. If the NCBI finds you are abusing their systems, they can",
            "and will ban your access! Use the optional email parameter so the NCBI can contact",
            "you if there is a problem.",
        ])
        print(help_text)
        sys.exit(1)  # Aborts program. (exit(1) indicates that an error occurred)
46 | 
47 | 
48 | # ===========================================================================================================
49 | # Main program code:
50 | 
# House keeping...
# NOTE: the usage text shows the email as optional, but argsCheck(3) makes it mandatory
# (script name + sequence list + email).
argsCheck(3)  # Exits with usage instructions unless the argument count is correct.
entrezEmail(sys.argv[2])  # Registers the user's email with Entrez before any NCBI request is made.

# Stores the path of the sequence list for input checking.
print("Opening sequence list...")
inFile = sys.argv[1]

# File extension check. This is only a warning; the file is still read.
if not inFile.endswith(".txt"):
    print("[Warning] " + inFile + " may not be a txt file!")

# Reads the sequence list and stores it as a single string.
# The with-statement closes the file, so no explicit close() is needed.
try:
    with open(inFile, "r") as newFile:
        sequences = newFile.read()
except IOError:
    print("Failed to open " + inFile)
    sys.exit(1)

# One accession per line. Blank lines are dropped so they are not sent to NCBI as empty search terms.
seqList = [line.strip() for line in sequences.splitlines() if line.strip()]

print("You have listed", len(seqList), "sequences. They are:")

for seq in seqList:
    # Step 1: look the accession up in the nuccore database to obtain its record id.
    handle = Entrez.esearch(db="nuccore", term=seq)
    try:
        GenbankSearchResults = Entrez.read(handle)
    finally:
        handle.close()  # Always release the network handle, even if parsing fails.

    if not GenbankSearchResults["IdList"]:
        print(seq + ": Taxa's name not found.")
        continue

    AccessionGenId = GenbankSearchResults["IdList"][0]  # Only the first hit is used.

    # Step 2: the record summary carries the taxonomy id of the source organism.
    GenbankInfo = Entrez.esummary(db="nuccore", id=AccessionGenId)
    try:
        GenbankSummary = Entrez.read(GenbankInfo)
    finally:
        GenbankInfo.close()
    TaxaID = GenbankSummary[0]["TaxId"]

    # Step 3: the taxonomy summary supplies the scientific name for that taxonomy id.
    TaxonomyInfo = Entrez.esummary(db="taxonomy", id=TaxaID)
    try:
        TaxonomySummary = Entrez.read(TaxonomyInfo)
    finally:
        TaxonomyInfo.close()
    TaxaName = TaxonomySummary[0]["ScientificName"]

    print(seq + ": " + TaxaName)
96 | 


--------------------------------------------------------------------------------
/GetNucleotide.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python 
  2 | # Created by: Lee Bergstrand 
  3 | # Description: A simple program that takes a list of genbank nucleotide accession numbers and downloads
  4 | #           their associated sequences. It then stores the genes or complete genomes as a nucleotide
  5 | #           fasta file. For shotgun sequenced data it stores the contigs in a multiple sequence
  6 | #	    	nucleotide fasta file. 
  7 | #
  8 | # Requirements: - This script requires the Biopython module: http://biopython.org/wiki/Download
  9 | #               - This script requires the SeqExtract module (included in the Genbank-Downloaders repository)
 10 | #               - All accessions must link to regular nucleotide genbank records (gene or genome),
 11 | #                 however if a genome is shotgun sequenced you must provide the accession to its Whole
 12 | #                 Genome Shotgun Sequence Project record.
 13 | #               - Before using this script to access the NCBI's online resources please read the NCBI's 
 14 | #                 Entrez User Requirements. If the NCBI finds you are abusing their systems, they can 
 15 | #                 and will ban your access! Use the optional email parameter so the NCBI can contact 
 16 | #                 you if there is a problem.
 17 | #  
 18 | # Usage: GetNucleotide.py <sequences.txt> [email@mail.com]
 19 | # Example: GetNucleotide.py mySeqs.txt JBro@YOLO.com
 20 | # ----------------------------------------------------------------------------------------
 21 | # ===========================================================================================================
 22 | # Imports:
 23 | 
 24 | import sys
 25 | 
 26 | from SeqExtract import entrezEmail
 27 | from SeqExtract import extractContigs
 28 | from SeqExtract import getSeqRecords
 29 | from SeqExtract import isSSProject
 30 | 
 31 | 
 32 | # ===========================================================================================================
 33 | # Functions:
 34 | 
 35 | 
 36 | # 1: Checks if in proper number of arguments are passed gives instructions on proper use.
 37 | def argsCheck(numArgs):
 38 |     if len(sys.argv) < numArgs or len(sys.argv) > numArgs:
 39 |         print("Sequence Downloader")
 40 |         print("By Lee Bergstrand\n")
 41 |         print("Usage: " + sys.argv[0] + " <sequences.txt> [email@mail.com]")
 42 |         print("Examples: " + sys.argv[0] + " mySeqs.txt JBro@YOLO.com\n")
 43 |         print("Please Note:")
 44 |         print("Before using this script to access the NCBI's online resources please read the NCBI's")
 45 |         print("Entrez User Requirements. If the NCBI finds you are abusing their systems, they can")
 46 |         print("and will ban your access! Use the optional email parameter so the NCBI can contact")
 47 |         print("you if there is a problem.")
 48 |         sys.exit(1)  # Aborts program. (exit(1) indicates that an error occurred)
 49 | 
 50 | 
 51 | # ===========================================================================================================
 52 | # Main program code:
 53 | 
 54 | # House keeping...
 55 | argsCheck(3)  # Checks if the number of arguments are correct.
 56 | entrezEmail(sys.argv[2])  # Sets up arguments email require for genbank file extraction.
 57 | 
 58 | # Stores file one for input checking.
 59 | print(">> Opening sequence list...")
 60 | inFile = sys.argv[1]
 61 | 
 62 | # File extension check
 63 | if not inFile.endswith(".txt"):
 64 |     print("[Warning] " + inFile + " may not be a txt file!")
 65 | 
 66 | # Reads sequence file list and stores it as a string object. Safely closes file:
 67 | try:
 68 |     with open(inFile, "r") as newFile:
 69 |         sequences = newFile.read()
 70 |         newFile.close()
 71 | except IOError:
 72 |     print("Failed to open " + inFile)
 73 |     sys.exit(1)
 74 | 
 75 | seqList = sequences.splitlines()  # Splits string into a list. Each element is a single line from the string.
 76 | 
 77 | print("You have listed", len(seqList), "sequences. They are:")
 78 | print(sequences + "\n\n")
 79 | 
 80 | seqRecords = getSeqRecords(seqList)  # Gets sequence record objects from NCBI using the sequence list as reference.
 81 | 
 82 | for sequence in seqRecords:
 83 |     outFile = sequence.id + ".fna"
 84 |     try:
 85 |         # Attempted to create to output file.
 86 |         writeFile = open(outFile, "w")
 87 |         print("Writing " + outFile + " to file...")
 88 | 
 89 |         # Checks if the accession leads to a WGSS project.
 90 |         # If accession is a WGSS project...
 91 |         if isSSProject(sequence):
 92 |             contigList = extractContigs(sequence.id)  # Extract all contig accessions.
 93 |             contigRecords = getSeqRecords(contigList)  # Extract sequence record object for each contig.
 94 |             for contig in contigRecords:
 95 |                 writeFile.write(contig.format("fasta"))  # Write each contig to the same file in fasta format.
 96 |         # If accession is a regular genome...
 97 |         else:
 98 |             writeFile.write(sequence.format("fasta"))  # Write genome as fasta to file
 99 |         writeFile.close()
100 |     except IOError:
101 |         print("Failed to create " + outFile)
102 |         sys.exit(1)
103 | 
104 | print("Done!")
105 | 


--------------------------------------------------------------------------------
/GetCDS.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python 
  2 | # Created by: Lee Bergstrand 
  3 | # Description: A simple program that takes a list of nucleotide genbank accession numbers and
  4 | #			downloads the Coding Sequences (CDS) contained within the sequences linked by  
  5 | #  			those accessions. It then stores these CDSs within a multi-sequence protein fasta
  6 | #			file. The script also creates a CSV file containing some essential info about each CDS.
  7 | #
  8 | # Requirements: - This script requires the Biopython module: http://biopython.org/wiki/Download
  9 | #               - This script requires the SeqExtract module (included in the Genbank-Downloaders repository)
 10 | #               - All accessions must link to regular nucleotide genbank records (gene or genome),
 11 | #                 however if a genome is shotgun sequenced you must provide the accession to its Whole
 12 | #                 Genome Shotgun Sequence Project record.
 13 | #               - Before using this script to access the NCBI's online resources please read the NCBI's 
 14 | #                 Entrez User Requirements. If the NCBI finds you are abusing their systems, they can 
 15 | #                 and will ban your access! Use the optional email parameter so the NCBI can contact 
 16 | #                 you if there is a problem.
 17 | #  
 18 | # Usage: GetCDS.py <sequences.txt> [email@mail.com]
 19 | # Example: GetCDS.py mySeqs.txt JBro@YOLO.com
 20 | # ----------------------------------------------------------------------------------------
 21 | # ===========================================================================================================
 22 | # Imports:
 23 | 
 24 | import csv
 25 | import sys
 26 | 
 27 | from SeqExtract import entrezEmail
 28 | from SeqExtract import extractContigs
 29 | from SeqExtract import getProteinAnnotationCSV
 30 | from SeqExtract import getProteinAnnotationFasta
 31 | from SeqExtract import getSeqRecords
 32 | from SeqExtract import isSSProject
 33 | 
 34 | 
 35 | # ===========================================================================================================
 36 | # Functions:
 37 | 
 38 | 
 39 | # 1: Checks if in proper number of arguments are passed gives instructions on proper use.
 40 | def argsCheck(numArgs):
 41 |     if len(sys.argv) < numArgs or len(sys.argv) > numArgs:
 42 |         print("Coding Sequence Downloader")
 43 |         print("By Lee Bergstrand\n")
 44 |         print("Usage: " + sys.argv[0] + " <sequences.txt> [email@mail.com]")
 45 |         print("Examples: " + sys.argv[0] + " mySeqs.txt JBro@YOLO.com\n")
 46 |         print("Please Note:")
 47 |         print("Before using this script to access the NCBI's online resources please read the NCBI's")
 48 |         print("Entrez User Requirements. If the NCBI finds you are abusing their systems, they can")
 49 |         print("and will ban your access! Use the optional email parameter so the NCBI can contact")
 50 |         print("you if there is a problem.")
 51 |         sys.exit(1)  # Aborts program. (exit(1) indicates that an error occurred)
 52 | 
 53 | 
 54 | # ===========================================================================================================
 55 | # Main program code:
 56 | 
 57 | # House keeping...
 58 | argsCheck(3)  # Checks if the number of arguments are correct.
 59 | entrezEmail(sys.argv[2])  # Sets up arguments email require for genbank file extraction.
 60 | 
 61 | # Stores file one for input checking.
 62 | print(">> Opening sequence list...")
 63 | inFile = sys.argv[1]
 64 | 
 65 | # File extension check
 66 | if not inFile.endswith(".txt"):
 67 |     print("[Warning] " + inFile + " may not be a txt file!")
 68 | 
 69 | # Reads sequence file list and stores it as a string object. Safely closes file.try:
 70 | try:
 71 |     with open(inFile, "r") as newFile:
 72 |         sequences = newFile.read()
 73 |         newFile.close()
 74 | except IOError:
 75 |     print("Failed to open " + inFile)
 76 |     sys.exit(1)
 77 | 
 78 | seqList = sequences.splitlines()  # Splits string into a list. Each element is a single line from the string.
 79 | 
 80 | print("You have listed", len(seqList), "sequences. They are:")
 81 | print(sequences + "\n\n")
 82 | 
 83 | seqRecords = getSeqRecords(seqList)  # Gets sequence record objects from NCBI using the sequence list as reference.
 84 | 
 85 | for sequence in seqRecords:
 86 |     outFile = sequence.id + ".faa"
 87 |     outCSV = sequence.id + ".csv"
 88 |     try:
 89 |         # Attempted to create to output file.
 90 |         writeFile = open(outFile, "w")
 91 |         print("Writing " + outFile + " to file...")
 92 |         csvFile = open(outCSV, "w")
 93 |         CSVWriter = csv.writer(csvFile)
 94 |         print("Writing " + outCSV + " to file...")
 95 | 
 96 |         # Checks if the accession leads to a WGSS project.
 97 |         # If accession is a WGSS project...
 98 |         if isSSProject(sequence):
 99 |             contigList = extractContigs(sequence.id)  # Extract all contig accessions.
100 |             contigRecords = getSeqRecords(contigList)  # Extract sequence record object for each contig.
101 |             for contig in contigRecords:
102 |                 fasta = getProteinAnnotationFasta(contig)  # Builds list fasta files.
103 |                 csvRows = getProteinAnnotationCSV(contig)  # Builds list of csv rows.
104 |                 for annotation in fasta:
105 |                     writeFile.write(annotation)
106 |                 for row in csvRows:
107 |                     CSVWriter.writerow(row)
108 |         # If accession is a regular genome...
109 |         else:
110 |             OrganismGenomeLength = len(sequence.seq)  # Gets Genome Length
111 |             fasta = getProteinAnnotationFasta(sequence)  # Builds list fasta files.
112 |             csvRows = getProteinAnnotationCSV(sequence)  # Builds list of csv rows.
113 |             for annotation in fasta:
114 |                 writeFile.write(annotation)
115 |             for row in csvRows:
116 |                 row.append(OrganismGenomeLength)  # Appends genome length to the rest of the csv file.
117 |                 CSVWriter.writerow(row)
118 |         writeFile.close()
119 |         csvFile.close()
120 |     except IOError:
121 |         print("Failed to create " + outFile)
122 |         sys.exit(1)
123 | print("Done!")
124 | 


--------------------------------------------------------------------------------
/SeqExtract.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python 
  2 | # Created by: Lee Bergstrand 
  3 | # Description: A module that contains functions for extraction of genbank records.
  4 | #
  5 | # Requirements: - This module requires the Biopython module: http://biopython.org/wiki/Download
  6 | # ------------------------------------------------------------------------------------------------------------
  7 | # ============================================================================================================
  8 | # Imports and Setup:
  9 | 
 10 | import re
 11 | import sys
 12 | 
 13 | from Bio import Entrez
 14 | from Bio import SeqIO
 15 | 
 16 | AccessionBaseRegex = re.compile("^[a-zA-Z]{4}\d{2}")
 17 | WGSSProjectRegex = re.compile("[a-zA-Z_]{4,7}\d{8,10}")
 18 | 
 19 | 
 20 | # ============================================================================================================
 21 | # Functions:
 22 | 
 23 | 
 24 | # 1: Sets up user email.
 25 | def entrezEmail(email):
 26 |     Entrez.email = email
 27 | 
 28 | 
 29 | # ------------------------------------------------------------------------------------------------------------
 30 | # 2: When passed an array of accessions from NCBI it returns a list of sequence objects matching those accessions.
 31 | def getSeqRecords(seqList, database_type="nucleotide"):
 32 |     try:
 33 |         print("Requesting sequence data from genbank...")
 34 |         handle = Entrez.efetch(db=database_type, id=seqList, rettype="gb",
 35 |                                retmode="genbank")  # Gets records and stores them.
 36 |         print("Starting download...")
 37 |         SeqRecords = list(SeqIO.parse(handle, "genbank"))  # Creates a list of SeqRecord objects from genbank files e.
 38 |         print("Download Complete.")
 39 |         handle.close()  # Closes handle since it is no longer needed.
 40 |     except IOError:
 41 |         print("Failed to connect to NCBI server. ")
 42 |         sys.exit(1)
 43 |     return SeqRecords
 44 | 
 45 | 
 46 | # ------------------------------------------------------------------------------------------------------------
 47 | # 3: When passed a sequence record object returns an array of fasta strings for each annotation.
 48 | def getProteinAnnotationFasta(seqRecord):
 49 |     fasta = []
 50 |     features = seqRecord.features  # Each sequence has a list (called features) that stores seqFeature objects.
 51 |     for feature in features:  # For each feature on the sequence
 52 |         if feature.type == "CDS":  # CDS means coding sequence (These are the only feature we're interested in)
 53 |             featQualifers = feature.qualifiers  # Each feature contains a dictionary called qualifiers which contains
 54 |             # data about the sequence feature (for example the translation)
 55 | 
 56 |             # Gets the required qualifier. Uses featQualifers.get to return the qualifier or a default value if the quantifier
 57 |             # is not found. Calls strip to remove unwanted brackets and ' from qualifier before storing it as a string.
 58 |             protein_id = str(featQualifers.get('protein_id', 'no_protein_id')).strip('\'[]')
 59 |             if protein_id == 'no_protein_id':
 60 |                 continue  # Skips the iteration if protein has no id.
 61 |             gene = str(featQualifers.get('gene', 'no_gene_name')).strip('\'[]')
 62 |             product = str(featQualifers.get('product', 'no_product_name')).strip('\'[]')
 63 |             translated_protein = str(featQualifers.get('translation', 'no_translation')).strip('\'[]')
 64 |             fasta.append((">" + protein_id + " " + gene + "-" + product + "\n" + translated_protein + "\n"))
 65 |     return fasta
 66 | 
 67 | 
 68 | # ------------------------------------------------------------------------------------------------------------
 69 | # 4: Passed a sequence record object returns a list of csv rows. Each row is list containing info for each annotation.
 70 | def getProteinAnnotationCSV(seqRecord):
 71 |     csvRowSet = []  # Master list of all rows.
 72 |     for feature in seqRecord.features:
 73 |         if feature.type == "CDS":
 74 |             csvRow = []  # Created new list for each row. (This is for compatibility with the csv module's row write)
 75 |             CDSLocal = feature.location  # Gets the feature location
 76 |             featQualifers = feature.qualifiers  # Gets sequence quantifiers.
 77 | 
 78 |             # Gets sequence quantifiers.
 79 |             gene = str(featQualifers.get('gene', 'no_gene_name')).strip('\'[]')
 80 |             product = str(featQualifers.get('product', 'no_product_name')).strip('\'[]')
 81 |             proteinID = str(featQualifers.get('protein_id', 'no_protein_id')).strip('\'[]')
 82 |             locus = str(featQualifers.get('locus_tag', 'no_locus_tag')).strip('\'[]')
 83 |             if proteinID == 'no_protein_id':
 84 |                 continue  # Skips the iteration if protein has no id.
 85 |             # Append quantifers and other information to the csv row list.
 86 |             csvRow.append(seqRecord.annotations["organism"])
 87 |             csvRow.append(seqRecord.id)
 88 |             csvRow.append(proteinID)
 89 |             csvRow.append(gene)
 90 |             csvRow.append(str(CDSLocal.start))
 91 |             csvRow.append(str(CDSLocal.end))
 92 |             csvRow.append(str(CDSLocal.strand))
 93 |             csvRow.append(locus)
 94 |             csvRow.append(product)
 95 | 
 96 |             csvRowSet.append(csvRow)  # Appends the csvRow list to the master list of rows.
 97 | 
 98 |     return csvRowSet
 99 | 
100 | 
101 | # ------------------------------------------------------------------------------------------------------------
102 | # 5: Checks if genome is a WGSS project. 
def isSSProject(sequence):
    """Return True when the record's annotations hold a non-empty 'wgs' entry, otherwise False."""
    wgsEntry = sequence.annotations.get('wgs')
    return bool(wgsEntry)
110 | 
111 | 
112 | # ------------------------------------------------------------------------------------------------------------
# 6: When passed a list of WGSS project accessions, returns a list of the accession numbers of the contigs in those projects.
114 | #    This implementation is "quick and dirty" shall be replaced in later versions.
def extractContigs(seqList):
    """Expand WGSS project records into a flat list of contig accession numbers.

    The records for seqList are obtained through getSeqRecords and each one's
    "wgs" annotation is read. A two-element annotation is treated as the first
    and last accession of a contiguous range and every accession in between is
    generated. Any other length contributes only its first element.
    """
    contigAccessions = []

    for project in getSeqRecords(seqList):
        contigRange = project.annotations["wgs"]  # WGSS contig accession range.

        if len(contigRange) != 2:
            contigAccessions.append(contigRange[0])  # Single contig: take it as is.
            continue

        firstAccession, lastAccession = contigRange

        # Accession base (prefix) taken from the first contig accession.
        accessionPrefix = AccessionBaseRegex.findall(firstAccession)[0]

        # Everything after the first six characters is the running contig number.
        firstNumber = int(firstAccession[6:])
        lastNumber = int(lastAccession[6:])

        # Accessions are usually 12 characters long but longer ones exist, so the zero-padded
        # width is whatever remains after the six-character base (6 for the usual length).
        numberWidth = len(firstAccession) - 6
        numberFormat = "{0:0" + str(numberWidth) + "d}"

        for contigNumber in range(firstNumber, lastNumber + 1):
            contigAccessions.append(accessionPrefix + numberFormat.format(contigNumber))

    return contigAccessions
150 | 


--------------------------------------------------------------------------------
/Get16S.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python 
  2 | # Created by: Lee Bergstrand 
  3 | # Descript: A simple program that takes a list of nucleotide genbank accession numbers and 
  4 | # downloads the 16S ribosomal RNA contained within the sequences linked to those
# accessions. It then stores these 16S genes within a multi-sequence fasta file. The
  6 | # script also creates a CSV file containing the genomes for which no 16S annotation was found.
  7 | #
  8 | # Requirements: - This script requires the Biopython module: http://biopython.org/wiki/Download
  9 | #               - This script requires the SeqExtract module (included in the Genbank-Downloaders repository)
 10 | #               - All accessions must link to regular nucleotide genbank records (genome),
 11 | #                 however if a genome is shotgun sequenced you must provide the accession to its Whole
 12 | #                 Genome Shotgun Sequence Project record.
 13 | #               - Before using this script to access the NCBI's online resources please read the NCBI's 
 14 | #                 Entrez User Requirements. If the NCBI finds you are abusing their systems, they can 
 15 | #                 and will ban your access! Use the optional email parameter so the NCBI can contact 
 16 | #                 you if there is a problem.
 17 | #
 18 | # Notes: - This script only extracts the first 16S gene found not all the 16S genes.
 19 | #  
 20 | # Usage: Get16S.py <sequences.txt> [email@mail.com]
 21 | # Example: Get16S.py mySeqs.txt JBro@YOLO.com
 22 | # ----------------------------------------------------------------------------------------
 23 | # ===========================================================================================================
 24 | # Imports:
 25 | 
 26 | import csv
 27 | import sys
 28 | 
 29 | from SeqExtract import entrezEmail
 30 | from SeqExtract import extractContigs
 31 | from SeqExtract import getSeqRecords
 32 | from SeqExtract import isSSProject
 33 | 
 34 | 
 35 | # ===========================================================================================================
 36 | # Functions:
 37 | 
 38 | 
# 1: Checks that the proper number of arguments was passed. If not, gives instructions on proper use and exits.
 40 | def argsCheck(numArgs):
 41 |     if len(sys.argv) < numArgs or len(sys.argv) > numArgs:
 42 |         print("Coding Sequence Downloader")
 43 |         print("By Lee Bergstrand\n")
 44 |         print("Usage: " + sys.argv[0] + " <sequences.txt> [email@mail.com]")
 45 |         print("Examples: " + sys.argv[0] + " mySeqs.txt JBro@YOLO.com\n")
 46 |         print("Please Note:")
 47 |         print("Before using this script to access the NCBI's online resources please read the NCBI's")
 48 |         print("Entrez User Requirements. If the NCBI finds you are abusing their systems, they can")
 49 |         print("and will ban your access! Use the optional email parameter so the NCBI can contact")
 50 |         print("you if there is a problem.")
 51 |         sys.exit(1)  # Aborts program. (exit(1) indicates that an error occurred)
 52 | 
 53 | 
 54 | # ---------------------------------------------------------------------------------------------------------
 55 | # 2: Gets reverse complement of DNA string.
 56 | def reverseCompliment(sequence):
 57 |     sequence.upper()
 58 | 
 59 |     basecomplement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
 60 |     letters = list(sequence)
 61 |     letters = [basecomplement[base] for base in letters]
 62 |     sequence = "".join(letters)
 63 | 
 64 |     sequence = sequence[::-1]  # Reverses sequence using pythons string slice syntax.
 65 |     return sequence
 66 | 
 67 | 
 68 | # ---------------------------------------------------------------------------------------------------------
 69 | # 3: Gets 16S DNA as a fasta.
 70 | def extract16sFasta(organismID, feature, record):
 71 |     start = feature.location.nofuzzy_start
 72 |     end = feature.location.nofuzzy_end
 73 |     strand = feature.location.strand
 74 |     sequence = str(record.seq[start:end])  # Extracts subsequence from the genome according to location of the feature.
 75 | 
 76 |     if strand == -1:  # Converts subsequence to reverse complement if on negative strand.
 77 |         sequence = reverseCompliment(sequence)
 78 | 
 79 |     fasta = ">%s\n%s" % (organismID, sequence)
 80 |     return fasta
 81 | 
 82 | 
 83 | # ---------------------------------------------------------------------------------------------------------
 84 | # 4: Gets a list of 16s FASTAs from a genome.
 85 | def get16sFasta(sequenceID, record):
 86 |     FASTAS = []
 87 |     organismID = sequenceID
 88 |     features = record.features
 89 |     for feature in features:
 90 |         if feature.type == "rRNA":
 91 |             if "16S" in feature.qualifiers["product"][0]:
 92 |                 fasta = extract16sFasta(organismID, feature, record)
 93 |                 if 1000 < len(fasta) < 2000:  # Removes partial sequences and really large sequence do to genbank.
 94 |                     FASTAS.append(fasta)
 95 |     return FASTAS
 96 | 
 97 | 
 98 | # ===========================================================================================================
 99 | # Main program code:
100 | 
# House keeping...
argsCheck(3)  # Exits with usage instructions unless exactly two arguments follow the script name.
entrezEmail(sys.argv[2])  # Passes the email argument to SeqExtract for the genbank requests.

# Stores file one for input checking.
print(">> Opening sequence list...")
inFile = sys.argv[1]

# File extension check (warning only, the file is still read).
if not inFile.endswith(".txt"):
    print("[Warning] " + inFile + " may not be a txt file!")

# Reads sequence file list and stores it as a string object. The with statement closes the file.
try:
    with open(inFile, "r") as newFile:
        sequences = newFile.read()
except IOError:
    print("Failed to open " + inFile)
    sys.exit(1)

# Splits the string into one accession per line. Blank lines and surrounding whitespace are
# dropped so that no empty accession is requested.
seqList = [line.strip() for line in sequences.splitlines() if line.strip()]

print("You have listed " + str(len(seqList)) + " sequences. They are:")
print(sequences + "\n")

# Acquires list of sequence record objects from NCBI using the sequence list as reference.
seqRecords = getSeqRecords(seqList)

No16sGenomes = []  # [accession, organism] rows for genomes in which no 16S gene was found.
SixTeens = []  # One fasta entry (the first 16S gene found) per genome.
for sequence in seqRecords:
    sequenceID = sequence.id
    No16s = True
    if "plasmid" in sequence.description.lower():  # If sequence is from a plasmid skip the iteration.
        continue
    if isSSProject(sequence):  # If accession is a WGSS project...
        # NOTE(review): extractContigs receives a single ID string here, while the main list above
        # is passed to getSeqRecords as a list - confirm getSeqRecords accepts both forms.
        contigList = extractContigs(sequence.id)  # Extract all contig accessions.
        contigRecords = getSeqRecords(contigList)  # Extract sequence record object for each contig.
        for contig in contigRecords:
            fasta = get16sFasta(sequenceID, contig)  # Builds list of fasta entries.
            if fasta:  # If 16S is found.
                No16s = False
                SixTeens.append(fasta[0])
                break  # If 16S is found in one contig break out and skip all the other contigs.
    else:  # If accession is a regular genome...
        fasta = get16sFasta(sequenceID, sequence)  # Builds list of fasta entries.
        if fasta:
            No16s = False
            SixTeens.append(fasta[0])
    if No16s:  # If no 16S is found add genome to the no 16S found list.
        No16sGenomeInfo = [sequenceID, sequence.annotations["organism"]]
        No16sGenomes.append(No16sGenomeInfo)
OutSixTeens = "\n".join(SixTeens)

outFile = "SixTeenSSFromGenbank.fna"
outCSVFile = "NoSixTeenGenomes.csv"

# Writes the fasta output. The with statement closes the file even if the write fails.
try:
    print("Writing " + outFile + " to file...")
    with open(outFile, "w") as writeFile:
        writeFile.write(OutSixTeens)
except IOError:
    print("Failed to create " + outFile)
    sys.exit(1)

# Writes the CSV of genomes without a 16S annotation (only when there are any).
if No16sGenomes:
    try:
        print("Writing " + outCSVFile + " to file...")
        with open(outCSVFile, "w") as csvFile:
            CSVWriter = csv.writer(csvFile)
            for genome in No16sGenomes:
                CSVWriter.writerow(genome)
    except IOError:
        print("Failed to create " + outCSVFile)
        sys.exit(1)

print("Done!")
180 | 


--------------------------------------------------------------------------------