├── .gitignore ├── LICENSE ├── README.md ├── dist ├── GetTransTool-0.0.3-py3-none-any.whl └── GetTransTool-0.0.3.tar.gz ├── pyproject.toml ├── setup.cfg └── src └── GetTransTool ├── GetCDSLongestFromGTF.py ├── GetCDSLongestFromGencode.py ├── GetLongestTransFromGTF.py └── GetLongestTransFromGencode.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2018 The Python Packaging Authority 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GetTransTool Package 2 | 3 | There are four types of methods to extract **longest transcript** or **longest CDS regeion with longest transcript** from **transcripts fasta** file or **GTF** file. 4 | 5 | --- 6 | 7 | - 1.Extract longest transcript from gencode transcripts fasta file. 8 | 9 | - 2.Extract longest transcript from gtf format annotation file based on gencode/ensembl/ucsc database. 10 | 11 | - 3.Extract longest CDS regeion with longest transcript from gencode database transcripts fasta file. 12 | 13 | - 4.Extract longest CDS regeion with longest transcript from gtf format annotation file based on ensembl/ucsc database. 14 | 15 | ## Install 16 | 17 | ```shell 18 | $ pip install GetTransTool 19 | ``` 20 | 21 | ## Usage 22 | 23 | ## 1. get longest transcript from gencode transcripts fasta file: 24 | 25 | ### help infomation: 26 | 27 | ```shell 28 | $ GetLongestTransFromGencode -h 29 | usage: GetLongestTransFromGencode --file gencode.vM28.transcripts.fa.gz --outfile longest_trans.fa 30 | 31 | Get longest transcripts from gencode transcripts fasta file. 32 | 33 | optional arguments: 34 | -h, --help show this help message and exit 35 | -v, --version show program's version number and exit 36 | -f transfile, --file transfile 37 | input your transcripts file with ".gz" format. (gencode.vM28.transcripts.fa.gz) 38 | -o longestfile, --outfile longestfile 39 | output your longest transcript file. 
(longest_trans.fa) 40 | 41 | Thank your for your support, if you have any questions or suggestions please contact me: 3219030654@stu.cpu.edu.cn. 42 | ``` 43 | 44 | ### usage: 45 | 46 | ```shell 47 | $ GetLongestTransFromGencode --file gencode.vM28.transcripts.fa.gz --outfile longest_trans_gencode.fa 48 | Your job is running, please wait... 49 | Your job is done! 50 | Running with 32.33 seconds! 51 | ``` 52 | 53 | there will be three files produced including **name_changed.fa**, **longest_transcripts_info.csv**, **longest_trans_gencode.fa**. 54 | 55 | > name_changed.fa: 56 | 57 | ``` 58 | >4933401J01Rik_ENSMUSG00000102693.2_ENSMUST00000193812.2_1070 59 | AAGGAAAGAGGATAACACTTGAAATGTAAATAAAGAAAATACCTAATAAAAATAAATAAA 60 | AACATGCTTTCAAAGGAAATAAAAAGTTGGATTCAAAAATTTAACTTTTGCTCATTTGGT 61 | ATAATCAAGGAAAAGACCTTTGCATATAAAATATATTTTGAATAAAATTCAGTGGAAGAA 62 | ... 63 | ``` 64 | 65 | > longest_transcripts_info.csv: 66 | 67 | this is the longest transcripts exon length information. 68 | 69 | ``` 70 | fullname,gene_name,translength 71 | snoZ196_ENSMUSG00002074855.1_ENSMUST00020182568.1_35,snoZ196,35 72 | snoZ159_ENSMUSG00002075734.1_ENSMUST00020182611.1_87,snoZ159,87 73 | n-R5s93_ENSMUSG00000119639.1_ENSMUST00000240071.1_119,n-R5s93,119 74 | ... 75 | ``` 76 | 77 | > longest_trans_gencode.fa: 78 | 79 | this is the filtered longest transcript fasta file. 80 | 81 | ``` 82 | >4933401J01Rik_ENSMUSG00000102693.2_ENSMUST00000193812.2_1070 83 | AAGGAAAGAGGATAACACTTGAAATGTAAATAAAGAAAATACCTAATAAAAATAAATAAA 84 | AACATGCTTTCAAAGGAAATAAAAAGTTGGATTCAAAAATTTAACTTTTGCTCATTTGGT 85 | ATAATCAAGGAAAAGACCTTTGCATATAAAATATATTTTGAATAAAATTCAGTGGAAGAA 86 | ... 87 | ``` 88 | 89 | --- 90 | 91 | ## 2. 
Extract longest transcript from gtf format annotation file based on gencode/ensembl/ucsc database: 92 | 93 | ### help infomation: 94 | 95 | ```shell 96 | $ GetLongestTransFromGTF -h 97 | usage: GetLongestTransFromGTF --database ensembl --gtffile Homo_sapiens.GRCh38.101.gtf.gz --genome Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz --outfile longest_trans.fa 98 | 99 | Extract longest transcript from gtf format annotation file based on gencode/ensembl/ucsc database. 100 | 101 | optional arguments: 102 | -h, --help show this help message and exit 103 | -v, --version show program's version number and exit 104 | -d databse, --database databse 105 | which annotation database you choose. (default="ensembl", ucsc/ensembl/gencode) 106 | -g gtffile, --gtffile gtffile 107 | input your GTF file with ".gz" format. 108 | -fa genome, --genome genome 109 | your genome fasta file matched with your GTF file with ".gz" format. (Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz) 110 | -o longestfile, --outfile longestfile 111 | output your longest transcript file. (longest_trans.fa) 112 | 113 | Thank your for your support, if you have any questions or suggestions please contact me: 3219030654@stu.cpu.edu.cn. 114 | ``` 115 | 116 | ### usage: 117 | 118 | ```shell 119 | $ GetLongestTransFromGTF --database ensembl --gtffile Homo_sapiens.GRCh38.103.gtf.gz --genome Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz --outfile longest_trans_ensembl.fa 120 | Your job is running, please wait... 121 | Your job is done! 122 | Running with 159.51 seconds! 123 | ``` 124 | 125 | there will be three files produced including **longest_transcripts_info.csv**, **longest_trans.gtf**, **longest_trans_ensembl.fa**. 
126 | 127 | > longest_transcripts_info.csv: 128 | 129 | ``` 130 | ,transcript_length,gene_name 131 | snoZ196_ENSG00000281780_ENST00000625269_snoRNA,89,snoZ196 132 | hsa-mir-423_ENSG00000266919_ENST00000586878_lncRNA,94,hsa-mir-423 133 | hsa-mir-1253_ENSG00000272920_ENST00000609567_lncRNA,105,hsa-mir-1253 134 | ... 135 | ``` 136 | 137 | > longest_trans.gtf: 138 | 139 | this is the gtf information for the longest transcripts. 140 | 141 | ``` 142 | 1 havana gene 11869 14409 . + . gene_id "ENSG00000223972"; gene_version "5"; gene_name "DDX11L1"; gene_source "havana"; gene_biotype "transcribed_unprocessed_pseudogene"; 143 | 1 havana transcript 11869 14409 . + . gene_id "ENSG00000223972"; gene_version "5"; transcript_id "ENST00000456328"; transcript_version "2"; gene_name "DDX11L1"; gene_source "havana"; gene_biotype "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; tag "basic"; transcript_support_level "1"; 144 | 1 havana exon 11869 12227 . + . gene_id "ENSG00000223972"; gene_version "5"; transcript_id "ENST00000456328"; transcript_version "2"; exon_number "1"; gene_name "DDX11L1"; gene_source "havana"; gene_biotype "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; exon_id "ENSE00002234944"; exon_version "1"; tag "basic"; transcript_support_level "1"; 145 | 1 havana exon 12613 12721 . + . gene_id "ENSG00000223972"; gene_version "5"; transcript_id "ENST00000456328"; transcript_version "2"; exon_number "2"; gene_name "DDX11L1"; gene_source "havana"; gene_biotype "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; exon_id "ENSE00003582793"; exon_version "1"; tag "basic"; transcript_support_level "1"; 146 | 1 havana exon 13221 14409 . + . 
gene_id "ENSG00000223972"; gene_version "5"; transcript_id "ENST00000456328"; transcript_version "2"; exon_number "3"; gene_name "DDX11L1"; gene_source "havana"; gene_biotype "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; exon_id "ENSE00002312635"; exon_version "1"; tag "basic"; transcript_support_level "1"; 147 | 1 havana gene 14404 29570 . - . gene_id "ENSG00000227232"; gene_version "5"; gene_name "WASH7P"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; 148 | ``` 149 | 150 | > longest_trans_ensembl.fa: 151 | 152 | ``` 153 | >DDX11L1_ENSG00000223972_ENST00000456328_transcribed_unprocessed_pseudogene 154 | GTTAACTTGCCGTCAGCCTTTTCTTTGACCTCTTCTTTCTGTTCATGTGTATTTGCTGTC 155 | TCTTAGCCCAGACTTCCCGTGTCCTTTCCACCGGGCCTTTGAGAGGTCACAGGGTCTTGA 156 | TGCTGTGGTCTTCATCTGCAGGTGTCTGACTTCCAGCAACTGCTGGCCTGTGCCAGGGTG 157 | ... 158 | ``` 159 | 160 | for ucsc: 161 | 162 | ``` 163 | $ GetLongestTransFromGTF --database ucsc --gtffile hg19.ncbiRefSeq.gtf.gz --genome hg19.fa.gz --outfile longest_trans_ucsc.fa 164 | ``` 165 | 166 | --- 167 | 168 | ## 3. Extract longest CDS regeion with longest transcript from gencode database transcripts fasta file. 169 | 170 | ### help infomation: 171 | 172 | ```shell 173 | $ GetCDSLongestFromGencode -h 174 | usage: GetCDSLongestFromGencode --file gencode.vM28.pc_transcripts.fa.gz --outfile longest_cds_trans.fa 175 | 176 | Extract longest CDS regeion with longest transcript from gencode database transcripts fasta file. 177 | 178 | optional arguments: 179 | -h, --help show this help message and exit 180 | -v, --version show program's version number and exit 181 | -f transfile, --file transfile 182 | input your protein-coding transcripts file with ".gz" format. (gencode.vM28.pc_transcripts.fa.gz) 183 | -o longestfile, --outfile longestfile 184 | output your longest transcript file. 
(longest_cds_trans.fa) 185 | 186 | Thank your for your support, if you have any questions or suggestions please contact me: 3219030654@stu.cpu.edu.cn. 187 | ``` 188 | 189 | ### usage: 190 | 191 | ```shell 192 | $ GetCDSLongestFromGencode --file gencode.vM28.pc_transcripts.fa.gz --outfile longest_cds_trans_gencode.fa 193 | Your job is running, please wait... 194 | Your job is done! 195 | Running with 17.67 seconds! 196 | ``` 197 | 198 | there will be four files produced including **name_changed.fa**, **All_transcripts_cds_info.csv**, **longest_cds_transcripts_info.csv**, **longest_cds_trans_gencode.fa**. 199 | 200 | > name_changed.fa: 201 | 202 | ``` 203 | >Xkr4_ENSMUSG00000051951.6_ENSMUST00000070533.5_151_2094_3634 204 | GCGGCGGCGGGCGAGCGGGCGCTGGAGTAGGAGCTGGGGAGCGGCGCGGCCGGGGAAGGA 205 | AGCCAGGGCGAGGCGAGGAGGTGGCGGGAGGAGGAGACAGCAGGGACAGGTGTCAGATAA 206 | AGGAGTGCTCTCCTCCGCTGCCGAGGCATCATGGCCGCTAAGTCAGACGGGAGGCTGAAG 207 | ... 208 | ``` 209 | 210 | > All_transcripts_cds_info.csv: 211 | 212 | this is the all transcripts cds and exon length information. 213 | 214 | ``` 215 | fullname,gene_name,translength,cdslength 216 | >mt-Nd6_ENSMUSG00000064368.1_ENSMUST00000082419.1_1_519_519,>mt-Nd6,519,519 217 | >mt-Nd5_ENSMUSG00000064367.1_ENSMUST00000082418.1_1_1824_1824,>mt-Nd5,1824,1824 218 | >mt-Nd4l_ENSMUSG00000065947.1_ENSMUST00000084013.1_1_297_297,>mt-Nd4l,297,297 219 | ... 220 | ``` 221 | 222 | > longest_cds_transcripts_info.csv: 223 | 224 | ``` 225 | fullname,gene_name,translength,cdslength 226 | >mt-Nd6_ENSMUSG00000064368.1_ENSMUST00000082419.1_1_519_519,>mt-Nd6,519,519 227 | >mt-Nd5_ENSMUSG00000064367.1_ENSMUST00000082418.1_1_1824_1824,>mt-Nd5,1824,1824 228 | >mt-Nd4l_ENSMUSG00000065947.1_ENSMUST00000084013.1_1_297_297,>mt-Nd4l,297,297 229 | ... 
230 | ``` 231 | 232 | > longest_cds_trans_gencode.fa: 233 | 234 | ``` 235 | >Xkr4_ENSMUSG00000051951.6_ENSMUST00000070533.5_151_2094_3634 236 | GCGGCGGCGGGCGAGCGGGCGCTGGAGTAGGAGCTGGGGAGCGGCGCGGCCGGGGAAGGA 237 | AGCCAGGGCGAGGCGAGGAGGTGGCGGGAGGAGGAGACAGCAGGGACAGGTGTCAGATAA 238 | AGGAGTGCTCTCCTCCGCTGCCGAGGCATCATGGCCGCTAAGTCAGACGGGAGGCTGAAG 239 | ... 240 | ``` 241 | 242 | --- 243 | 244 | ## 4. Extract longest CDS region with longest transcript from gtf format annotation file based on ensembl/ucsc database. 245 | 246 | ### help information: 247 | 248 | ```shell 249 | $ GetCDSLongestFromGTF -h 250 | usage: GetCDSLongestFromGTF --database ensembl --gtffile Homo_sapiens.GRCh38.101.gtf.gz --genome Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz --outfile longest_cds_trans.fa 251 | 252 | Extract longest CDS regeion with longest transcript from gtf format annotation file based on ensembl/ucsc database. 253 | 254 | optional arguments: 255 | -h, --help show this help message and exit 256 | -v, --version show program's version number and exit 257 | -d databse, --database databse 258 | which annotation database you choose. (default="ensembl", ucsc/ensembl) 259 | -g gtffile, --gtffile gtffile 260 | input your GTF file with ".gz" format. 261 | -fa genome, --genome genome 262 | your genome fasta file matched with your GTF file with ".gz" format. (Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz) 263 | -o cdslongestfile, --outfile cdslongestfile 264 | output your longest transcript file. (longest_cds_trans.fa) 265 | 266 | Thank your for your support, if you have any questions or suggestions please contact me: 3219030654@stu.cpu.edu.cn. 267 | ``` 268 | 269 | ### usage: 270 | 271 | ```shell 272 | $ GetCDSLongestFromGTF --database ensembl --gtffile Homo_sapiens.GRCh38.103.gtf.gz --genome Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz --outfile longest_cds_trans_ensembl.fa 273 | Your job is running, please wait... 274 | Your job is done! 275 | Running with 152.38 seconds!
276 | ``` 277 | 278 | there will be four files produced including **CDS_longest_trans.gtf**, **All_transcripts_cds_info.csv**, **longest_cds_transcripts_info.csv**, **longest_cds_trans_ensembl.fa**. 279 | 280 | > CDS_longest_trans.gtf: 281 | 282 | ``` 283 | 1 ensembl_havana gene 65419 71585 . + . gene_id "ENSG00000186092"; gene_version "6"; gene_name "OR4F5"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; 284 | 1 havana transcript 65419 71585 . + . gene_id "ENSG00000186092"; gene_version "6"; transcript_id "ENST00000641515"; transcript_version "2"; gene_name "OR4F5"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F5-202"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; 285 | 1 havana exon 65419 65433 . + . gene_id "ENSG00000186092"; gene_version "6"; transcript_id "ENST00000641515"; transcript_version "2"; exon_number "1"; gene_name "OR4F5"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F5-202"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00003812156"; exon_version "1"; tag "basic"; 286 | 1 havana exon 65520 65573 . + . gene_id "ENSG00000186092"; gene_version "6"; transcript_id "ENST00000641515"; transcript_version "2"; exon_number "2"; gene_name "OR4F5"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F5-202"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00003813641"; exon_version "1"; tag "basic"; 287 | 1 havana CDS 65565 65573 . + 0 gene_id "ENSG00000186092"; gene_version "6"; transcript_id "ENST00000641515"; transcript_version "2"; exon_number "2"; gene_name "OR4F5"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F5-202"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000493376"; protein_version "2"; tag "basic"; 288 | 1 havana start_codon 65565 65567 . 
+ 0 gene_id "ENSG00000186092"; gene_version "6"; transcript_id "ENST00000641515"; transcript_version "2"; exon_number "2"; gene_name "OR4F5"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F5-202"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; 289 | 1 havana exon 69037 71585 . + . gene_id "ENSG00000186092"; gene_version "6"; transcript_id "ENST00000641515"; transcript_version "2"; exon_number "3"; gene_name "OR4F5"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F5-202"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00003813949"; exon_version "1"; tag "basic"; 290 | 1 havana CDS 69037 70005 . + 0 gene_id "ENSG00000186092"; gene_version "6"; transcript_id "ENST00000641515"; transcript_version "2"; exon_number "3"; gene_name "OR4F5"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F5-202"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000493376"; protein_version "2"; tag "basic"; 291 | 1 havana stop_codon 70006 70008 . + 0 gene_id "ENSG00000186092"; gene_version "6"; transcript_id "ENST00000641515"; transcript_version "2"; exon_number "3"; gene_name "OR4F5"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F5-202"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; 292 | 1 havana five_prime_utr 65419 65433 . + . gene_id "ENSG00000186092"; gene_version "6"; transcript_id "ENST00000641515"; transcript_version "2"; gene_name "OR4F5"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F5-202"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; 293 | 1 havana five_prime_utr 65520 65564 . + . 
gene_id "ENSG00000186092"; gene_version "6"; transcript_id "ENST00000641515"; transcript_version "2"; gene_name "OR4F5"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F5-202"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; 294 | 1 havana three_prime_utr 70009 71585 . + . gene_id "ENSG00000186092"; gene_version "6"; transcript_id "ENST00000641515"; transcript_version "2"; gene_name "OR4F5"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F5-202"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; 295 | 1 ensembl_havana gene 450740 451678 . - . gene_id "ENSG00000284733"; gene_version "2"; gene_name "OR4F29"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; 296 | ... 297 | ``` 298 | 299 | > All_transcripts_cds_info.csv: 300 | 301 | this is the all transcripts cds and exon length information. 302 | 303 | ``` 304 | cdslength,ID,translength,utr5length,gene_name 305 | 2709,ZZZ3_ENSG00000036549_ENST00000370801,6412,476,ZZZ3 306 | 1227,ZZZ3_ENSG00000036549_ENST00000370798,2468,486,ZZZ3 307 | 173,ZZZ3_ENSG00000036549_ENST00000433749,603,430,ZZZ3 308 | ... 309 | ``` 310 | 311 | > longest_cds_transcripts_info.csv: 312 | 313 | ``` 314 | cdslength,ID,translength,utr5length,gene_name 315 | 2709,ZZZ3_ENSG00000036549_ENST00000370801,6412,476,ZZZ3 316 | 8883,ZZEF1_ENSG00000074755_ENST00000381638,11466,135,ZZEF1 317 | 1716,ZYX_ENSG00000159840_ENST00000322764,2228,80,ZYX 318 | ... 319 | ``` 320 | 321 | > longest_cds_trans_gencode.fa: 322 | 323 | ``` 324 | >OR4F5_ENSG00000186092_ENST00000641515_61_1038_2618 325 | CCCAGATCTCTTCAGTTTTTATGCCTCATTCTGTGAAAATTGCTGTAGTCTCTTCCAGTT 326 | ATGAAGAAGGTAACTGCAGAGGCTATTTCCTGGAATGAATCAACGAGTGAAACGAATAAC 327 | TCTATGGTGACTGAATTCATTTTTCTGGGTCTCTCTGATTCTCAGGAACTCCAGACCTTC 328 | ... 
329 | ``` 330 | 331 | for ucsc: 332 | 333 | ```shell 334 | $ GetCDSLongestFromGTF --database ucsc --gtffile hg19.ncbiRefSeq.gtf.gz --genome hg19.fa.gz --outfile longest_cds_trans_ucsc.fa 335 | ``` 336 | 337 | --- 338 | 339 | ## END 340 | 341 | > Thank you for your support, if you have any questions or suggestions please contact me: 3219030654@stu.cpu.edu.cn. -------------------------------------------------------------------------------- /dist/GetTransTool-0.0.3-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junjunlab/GetTransTool/a7ce0db99821457e258f2b754c2ca4fef0e11dfa/dist/GetTransTool-0.0.3-py3-none-any.whl -------------------------------------------------------------------------------- /dist/GetTransTool-0.0.3.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junjunlab/GetTransTool/a7ce0db99821457e258f2b754c2ca4fef0e11dfa/dist/GetTransTool-0.0.3.tar.gz -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools>=42", 4 | "wheel" 5 | ] 6 | build-backend = "setuptools.build_meta" -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = GetTransTool 3 | version = 0.0.3 4 | author = laojunjun 5 | author_email = 3219030654@stu.cpu.edu.cn 6 | description = Extract longest transcript or longest CDS transcript from GTF annotation file or gencode transcripts fasta file.
7 | long_description = file: README.md 8 | long_description_content_type = text/markdown 9 | url = https://github.com/junjunlab/GetTransTool 10 | project_urls = 11 | Bug Tracker = https://github.com/junjunlab/GetTransTool/issues 12 | classifiers = 13 | Programming Language :: Python :: 3 14 | License :: OSI Approved :: MIT License 15 | Operating System :: OS Independent 16 | 17 | [options] 18 | package_dir = 19 | = src 20 | packages = find: 21 | python_requires = >=3.6 22 | 23 | [options.packages.find] 24 | where = src 25 | 26 | [options.entry_points] 27 | console_scripts = 28 | GetLongestTransFromGencode = GetTransTool.GetLongestTransFromGencode:main 29 | GetLongestTransFromGTF = GetTransTool.GetLongestTransFromGTF:main 30 | GetCDSLongestFromGencode = GetTransTool.GetCDSLongestFromGencode:main 31 | GetCDSLongestFromGTF = GetTransTool.GetCDSLongestFromGTF:main -------------------------------------------------------------------------------- /src/GetTransTool/GetCDSLongestFromGTF.py: -------------------------------------------------------------------------------- 1 | def main(): 2 | """ 3 | Extract longest CDS regeion with longest transcript from gtf format annotation file based on ensembl/ucsc database. 
4 | """ 5 | 6 | # 导入模块 7 | import pandas as pd 8 | import gzip 9 | import time 10 | import warnings 11 | import argparse 12 | from pyfaidx import Fasta 13 | 14 | warnings.filterwarnings('ignore') 15 | 16 | parser = argparse.ArgumentParser(usage="GetCDSLongestFromGTF --database ensembl --gtffile Homo_sapiens.GRCh38.101.gtf.gz --genome Homo_sapiens.GRCh38.dna.primary_assembly.fa --outfile longest_cds_trans.fa", 17 | description="Extract longest CDS regeion with longest transcript from gtf format annotation file based on ensembl/ucsc database.", 18 | epilog="Thank your for your support, if you have any questions or suggestions please contact me: 3219030654@stu.cpu.edu.cn.") 19 | # version 20 | parser.add_argument('-v','--version', action='version', version='%(prog)s 0.0.7') 21 | # 读取注释类型文件 22 | parser.add_argument('-d','--database',type=str,action="store",dest="database",metavar="databse",choices=['ucsc','ensembl'],default="ensembl", 23 | help='which annotation database you choose. (default="ensembl", ucsc/ensembl)') 24 | # 读取gtf文件 25 | parser.add_argument('-g','--gtffile', type=str,action="store",dest="gtffile",metavar="gtffile", 26 | help='input your GTF file with ".gz" format.') 27 | # 读取基因组fasta文件 28 | parser.add_argument('-fa','--genome',type=str,action="store",dest="genome",metavar="genome", 29 | help='your genome fasta file matched with your GTF file with ".fa/.fatsa" format. (Homo_sapiens.GRCh38.dna.primary_assembly.fa)') 30 | # 导出文件名称 31 | parser.add_argument('-o','--outfile', type=str,action="store",dest="cdslongestfile",metavar="cdslongestfile", 32 | help='output your longest transcript file. 
(longest_cds_trans.fa)') 33 | # 解析参数 34 | args = parser.parse_args() 35 | 36 | # 获取参数 37 | db = args.database 38 | gtffile = args.gtffile 39 | genomefile = args.genome 40 | outfile = args.cdslongestfile 41 | 42 | # main fuction 43 | print("Your job is running, please wait...") 44 | ###################################################################### 45 | job_start = time.time() 46 | ################################################################################################ 47 | if db == 'ensembl': 48 | # 打开 gtf 文件 49 | with gzip.open(gtffile,'rt') as gtf: 50 | # 信息保存在字典里 51 | trans_len = {} 52 | utr5_len = {} 53 | cds_len = {} 54 | for line in gtf: 55 | # 跳过注释行 56 | if line.startswith('#'): 57 | continue 58 | # 分割 59 | fields = line.split() 60 | # 类型 61 | type = fields[2] 62 | if len(fields) > 24: 63 | biotype = fields[23].replace('"','').replace(';','') 64 | utr_biotype = fields[21].replace('"','').replace(';','') 65 | if biotype == 'protein_coding' and type == 'exon': 66 | # 名称 67 | gene_name = fields[19].replace('"','').replace(';','') 68 | gene_id = fields[9].replace('"','').replace(';','') 69 | trans_id = fields[13].replace('"','').replace(';','') 70 | # 连接名称 71 | key = '|'.join([gene_name,gene_id,trans_id]) 72 | # 计算多个外显子长度 73 | length = int(fields[4]) - int(fields[3]) + 1 74 | # 累计求和 75 | trans_len.setdefault(key,0) 76 | trans_len[key] += length 77 | elif biotype == 'protein_coding' and type == 'CDS': 78 | # 名称 79 | gene_name = fields[19].replace('"','').replace(';','') 80 | gene_id = fields[9].replace('"','').replace(';','') 81 | trans_id = fields[13].replace('"','').replace(';','') 82 | # 连接名称 83 | key = '|'.join([gene_name,gene_id,trans_id]) 84 | # 计算多个CDS长度 85 | length = int(fields[4]) - int(fields[3]) + 1 86 | # 累计求和 87 | cds_len.setdefault(key,0) 88 | cds_len[key] += length 89 | elif utr_biotype == 'protein_coding': 90 | # 名称 91 | gene_name = fields[17].replace('"','').replace(';','') 92 | gene_id = fields[9].replace('"','').replace(';','') 93 | 
trans_id = fields[13].replace('"','').replace(';','') 94 | # 连接名称 95 | key = '|'.join([gene_name,gene_id,trans_id]) 96 | if type == 'five_prime_utr': 97 | # 计算多个5'UTR长度 98 | length = int(fields[4]) - int(fields[3]) + 1 99 | # 累计求和 100 | utr5_len.setdefault(key,0) 101 | utr5_len[key] += length 102 | else: 103 | # 若无则为 0 104 | utr5_len.setdefault(key,0) 105 | else: 106 | pass 107 | 108 | # transorm into dataframe and merge by id 109 | df_tran = pd.DataFrame.from_dict(trans_len,orient='index',columns=['translength']) 110 | df_tran['ID'] = df_tran.index 111 | 112 | df_cds = pd.DataFrame.from_dict(cds_len,orient='index',columns=['cdslength']) 113 | df_cds['ID'] = df_cds.index 114 | 115 | df_5utr = pd.DataFrame.from_dict(utr5_len,orient='index',columns=['utr5length']) 116 | df_5utr['ID'] = df_5utr.index 117 | 118 | # 按id合并表格 119 | data_info = df_cds.merge(df_tran.merge(df_5utr,on='ID'),on='ID') 120 | 121 | # 添加基因名列 122 | data_info['gene_name'] = [i.split(sep='|')[0] for i in data_info['ID']] 123 | 124 | # 按gen_name cdslength translength 降序排序 125 | data_infonew = data_info.sort_values(by = ['gene_name','cdslength','translength'],ascending = False,inplace=False) 126 | 127 | # order columns 128 | data_infonew = data_infonew[['ID','gene_name','translength','utr5length','cdslength']] 129 | 130 | # 保存 131 | data_infonew.to_csv(r'All_transcripts_cds_info.csv', index=False) 132 | 133 | ############################ 134 | # 筛选最长转录本id 135 | longest_id = list(data_infonew.drop_duplicates(subset=['gene_name'],keep='first')['ID']) 136 | 137 | # 筛选最长转录本表格 138 | longest_data = data_infonew.loc[data_infonew.ID.isin(longest_id)] 139 | 140 | # 保存 141 | longest_data.to_csv(r'longest_cds_transcripts_info.csv', index=False) 142 | 143 | # 给 ID 添加 CDS 位置信息和转录本长度信息 144 | longest_data['ID'] = longest_data.ID + '|' + (longest_data.utr5length + 1).map(str) + '|' + \ 145 | (longest_data.utr5length + longest_data.cdslength).map(str) + '|' + \ 146 | (longest_data.translength).map(str) 147 | 148 | # 
order columns 149 | longest_data = longest_data[['ID','gene_name','translength','utr5length','cdslength']] 150 | 151 | # 储存最长转录本id 152 | transid = {line.split(sep='|')[2]:line for line in list(longest_data.ID)} 153 | 154 | infolist = [] 155 | with gzip.open(gtffile,'rt') as gtf: 156 | for line in gtf: 157 | # skip 158 | if line.startswith('#'): 159 | continue 160 | # split 161 | fields = line.split() 162 | # feature type 163 | type = fields[2] 164 | if type == 'exon': 165 | # pos 166 | chr = fields[0] 167 | start = fields[3] 168 | end = fields[4] 169 | strand = fields[6] 170 | # name 171 | gene_name = fields[19].replace('"','').replace(';','') 172 | gene_id = fields[9].replace('"','').replace(';','') 173 | trans_id = fields[13].replace('"','').replace(';','') 174 | if trans_id in transid: 175 | infolist.append([chr,start,end,strand,type,gene_name,gene_id,trans_id,transid[trans_id]]) 176 | else: 177 | pass 178 | else: 179 | pass 180 | 181 | # to dataframe 182 | dfinfo = pd.DataFrame(infolist,columns=['chr','start','end','strand','type','gene_name','gene_id','trans_id','id']) 183 | dfinfo_1_strand = dfinfo[dfinfo['strand'] == '+'] 184 | 185 | # descrese coord by - strand gene 186 | dfinfo_2_strand = dfinfo[dfinfo['strand'] == '-'] 187 | dfinfo_2_strand = dfinfo_2_strand.sort_values(by = ['trans_id','start','end'],ascending = False) 188 | 189 | # merge 190 | df_fianl = pd.concat([dfinfo_1_strand,dfinfo_2_strand],axis=0) 191 | 192 | ############################################################# 193 | # extact sequnece from genome 194 | 195 | # load genome 196 | genome = Fasta(genomefile) 197 | 198 | # chrmosome info 199 | chrmosome_list = genome.keys() 200 | 201 | # save in dict 202 | res = {} 203 | for line in range(0,df_fianl.shape[0]): 204 | 205 | # chromosome strand 206 | fileds = df_fianl.iloc[line] 207 | chrom = fileds['chr'] 208 | strand = fileds['strand'] 209 | start = int(fileds['start']) 210 | end = int(fileds['end']) 211 | # key 212 | key = fileds['id'] 213 | 
# filter chromoseome 214 | if chrom in chrmosome_list: 215 | # extarct sequence 216 | if strand == '+': 217 | seq = genome[chrom][(start-1):end].seq 218 | elif strand == '-': 219 | seq = genome[chrom][(start-1):end].complement.reverse.seq 220 | else: 221 | pass 222 | # save in dict 223 | res.setdefault(key,'') 224 | res[key] += seq 225 | else: 226 | pass 227 | 228 | ############################################################# 229 | # 输出序列 230 | outputfile = open(outfile,'w') 231 | 232 | # fasta序列分割长度 233 | my_length = 60 234 | 235 | # 输出 236 | for key,val in res.items(): 237 | outputfile.write('>' + key + '\n') 238 | while len(val) > my_length: 239 | outputfile.write(val[0:my_length] + '\n') 240 | val = val[my_length:len(val)] 241 | outputfile.write(val + '\n') 242 | 243 | # 关闭文件 244 | outputfile.close() 245 | 246 | ################################################################################################ 247 | elif db == 'ucsc': 248 | # 打开 gtf 文件 249 | with gzip.open(gtffile,'rt') as gtf: 250 | # 信息保存在字典里 251 | trans_len = {} 252 | utr5_len = {} 253 | cds_len = {} 254 | for line in gtf: 255 | # 跳过注释行 256 | if line.startswith('#'): 257 | continue 258 | # 分割 259 | fields = line.split() 260 | # 类型 261 | type = fields[2] 262 | if type == 'exon': 263 | # 名称 264 | gene_name = fields[17].replace('"','').replace(';','') 265 | gene_id = fields[9].replace('"','').replace(';','') 266 | trans_id = fields[11].replace('"','').replace(';','') 267 | # 连接名称 268 | key = '|'.join([gene_name,gene_id,trans_id]) 269 | # 计算多个外显子长度 270 | length = int(fields[4]) - int(fields[3]) + 1 271 | # 累计求和 272 | trans_len.setdefault(key,0) 273 | trans_len[key] += length 274 | elif type == 'CDS': 275 | # 名称 276 | gene_name = fields[17].replace('"','').replace(';','') 277 | gene_id = fields[9].replace('"','').replace(';','') 278 | trans_id = fields[11].replace('"','').replace(';','') 279 | # 连接名称 280 | key = '|'.join([gene_name,gene_id,trans_id]) 281 | # 计算多个CDS长度 282 | length = int(fields[4]) 
- int(fields[3]) + 1 283 | # 累计求和 284 | cds_len.setdefault(key,0) 285 | cds_len[key] += length 286 | elif type == '5UTR': 287 | # 名称 288 | gene_name = fields[17].replace('"','').replace(';','') 289 | gene_id = fields[9].replace('"','').replace(';','') 290 | trans_id = fields[11].replace('"','').replace(';','') 291 | # 连接名称 292 | key = '|'.join([gene_name,gene_id,trans_id]) 293 | # 计算多个5'UTR长度 294 | length = int(fields[4]) - int(fields[3]) + 1 295 | # 累计求和 296 | utr5_len.setdefault(key,0) 297 | utr5_len[key] += length 298 | else: 299 | pass 300 | else: 301 | pass 302 | 303 | # fillwith no 5UTR genes 304 | new_utr5_len = {key:utr5_len.get(key,0) for key,val in cds_len.items()} 305 | 306 | # transorm into dataframe and merge by id 307 | df_tran = pd.DataFrame.from_dict(trans_len,orient='index',columns=['translength']) 308 | df_tran['ID'] = df_tran.index 309 | 310 | df_cds = pd.DataFrame.from_dict(cds_len,orient='index',columns=['cdslength']) 311 | df_cds['ID'] = df_cds.index 312 | 313 | df_5utr = pd.DataFrame.from_dict(new_utr5_len,orient='index',columns=['utr5length']) 314 | df_5utr['ID'] = df_5utr.index 315 | 316 | # 按id合并表格 317 | data_info = df_cds.merge(df_tran.merge(df_5utr,on='ID'),on='ID') 318 | 319 | # 添加基因名列 320 | data_info['gene_name'] = [i.split(sep='|')[0] for i in data_info['ID']] 321 | 322 | # 按gen_name cdslength translength 降序排序 323 | data_infonew = data_info.sort_values(by = ['gene_name','cdslength','translength'],ascending = False,inplace=False) 324 | 325 | # order columns 326 | data_infonew = data_infonew[['ID','gene_name','translength','utr5length','cdslength']] 327 | 328 | # 保存 329 | data_infonew.to_csv(r'All_transcripts_cds_info.csv', index=False) 330 | 331 | ############################ 332 | # 筛选最长转录本id 333 | longest_id = list(data_infonew.drop_duplicates(subset=['gene_name'],keep='first')['ID']) 334 | 335 | # 筛选最长转录本表格 336 | longest_data = data_infonew.loc[data_infonew.ID.isin(longest_id)] 337 | 338 | # 保存 339 | 
longest_data.to_csv(r'longest_cds_transcripts_info.csv', index=False) 340 | 341 | # 给 ID 添加 CDS 位置信息和转录本长度信息 342 | longest_data['ID'] = longest_data.ID + '|' + (longest_data.utr5length + 1).map(str) + '|' + \ 343 | (longest_data.utr5length + longest_data.cdslength).map(str) + '|' + \ 344 | (longest_data.translength).map(str) 345 | 346 | # order columns 347 | longest_data = longest_data[['ID','gene_name','translength','utr5length','cdslength']] 348 | 349 | ############################################################# 350 | # 储存最长转录本id 351 | transid = {line.split(sep='_')[2]:line for line in list(longest_data.ID)} 352 | 353 | infolist = [] 354 | with gzip.open('Homo_sapiens.GRCh38.103.gtf.gz','rt') as gtf: 355 | for line in gtf: 356 | # skip 357 | if line.startswith('#'): 358 | continue 359 | # split 360 | fields = line.split() 361 | # feature type 362 | type = fields[2] 363 | if type == 'exon': 364 | # pos 365 | chr = fields[0] 366 | start = fields[3] 367 | end = fields[4] 368 | strand = fields[6] 369 | # name 370 | gene_name = fields[17].replace('"','').replace(';','') 371 | gene_id = fields[9].replace('"','').replace(';','') 372 | trans_id = fields[11].replace('"','').replace(';','') 373 | if trans_id in transid: 374 | infolist.append([chr,start,end,strand,type,gene_name,gene_id,trans_id,transid[trans_id]]) 375 | else: 376 | pass 377 | else: 378 | pass 379 | 380 | # to dataframe 381 | dfinfo = pd.DataFrame(infolist,columns=['chr','start','end','strand','type','gene_name','gene_id','trans_id','id']) 382 | dfinfo_1_strand = dfinfo[dfinfo['strand'] == '+'] 383 | 384 | # descrese coord by - strand gene 385 | dfinfo_2_strand = dfinfo[dfinfo['strand'] == '-'] 386 | dfinfo_2_strand = dfinfo_2_strand.sort_values(by = ['trans_id','start','end'],ascending = False) 387 | 388 | # merge 389 | df_fianl = pd.concat([dfinfo_1_strand,dfinfo_2_strand],axis=0) 390 | 391 | ############################################################# 392 | # extact sequnece from genome 393 | 394 | # 
load genome 395 | genome = Fasta(genomefile) 396 | 397 | # chrmosome info 398 | chrmosome_list = genome.keys() 399 | 400 | # save in dict 401 | res = {} 402 | for line in range(0,df_fianl.shape[0]): 403 | 404 | # chromosome strand 405 | fileds = df_fianl.iloc[line] 406 | chrom = fileds['chr'] 407 | strand = fileds['strand'] 408 | start = int(fileds['start']) 409 | end = int(fileds['end']) 410 | # key 411 | key = fileds['id'] 412 | # filter chromoseome 413 | if chrom in chrmosome_list: 414 | # extarct sequence 415 | if strand == '+': 416 | seq = genome[chrom][(start-1):end].seq 417 | elif strand == '-': 418 | seq = genome[chrom][(start-1):end].complement.reverse.seq 419 | else: 420 | pass 421 | # save in dict 422 | res.setdefault(key,'') 423 | res[key] += seq 424 | else: 425 | pass 426 | 427 | ############################################################# 428 | # 输出序列 429 | outputfile = open(outfile,'w') 430 | 431 | # fasta序列分割长度 432 | my_length = 60 433 | 434 | # 输出 435 | for key,val in res.items(): 436 | outputfile.write('>' + key + '\n') 437 | while len(val) > my_length: 438 | outputfile.write(val[0:my_length] + '\n') 439 | val = val[my_length:len(val)] 440 | outputfile.write(val + '\n') 441 | 442 | # 关闭文件 443 | outputfile.close() 444 | 445 | #################################################################################### 446 | job_stop = time.time() 447 | print("Your job is done! ") 448 | print("Running with " + str(round(job_stop - job_start,2)) + " seconds!") 449 | 450 | if __name__=="__main__": 451 | main() -------------------------------------------------------------------------------- /src/GetTransTool/GetCDSLongestFromGencode.py: -------------------------------------------------------------------------------- 1 | def main(): 2 | """ 3 | Extract longest CDS regeion with longest transcript from gencode database transcripts fasta file. 4 | Only for human and mouse transcripts fasta file. 
5 | """ 6 | 7 | # 导入模块 8 | import pandas as pd 9 | import gzip 10 | import time 11 | import argparse 12 | 13 | parser = argparse.ArgumentParser(usage="GetCDSLongestFromGencode --file gencode.vM28.pc_transcripts.fa.gz --outfile longest_cds_trans.fa", 14 | description="Extract longest CDS regeion with longest transcript from gencode database transcripts fasta file.", 15 | epilog="Thank your for your support, if you have any questions or suggestions please contact me: 3219030654@stu.cpu.edu.cn.") 16 | parser.add_argument('-v','--version', action='version', version='%(prog)s 0.0.3') 17 | 18 | # 读取转录本文件 19 | parser.add_argument('-f','--file', type=str,action="store",dest="transfile",metavar="transfile",help='input your protein-coding transcripts file with ".gz" format. (gencode.vM28.pc_transcripts.fa.gz)') 20 | # 导出文件名称 21 | parser.add_argument('-o','--outfile', type=str,action="store",dest="longestfile",metavar="longestfile",help='output your longest transcript file. (longest_cds_trans.fa)') 22 | # 解析参数 23 | args = parser.parse_args() 24 | 25 | # 获取参数 26 | inputfile = args.transfile 27 | outfile = args.longestfile 28 | 29 | # main fuction 30 | print("Your job is running, please wait...") 31 | ###################################################################### 32 | job_start = time.time() 33 | ####################### 34 | # 储存id 35 | cdsinfo = {} 36 | 37 | # 储存改名文件 38 | tmpfile = open('name_changed.fa','w') 39 | 40 | # main code 41 | with gzip.open(inputfile,'rt') as pc: 42 | for line in pc: 43 | if line.startswith('>'): 44 | # split id 45 | fileds = line.split(sep='|') 46 | gene_name = '>' + fileds[5] 47 | gene_id = fileds[1] 48 | trans_id = fileds[0].replace('>','') 49 | trans_length = fileds[6] 50 | # include 5UTR+CDS+3UTR or 5UTR+CDS/ 51 | if fileds[8].startswith('CDS'): 52 | cds_range = fileds[8].split(sep=':')[1].split(sep='-') 53 | cds_start = cds_range[0] 54 | cds_end = cds_range[1] 55 | # cds length 56 | cds_len = int(cds_end) - int(cds_start) + 1 57 | 
fullname = '|'.join([gene_name,gene_id,trans_id,cds_start,cds_end,trans_length]) 58 | # save 59 | tmpfile.write(fullname + '\n') 60 | # save in dict 61 | cdsinfo[fullname] = str(cds_len) 62 | # include CDS+3UTR or only CDS 63 | else: 64 | cds_range = fileds[7].split(sep=':')[1].split(sep='-') 65 | cds_start = cds_range[0] 66 | cds_end = cds_range[1] 67 | # cds length 68 | cds_len = int(cds_end) - int(cds_start) + 1 69 | fullname = '|'.join([gene_name,gene_id,trans_id,cds_start,cds_end,trans_length]) 70 | # save 71 | tmpfile.write(fullname + '\n') 72 | # save in dict 73 | cdsinfo[fullname] = str(cds_len) 74 | else: 75 | # write seq 76 | tmpfile.write(line) 77 | 78 | # close file 79 | tmpfile.close() 80 | 81 | ################################################# 82 | # transform into datafarme 83 | tmp = [[key,key.split(sep='|')[0],int(key.split(sep='|')[5]),int(val)] for key,val in cdsinfo.items()] 84 | 85 | # 转为数据框 86 | data_info = pd.DataFrame(tmp,columns=['fullname','gene_name','translength','cdslength']) 87 | 88 | # 按gen_name cdslength translength 降序排序 89 | data_infonew = data_info.sort_values(by = ['gene_name','cdslength','translength'],ascending = False,inplace=False) 90 | 91 | # 保存 92 | data_infonew.to_csv(r'All_transcripts_cds_info.csv', index=False) 93 | 94 | # 筛选最长转录本id 95 | longest_id = list(data_infonew.drop_duplicates(subset=['gene_name'],keep='first')['fullname']) 96 | 97 | # 筛选最长转录本表格 98 | longest_data = data_infonew.loc[data_infonew.fullname.isin(longest_id)] 99 | 100 | # 保存 101 | longest_data.to_csv(r'longest_cds_transcripts_info.csv', index=False) 102 | 103 | ################################################# 104 | # prepare filter id 105 | filter_id = {id:0 for id in list(longest_data.fullname)} 106 | 107 | # 读取 fasta 文件保存为字典 108 | with open('name_changed.fa') as fa: 109 | fa_dict = {} 110 | for line in fa: 111 | if line.startswith('>'): 112 | seq_name = line.strip() 113 | fa_dict[seq_name] = '' 114 | else: 115 | # 序列 116 | fa_dict[seq_name] += 
line.replace('\n','') 117 | 118 | # 新建输出结果文件 119 | output_fa = open(outfile,'w') 120 | 121 | # fasta序列分割长度 122 | my_length = 60 123 | 124 | # 输出 125 | for key,val in fa_dict.items(): 126 | if key in filter_id: 127 | output_fa.write(key + '\n') 128 | while len(val) > my_length: 129 | output_fa.write(val[0:my_length] + '\n') 130 | val = val[my_length:len(val)] 131 | output_fa.write(val + '\n') 132 | 133 | # 关闭文件 134 | output_fa.close() 135 | 136 | #################################################################################### 137 | job_stop = time.time() 138 | print("Your job is done! ") 139 | print("Running with " + str(round(job_stop - job_start,2)) + " seconds!") 140 | 141 | if __name__=="__main__": 142 | main() -------------------------------------------------------------------------------- /src/GetTransTool/GetLongestTransFromGTF.py: -------------------------------------------------------------------------------- 1 | def main(): 2 | """ 3 | Extract longest transcript from gtf format annotation file based on gencode/ensembl/ucsc database. 4 | """ 5 | 6 | # 导入模块 7 | import pandas as pd 8 | import gzip 9 | import time 10 | import argparse 11 | from pyfaidx import Fasta 12 | 13 | parser = argparse.ArgumentParser(usage="GetLongestTransFromGTF --database ensembl --gtffile Homo_sapiens.GRCh38.101.gtf.gz --genome Homo_sapiens.GRCh38.dna.primary_assembly.fa --outfile longest_trans.fa", 14 | description="Extract longest transcript from gtf format annotation file based on gencode/ensembl/ucsc database.", 15 | epilog="Thank your for your support, if you have any questions or suggestions please contact me: 3219030654@stu.cpu.edu.cn.") 16 | # version 17 | parser.add_argument('-v','--version', action='version', version='%(prog)s 0.0.7') 18 | # 读取注释类型文件 19 | parser.add_argument('-d','--database',type=str,action="store",dest="database",metavar="databse",choices=['ucsc','ensembl','gencode'],default="ensembl", 20 | help='which annotation database you choose. 
(default="ensembl", ucsc/ensembl/gencode)') 21 | # 读取gtf文件 22 | parser.add_argument('-g','--gtffile', type=str,action="store",dest="gtffile",metavar="gtffile", 23 | help='input your GTF file with ".gz" format.') 24 | # 读取基因组fasta文件 25 | parser.add_argument('-fa','--genome',type=str,action="store",dest="genome",metavar="genome", 26 | help='your genome fasta file matched with your GTF file with ".fa/.fasta" format. (Homo_sapiens.GRCh38.dna.primary_assembly.fa)') 27 | # 导出文件名称 28 | parser.add_argument('-o','--outfile', type=str,action="store",dest="longestfile",metavar="longestfile", 29 | help='output your longest transcript file. (longest_trans.fa)') 30 | # 解析参数 31 | args = parser.parse_args() 32 | 33 | # 获取参数 34 | db = args.database 35 | gtffile = args.gtffile 36 | genomefile = args.genome 37 | outfile = args.longestfile 38 | 39 | # main fuction 40 | print("Your job is running, please wait...") 41 | ###################################################################### 42 | job_start = time.time() 43 | ####################### 44 | if db == 'ensembl': 45 | #################################################################################### 46 | # 信息保存在字典里 47 | info = {} 48 | # 打开测试 gtf 文件 49 | with gzip.open(gtffile,'rt') as gtf: 50 | for line in gtf: 51 | # 跳过注释行 52 | if line.startswith('#'): 53 | continue 54 | # 分割 55 | fields = line.split() 56 | # 类型 57 | type = fields[2] 58 | if type == 'exon': 59 | # 名称 60 | gene_name = fields[19].replace('"','').replace(';','') 61 | gene_id = fields[9].replace('"','').replace(';','') 62 | trans_id = fields[13].replace('"','').replace(';','') 63 | biotype = fields[23].replace('"','').replace(';','') 64 | # 连接名称 65 | key = '|'.join([gene_name,gene_id,trans_id,biotype]) 66 | # 计算多个外显子长度 67 | start = int(fields[3]) 68 | end = int(fields[4]) 69 | length = end - start + 1 70 | # 累计求和 71 | info.setdefault(key,0) 72 | info[key] += length 73 | ###################################### 74 | # 转为数据框 75 | res = pd.DataFrame(pd.Series(info), 
columns = ['transcript_length']) 76 | 77 | # 添加基因名列 78 | res['gene_name'] = [line.split(sep='|')[0] for line in list(res.index[:])] 79 | 80 | # 排序 81 | res_sorted = res.sort_values(by = ['gene_name','transcript_length'],ascending=False) 82 | 83 | # 筛选最长转录本id 84 | longest_id = res_sorted.drop_duplicates(subset=['gene_name'],keep='first').index.values.tolist() 85 | 86 | # 筛选最长转录本表格 87 | longest_data = res_sorted.loc[res_sorted.index.isin(longest_id)] 88 | 89 | longest_data['ID'] = longest_data.index 90 | # order columns 91 | longest_data = longest_data[['ID','gene_name','transcript_length']] 92 | 93 | # 保存 94 | longest_data.to_csv(r'longest_transcripts_info.csv', index=False) 95 | 96 | ########################################################### 97 | # 储存最长转录本id 98 | transid = {line.split(sep='_')[2]:line for line in list(longest_data.ID)} 99 | 100 | infolist = [] 101 | with gzip.open(gtffile) as gtf: 102 | for line in gtf: 103 | # skip 104 | if line.startswith('#'): 105 | continue 106 | # split 107 | fields = line.split() 108 | # feature type 109 | type = fields[2] 110 | if type == 'exon': 111 | # pos 112 | chr = fields[0] 113 | start = fields[3] 114 | end = fields[4] 115 | strand = fields[6] 116 | # name 117 | gene_name = fields[19].replace('"','').replace(';','') 118 | gene_id = fields[9].replace('"','').replace(';','') 119 | trans_id = fields[13].replace('"','').replace(';','') 120 | if trans_id in transid: 121 | infolist.append([chr,start,end,strand,type,gene_name,gene_id,trans_id,transid[trans_id]]) 122 | else: 123 | pass 124 | else: 125 | pass 126 | 127 | # to dataframe 128 | dfinfo = pd.DataFrame(infolist,columns=['chr','start','end','strand','type','gene_name','gene_id','trans_id','id']) 129 | dfinfo_1_strand = dfinfo[dfinfo['strand'] == '+'] 130 | 131 | # descrese coord by - strand gene 132 | dfinfo_2_strand = dfinfo[dfinfo['strand'] == '-'] 133 | dfinfo_2_strand = dfinfo_2_strand.sort_values(by = ['trans_id','start','end'],ascending = False) 134 | 135 | # 
merge 136 | df_fianl = pd.concat([dfinfo_1_strand,dfinfo_2_strand],axis=0) 137 | 138 | ########################################################### 139 | # extact sequnece from genome 140 | 141 | # load genome 142 | genome = Fasta(genomefile) 143 | 144 | # chrmosome info 145 | chrmosome_list = genome.keys() 146 | 147 | # save in dict 148 | res = {} 149 | for line in range(0,df_fianl.shape[0]): 150 | 151 | # chromosome strand 152 | fileds = df_fianl.iloc[line] 153 | chrom = fileds['chr'] 154 | strand = fileds['strand'] 155 | start = int(fileds['start']) 156 | end = int(fileds['end']) 157 | # key 158 | key = fileds['id'] 159 | # filter chromoseome 160 | if chrom in chrmosome_list: 161 | # extarct sequence 162 | if strand == '+': 163 | seq = genome[chrom][(start-1):end].seq 164 | elif strand == '-': 165 | seq = genome[chrom][(start-1):end].complement.reverse.seq 166 | else: 167 | pass 168 | # save in dict 169 | res.setdefault(key,'') 170 | res[key] += seq 171 | else: 172 | pass 173 | 174 | ########################################################### 175 | # 输出序列 176 | outputfile = open(outfile,'w') 177 | 178 | # fasta序列分割长度 179 | my_length = 60 180 | ########################################################### 181 | # 输出 182 | for key,val in res.items(): 183 | outputfile.write('>' + key + '\n') 184 | while len(val) > my_length: 185 | outputfile.write(val[0:my_length] + '\n') 186 | val = val[my_length:len(val)] 187 | outputfile.write(val + '\n') 188 | 189 | # 关闭文件 190 | outputfile.close() 191 | #################################################################################### 192 | elif db == 'gencode': 193 | # 信息保存在字典里 194 | info = {} 195 | # 打开测试 gtf 文件 196 | with gzip.open(gtffile,'rt') as gtf: 197 | for line in gtf: 198 | # 跳过注释行 199 | if line.startswith('#'): 200 | continue 201 | # 分割 202 | fields = line.split() 203 | # 类型 204 | type = fields[2] 205 | if type == 'exon': 206 | # 名称 207 | gene_name = fields[15].replace('"','').replace(';','') 208 | gene_id = 
fields[9].replace('"','').replace(';','') 209 | trans_id = fields[11].replace('"','').replace(';','') 210 | biotype = fields[13].replace('"','').replace(';','') 211 | # 连接名称 212 | key = '|'.join([gene_name,gene_id,trans_id,biotype]) 213 | # 计算多个外显子长度 214 | start = int(fields[3]) 215 | end = int(fields[4]) 216 | length = end - start + 1 217 | # 累计求和 218 | info.setdefault(key,0) 219 | info[key] += length 220 | 221 | ###################################### 222 | # 转为数据框 223 | res = pd.DataFrame(pd.Series(info), columns = ['transcript_length']) 224 | 225 | # 添加基因名列 226 | res['gene_name'] = [line.split(sep='|')[0] for line in list(res.index[:])] 227 | 228 | # 排序 229 | res_sorted = res.sort_values(by = ['gene_name','transcript_length'],ascending=False) 230 | 231 | # 筛选最长转录本id 232 | longest_id = res_sorted.drop_duplicates(subset=['gene_name'],keep='first').index.values.tolist() 233 | 234 | # 筛选最长转录本表格 235 | longest_data = res_sorted.loc[res_sorted.index.isin(longest_id)] 236 | 237 | longest_data['ID'] = longest_data.index 238 | # order columns 239 | longest_data = longest_data[['ID','gene_name','transcript_length']] 240 | 241 | # 保存 242 | longest_data.to_csv(r'longest_transcripts_info.csv', index=True) 243 | 244 | ########################################################### 245 | # 储存最长转录本id 246 | transid = {line.split(sep='|')[2]:line for line in list(longest_data.ID)} 247 | 248 | infolist = [] 249 | with gzip.open(gtffile,'rt') as gtf: 250 | for line in gtf: 251 | # skip 252 | if line.startswith('#'): 253 | continue 254 | # split 255 | fields = line.split() 256 | # feature type 257 | type = fields[2] 258 | if type == 'exon': 259 | # pos 260 | chr = fields[0] 261 | start = fields[3] 262 | end = fields[4] 263 | strand = fields[6] 264 | # name 265 | gene_name = fields[15].replace('"','').replace(';','') 266 | gene_id = fields[9].replace('"','').replace(';','') 267 | trans_id = fields[11].replace('"','').replace(';','') 268 | if trans_id in transid: 269 | 
infolist.append([chr,start,end,strand,type,gene_name,gene_id,trans_id,transid[trans_id]]) 270 | else: 271 | pass 272 | else: 273 | pass 274 | 275 | # to dataframe 276 | dfinfo = pd.DataFrame(infolist,columns=['chr','start','end','strand','type','gene_name','gene_id','trans_id','id']) 277 | dfinfo_1_strand = dfinfo[dfinfo['strand'] == '+'] 278 | 279 | # descrese coord by - strand gene 280 | dfinfo_2_strand = dfinfo[dfinfo['strand'] == '-'] 281 | dfinfo_2_strand = dfinfo_2_strand.sort_values(by = ['trans_id','start','end'],ascending = False) 282 | 283 | # merge 284 | df_fianl = pd.concat([dfinfo_1_strand,dfinfo_2_strand],axis=0) 285 | ########################################################### 286 | 287 | # extact sequnece from genome 288 | 289 | # load genome 290 | genome = Fasta(genomefile) 291 | 292 | # chrmosome info 293 | chrmosome_list = genome.keys() 294 | 295 | # save in dict 296 | res = {} 297 | for line in range(0,df_fianl.shape[0]): 298 | 299 | # chromosome strand 300 | fileds = df_fianl.iloc[line] 301 | chrom = fileds['chr'] 302 | strand = fileds['strand'] 303 | start = int(fileds['start']) 304 | end = int(fileds['end']) 305 | # key 306 | key = fileds['id'] 307 | # filter chromoseome 308 | if chrom in chrmosome_list: 309 | # extarct sequence 310 | if strand == '+': 311 | seq = genome[chrom][(start-1):end].seq 312 | elif strand == '-': 313 | seq = genome[chrom][(start-1):end].complement.reverse.seq 314 | else: 315 | pass 316 | # save in dict 317 | res.setdefault(key,'') 318 | res[key] += seq 319 | else: 320 | pass 321 | 322 | ########################################################### 323 | # 输出序列 324 | outputfile = open(outfile,'w') 325 | 326 | # fasta序列分割长度 327 | my_length = 60 328 | 329 | # 输出 330 | for key,val in res.items(): 331 | outputfile.write('>' + key + '\n') 332 | while len(val) > my_length: 333 | outputfile.write(val[0:my_length] + '\n') 334 | val = val[my_length:len(val)] 335 | outputfile.write(val + '\n') 336 | 337 | # 关闭文件 338 | 
outputfile.close() 339 | #################################################################################### 340 | elif db == 'ucsc': 341 | # 信息保存在字典里 342 | info = {} 343 | # 打开测试 gtf 文件 344 | with gzip.open(gtffile,'rt') as gtf: 345 | for line in gtf: 346 | # 跳过注释行 347 | if line.startswith('#'): 348 | continue 349 | # 分割 350 | fields = line.split() 351 | # 类型 352 | type = fields[2] 353 | if type == 'exon': 354 | # 名称 355 | gene_name = fields[17].replace('"','').replace(';','') 356 | gene_id = fields[9].replace('"','').replace(';','') 357 | trans_id = fields[11].replace('"','').replace(';','') 358 | # 连接名称 359 | key = '|'.join([gene_name,gene_id,trans_id]) 360 | # 计算多个外显子长度 361 | start = int(fields[3]) 362 | end = int(fields[4]) 363 | length = end - start + 1 364 | # 累计求和 365 | info.setdefault(key,0) 366 | info[key] += length 367 | ###################################### 368 | # 转为数据框 369 | res = pd.DataFrame(pd.Series(info), columns = ['transcript_length']) 370 | 371 | # 添加基因名列 372 | res['gene_name'] = [line.split(sep='|')[0] for line in list(res.index[:])] 373 | 374 | # 排序 375 | res_sorted = res.sort_values(by = ['gene_name','transcript_length'],ascending=False) 376 | 377 | # 筛选最长转录本id 378 | longest_id = res_sorted.drop_duplicates(subset=['gene_name'],keep='first').index.values.tolist() 379 | 380 | # 筛选最长转录本表格 381 | longest_data = res_sorted.loc[res_sorted.index.isin(longest_id)] 382 | 383 | longest_data['ID'] = longest_data.index 384 | # order columns 385 | longest_data = longest_data[['ID','gene_name','transcript_length']] 386 | 387 | # 保存 388 | longest_data.to_csv(r'longest_transcripts_info.csv', index=True) 389 | 390 | ########################################################### 391 | # 储存最长转录本id 392 | transid = {line.split(sep='|')[2]:line for line in list(longest_data.ID)} 393 | 394 | infolist = [] 395 | with gzip.open(gtffile) as gtf: 396 | for line in gtf: 397 | # skip 398 | if line.startswith('#'): 399 | continue 400 | # split 401 | fields = line.split() 
402 | # feature type 403 | type = fields[2] 404 | if type == 'exon': 405 | # pos 406 | chr = fields[0] 407 | start = fields[3] 408 | end = fields[4] 409 | strand = fields[6] 410 | # name 411 | gene_name = fields[17].replace('"','').replace(';','') 412 | gene_id = fields[9].replace('"','').replace(';','') 413 | trans_id = fields[11].replace('"','').replace(';','') 414 | if trans_id in transid: 415 | infolist.append([chr,start,end,strand,type,gene_name,gene_id,trans_id,transid[trans_id]]) 416 | else: 417 | pass 418 | else: 419 | pass 420 | 421 | # to dataframe 422 | dfinfo = pd.DataFrame(infolist,columns=['chr','start','end','strand','type','gene_name','gene_id','trans_id','id']) 423 | dfinfo_1_strand = dfinfo[dfinfo['strand'] == '+'] 424 | 425 | # descrese coord by - strand gene 426 | dfinfo_2_strand = dfinfo[dfinfo['strand'] == '-'] 427 | dfinfo_2_strand = dfinfo_2_strand.sort_values(by = ['trans_id','start','end'],ascending = False) 428 | 429 | # merge 430 | df_fianl = pd.concat([dfinfo_1_strand,dfinfo_2_strand],axis=0) 431 | 432 | ########################################################### 433 | # extact sequnece from genome 434 | 435 | # load genome 436 | genome = Fasta(genomefile) 437 | 438 | # chrmosome info 439 | chrmosome_list = genome.keys() 440 | 441 | # save in dict 442 | res = {} 443 | for line in range(0,df_fianl.shape[0]): 444 | 445 | # chromosome strand 446 | fileds = df_fianl.iloc[line] 447 | chrom = fileds['chr'] 448 | strand = fileds['strand'] 449 | start = int(fileds['start']) 450 | end = int(fileds['end']) 451 | # key 452 | key = fileds['id'] 453 | # filter chromoseome 454 | if chrom in chrmosome_list: 455 | # extarct sequence 456 | if strand == '+': 457 | seq = genome[chrom][(start-1):end].seq 458 | elif strand == '-': 459 | seq = genome[chrom][(start-1):end].complement.reverse.seq 460 | else: 461 | pass 462 | # save in dict 463 | res.setdefault(key,'') 464 | res[key] += seq 465 | else: 466 | pass 467 | 468 | 
########################################################### 469 | 470 | # 输出序列 471 | outputfile = open(outfile,'w') 472 | 473 | # fasta序列分割长度 474 | my_length = 60 475 | 476 | # 输出 477 | for key,val in res.items(): 478 | outputfile.write('>' + key + '\n') 479 | while len(val) > my_length: 480 | outputfile.write(val[0:my_length] + '\n') 481 | val = val[my_length:len(val)] 482 | outputfile.write(val + '\n') 483 | 484 | # 关闭文件 485 | outputfile.close() 486 | 487 | #################################################################################### 488 | job_stop = time.time() 489 | print("Your job is done! ") 490 | print("Running with " + str(round(job_stop - job_start,2)) + " seconds!") 491 | 492 | if __name__=="__main__": 493 | main() -------------------------------------------------------------------------------- /src/GetTransTool/GetLongestTransFromGencode.py: -------------------------------------------------------------------------------- 1 | def main(): 2 | """ 3 | Extract longest transcript from gencode database transcripts fasta file. 4 | Only for human and mouse transcripts fasta file. 5 | """ 6 | 7 | # 导入模块 8 | import pandas as pd 9 | import gzip 10 | import time 11 | import argparse 12 | 13 | parser = argparse.ArgumentParser(usage="GetLongestTransFromGencode --file gencode.vM28.transcripts.fa.gz --outfile longest_trans.fa", 14 | description="Get longest transcripts from gencode transcripts fasta file.", 15 | epilog="Thank your for your support, if you have any questions or suggestions please contact me: 3219030654@stu.cpu.edu.cn.") 16 | parser.add_argument('-v','--version', action='version', version='%(prog)s 0.0.3') 17 | 18 | # 读取转录本文件 19 | parser.add_argument('-f','--file', type=str,action="store",dest="transfile",metavar="transfile",help='input your transcripts file with ".gz" format. 
(gencode.vM28.transcripts.fa.gz)') 20 | # 导出文件名称 21 | parser.add_argument('-o','--outfile', type=str,action="store",dest="longestfile",metavar="longestfile",help='output your longest transcript file. (longest_trans.fa)') 22 | # 解析参数 23 | args = parser.parse_args() 24 | 25 | # 获取参数 26 | inputfile = args.transfile 27 | outfile = args.longestfile 28 | 29 | # main fuction 30 | print("Your job is running, please wait...") 31 | ###################################################################### 32 | job_start = time.time() 33 | ####################### 34 | # 保存ID列表 35 | info = [] 36 | with gzip.open(inputfile,'rt') as tx: 37 | for line in tx: 38 | if line.startswith('>'): 39 | info_name = line.replace('\n','') 40 | info.append(info_name) 41 | 42 | # 更改名称 43 | comb = [] 44 | for line in info: 45 | # 分割取出需要信息 46 | gene_name = line.split('|')[5] 47 | gene_id = line.split('|')[1] 48 | transcript_id = line.split('|')[0].replace('>','') 49 | length = line.split('|')[6] 50 | 51 | # 连接完整名称 52 | # finame = gene_name + '_' + gene_id + '_' + transcript_id + '_' + length 53 | finame = '|'.join([gene_name,gene_id,transcript_id,length]) 54 | # 保存到列表 55 | comb.append([finame,gene_name,int(length)]) 56 | 57 | # 转为数据框 58 | data_info = pd.DataFrame(comb,columns=['fullname','gene_name','translength']) 59 | 60 | ###################################################################### 61 | # 新建输出改名结果文件 62 | output_fa = open('name_changed.fa','w') 63 | 64 | # loop change ID 65 | with gzip.open(inputfile,'rt') as tx: 66 | for line in tx: 67 | if line.startswith('>'): 68 | info_name = line.replace('\n','') 69 | # 分割取出需要信息 70 | gene_name = info_name.split('|')[5] 71 | gene_id = info_name.split('|')[1] 72 | transcript_id = info_name.split('|')[0].replace('>','') 73 | length = info_name.split('|')[6] 74 | # 连接完整名称 75 | finame = '>' + gene_name + '|' + gene_id + '|' + transcript_id + '|' + length + '\n' 76 | # 命名 77 | output_fa.write(finame) 78 | else: 79 | seq = line 80 | output_fa.write(seq) 81 
| 82 | # 关闭文件 83 | output_fa.close() 84 | 85 | ###################################################################### 86 | # 按转录本序列降序排序 87 | data_infonew = data_info.sort_values(by = ['gene_name','translength'],ascending = False,inplace=False) 88 | 89 | # 筛选最长转录本id 90 | longest_id = list('>' + data_infonew.drop_duplicates(subset=['gene_name'],keep='first')['fullname']) 91 | 92 | ####################### save csv 93 | # remove '>' 94 | longest_idClean = [id.replace('>','') for id in longest_id] 95 | 96 | # 筛选最长转录本表格 97 | longest_data = data_infonew.loc[data_infonew.fullname.isin(longest_idClean)] 98 | 99 | # 保存 100 | longest_data.to_csv(r'longest_transcripts_info.csv', index=False) 101 | 102 | ###################################################################### 103 | # 保存筛选的id为字典 104 | filter_id = {id:0 for id in longest_id} 105 | 106 | # 读取 fasta 文件保存为字典 107 | with open('name_changed.fa') as fa: 108 | fa_dict = {} 109 | for line in fa: 110 | if line.startswith('>'): 111 | seq_name = line.strip() 112 | fa_dict[seq_name] = '' 113 | else: 114 | # 序列 115 | fa_dict[seq_name] += line.replace('\n','') 116 | 117 | # 新建输出结果文件 118 | output_fa = open(outfile,'w') 119 | 120 | # fasta序列分割长度 121 | my_length = 60 122 | 123 | # 输出 124 | for key,val in fa_dict.items(): 125 | if key in filter_id: 126 | output_fa.write(key + '\n') 127 | while len(val) > my_length: 128 | output_fa.write(val[0:my_length] + '\n') 129 | val = val[my_length:len(val)] 130 | output_fa.write(val + '\n') 131 | 132 | # 关闭文件 133 | output_fa.close() 134 | 135 | #################################################################################### 136 | job_stop = time.time() 137 | print("Your job is done! ") 138 | print("Running with " + str(round(job_stop - job_start,2)) + " seconds!") 139 | 140 | if __name__=="__main__": 141 | main() --------------------------------------------------------------------------------