├── .gitignore ├── LICENSE ├── README.md ├── dist ├── GetTransTool-0.0.3-py3-none-any.whl └── GetTransTool-0.0.3.tar.gz ├── pyproject.toml ├── setup.cfg └── src └── GetTransTool ├── GetCDSLongestFromGTF.py ├── GetCDSLongestFromGencode.py ├── GetLongestTransFromGTF.py └── GetLongestTransFromGencode.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2018 The Python Packaging Authority 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GetTransTool Package 2 | 3 | There are four types of methods to extract **longest transcript** or **longest CDS regeion with longest transcript** from **transcripts fasta** file or **GTF** file. 4 | 5 | --- 6 | 7 | - 1.Extract longest transcript from gencode transcripts fasta file. 8 | 9 | - 2.Extract longest transcript from gtf format annotation file based on gencode/ensembl/ucsc database. 10 | 11 | - 3.Extract longest CDS regeion with longest transcript from gencode database transcripts fasta file. 12 | 13 | - 4.Extract longest CDS regeion with longest transcript from gtf format annotation file based on ensembl/ucsc database. 14 | 15 | ## Install 16 | 17 | ```shell 18 | $ pip install GetTransTool 19 | ``` 20 | 21 | ## Usage 22 | 23 | ## 1. get longest transcript from gencode transcripts fasta file: 24 | 25 | ### help infomation: 26 | 27 | ```shell 28 | $ GetLongestTransFromGencode -h 29 | usage: GetLongestTransFromGencode --file gencode.vM28.transcripts.fa.gz --outfile longest_trans.fa 30 | 31 | Get longest transcripts from gencode transcripts fasta file. 32 | 33 | optional arguments: 34 | -h, --help show this help message and exit 35 | -v, --version show program's version number and exit 36 | -f transfile, --file transfile 37 | input your transcripts file with ".gz" format. (gencode.vM28.transcripts.fa.gz) 38 | -o longestfile, --outfile longestfile 39 | output your longest transcript file. 
(longest_trans.fa) 40 | 41 | Thank your for your support, if you have any questions or suggestions please contact me: 3219030654@stu.cpu.edu.cn. 42 | ``` 43 | 44 | ### usage: 45 | 46 | ```shell 47 | $ GetLongestTransFromGencode --file gencode.vM28.transcripts.fa.gz --outfile longest_trans_gencode.fa 48 | Your job is running, please wait... 49 | Your job is done! 50 | Running with 32.33 seconds! 51 | ``` 52 | 53 | there will be three files produced including **name_changed.fa**, **longest_transcripts_info.csv**, **longest_trans_gencode.fa**. 54 | 55 | > name_changed.fa: 56 | 57 | ``` 58 | >4933401J01Rik_ENSMUSG00000102693.2_ENSMUST00000193812.2_1070 59 | AAGGAAAGAGGATAACACTTGAAATGTAAATAAAGAAAATACCTAATAAAAATAAATAAA 60 | AACATGCTTTCAAAGGAAATAAAAAGTTGGATTCAAAAATTTAACTTTTGCTCATTTGGT 61 | ATAATCAAGGAAAAGACCTTTGCATATAAAATATATTTTGAATAAAATTCAGTGGAAGAA 62 | ... 63 | ``` 64 | 65 | > longest_transcripts_info.csv: 66 | 67 | this is the longest transcripts exon length information. 68 | 69 | ``` 70 | fullname,gene_name,translength 71 | snoZ196_ENSMUSG00002074855.1_ENSMUST00020182568.1_35,snoZ196,35 72 | snoZ159_ENSMUSG00002075734.1_ENSMUST00020182611.1_87,snoZ159,87 73 | n-R5s93_ENSMUSG00000119639.1_ENSMUST00000240071.1_119,n-R5s93,119 74 | ... 75 | ``` 76 | 77 | > longest_trans_gencode.fa: 78 | 79 | this is the filtered longest transcript fasta file. 80 | 81 | ``` 82 | >4933401J01Rik_ENSMUSG00000102693.2_ENSMUST00000193812.2_1070 83 | AAGGAAAGAGGATAACACTTGAAATGTAAATAAAGAAAATACCTAATAAAAATAAATAAA 84 | AACATGCTTTCAAAGGAAATAAAAAGTTGGATTCAAAAATTTAACTTTTGCTCATTTGGT 85 | ATAATCAAGGAAAAGACCTTTGCATATAAAATATATTTTGAATAAAATTCAGTGGAAGAA 86 | ... 87 | ``` 88 | 89 | --- 90 | 91 | ## 2. 
Extract longest transcript from gtf format annotation file based on gencode/ensembl/ucsc database: 92 | 93 | ### help infomation: 94 | 95 | ```shell 96 | $ GetLongestTransFromGTF -h 97 | usage: GetLongestTransFromGTF --database ensembl --gtffile Homo_sapiens.GRCh38.101.gtf.gz --genome Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz --outfile longest_trans.fa 98 | 99 | Extract longest transcript from gtf format annotation file based on gencode/ensembl/ucsc database. 100 | 101 | optional arguments: 102 | -h, --help show this help message and exit 103 | -v, --version show program's version number and exit 104 | -d databse, --database databse 105 | which annotation database you choose. (default="ensembl", ucsc/ensembl/gencode) 106 | -g gtffile, --gtffile gtffile 107 | input your GTF file with ".gz" format. 108 | -fa genome, --genome genome 109 | your genome fasta file matched with your GTF file with ".gz" format. (Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz) 110 | -o longestfile, --outfile longestfile 111 | output your longest transcript file. (longest_trans.fa) 112 | 113 | Thank your for your support, if you have any questions or suggestions please contact me: 3219030654@stu.cpu.edu.cn. 114 | ``` 115 | 116 | ### usage: 117 | 118 | ```shell 119 | $ GetLongestTransFromGTF --database ensembl --gtffile Homo_sapiens.GRCh38.103.gtf.gz --genome Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz --outfile longest_trans_ensembl.fa 120 | Your job is running, please wait... 121 | Your job is done! 122 | Running with 159.51 seconds! 123 | ``` 124 | 125 | there will be three files produced including **longest_transcripts_info.csv**, **longest_trans.gtf**, **longest_trans_ensembl.fa**. 
126 | 127 | > longest_transcripts_info.csv: 128 | 129 | ``` 130 | ,transcript_length,gene_name 131 | snoZ196_ENSG00000281780_ENST00000625269_snoRNA,89,snoZ196 132 | hsa-mir-423_ENSG00000266919_ENST00000586878_lncRNA,94,hsa-mir-423 133 | hsa-mir-1253_ENSG00000272920_ENST00000609567_lncRNA,105,hsa-mir-1253 134 | ... 135 | ``` 136 | 137 | > longest_trans.gtf: 138 | 139 | this is the gtf information for the longest transcripts. 140 | 141 | ``` 142 | 1 havana gene 11869 14409 . + . gene_id "ENSG00000223972"; gene_version "5"; gene_name "DDX11L1"; gene_source "havana"; gene_biotype "transcribed_unprocessed_pseudogene"; 143 | 1 havana transcript 11869 14409 . + . gene_id "ENSG00000223972"; gene_version "5"; transcript_id "ENST00000456328"; transcript_version "2"; gene_name "DDX11L1"; gene_source "havana"; gene_biotype "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; tag "basic"; transcript_support_level "1"; 144 | 1 havana exon 11869 12227 . + . gene_id "ENSG00000223972"; gene_version "5"; transcript_id "ENST00000456328"; transcript_version "2"; exon_number "1"; gene_name "DDX11L1"; gene_source "havana"; gene_biotype "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; exon_id "ENSE00002234944"; exon_version "1"; tag "basic"; transcript_support_level "1"; 145 | 1 havana exon 12613 12721 . + . gene_id "ENSG00000223972"; gene_version "5"; transcript_id "ENST00000456328"; transcript_version "2"; exon_number "2"; gene_name "DDX11L1"; gene_source "havana"; gene_biotype "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; exon_id "ENSE00003582793"; exon_version "1"; tag "basic"; transcript_support_level "1"; 146 | 1 havana exon 13221 14409 . + . 
gene_id "ENSG00000223972"; gene_version "5"; transcript_id "ENST00000456328"; transcript_version "2"; exon_number "3"; gene_name "DDX11L1"; gene_source "havana"; gene_biotype "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; exon_id "ENSE00002312635"; exon_version "1"; tag "basic"; transcript_support_level "1"; 147 | 1 havana gene 14404 29570 . - . gene_id "ENSG00000227232"; gene_version "5"; gene_name "WASH7P"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; 148 | ``` 149 | 150 | > longest_trans_ensembl.fa: 151 | 152 | ``` 153 | >DDX11L1_ENSG00000223972_ENST00000456328_transcribed_unprocessed_pseudogene 154 | GTTAACTTGCCGTCAGCCTTTTCTTTGACCTCTTCTTTCTGTTCATGTGTATTTGCTGTC 155 | TCTTAGCCCAGACTTCCCGTGTCCTTTCCACCGGGCCTTTGAGAGGTCACAGGGTCTTGA 156 | TGCTGTGGTCTTCATCTGCAGGTGTCTGACTTCCAGCAACTGCTGGCCTGTGCCAGGGTG 157 | ... 158 | ``` 159 | 160 | for ucsc: 161 | 162 | ``` 163 | $ GetLongestTransFromGTF --database ucsc --gtffile hg19.ncbiRefSeq.gtf.gz --genome hg19.fa.gz --outfile longest_trans_ucsc.fa 164 | ``` 165 | 166 | --- 167 | 168 | ## 3. Extract longest CDS regeion with longest transcript from gencode database transcripts fasta file. 169 | 170 | ### help infomation: 171 | 172 | ```shell 173 | $ GetCDSLongestFromGencode -h 174 | usage: GetCDSLongestFromGencode --file gencode.vM28.pc_transcripts.fa.gz --outfile longest_cds_trans.fa 175 | 176 | Extract longest CDS regeion with longest transcript from gencode database transcripts fasta file. 177 | 178 | optional arguments: 179 | -h, --help show this help message and exit 180 | -v, --version show program's version number and exit 181 | -f transfile, --file transfile 182 | input your protein-coding transcripts file with ".gz" format. (gencode.vM28.pc_transcripts.fa.gz) 183 | -o longestfile, --outfile longestfile 184 | output your longest transcript file. 
(longest_cds_trans.fa) 185 | 186 | Thank your for your support, if you have any questions or suggestions please contact me: 3219030654@stu.cpu.edu.cn. 187 | ``` 188 | 189 | ### usage: 190 | 191 | ```shell 192 | $ GetCDSLongestFromGencode --file gencode.vM28.pc_transcripts.fa.gz --outfile longest_cds_trans_gencode.fa 193 | Your job is running, please wait... 194 | Your job is done! 195 | Running with 17.67 seconds! 196 | ``` 197 | 198 | there will be four files produced including **name_changed.fa**, **All_transcripts_cds_info.csv**, **longest_cds_transcripts_info.csv**, **longest_cds_trans_gencode.fa**. 199 | 200 | > name_changed.fa: 201 | 202 | ``` 203 | >Xkr4_ENSMUSG00000051951.6_ENSMUST00000070533.5_151_2094_3634 204 | GCGGCGGCGGGCGAGCGGGCGCTGGAGTAGGAGCTGGGGAGCGGCGCGGCCGGGGAAGGA 205 | AGCCAGGGCGAGGCGAGGAGGTGGCGGGAGGAGGAGACAGCAGGGACAGGTGTCAGATAA 206 | AGGAGTGCTCTCCTCCGCTGCCGAGGCATCATGGCCGCTAAGTCAGACGGGAGGCTGAAG 207 | ... 208 | ``` 209 | 210 | > All_transcripts_cds_info.csv: 211 | 212 | this is the all transcripts cds and exon length information. 213 | 214 | ``` 215 | fullname,gene_name,translength,cdslength 216 | >mt-Nd6_ENSMUSG00000064368.1_ENSMUST00000082419.1_1_519_519,>mt-Nd6,519,519 217 | >mt-Nd5_ENSMUSG00000064367.1_ENSMUST00000082418.1_1_1824_1824,>mt-Nd5,1824,1824 218 | >mt-Nd4l_ENSMUSG00000065947.1_ENSMUST00000084013.1_1_297_297,>mt-Nd4l,297,297 219 | ... 220 | ``` 221 | 222 | > longest_cds_transcripts_info.csv: 223 | 224 | ``` 225 | fullname,gene_name,translength,cdslength 226 | >mt-Nd6_ENSMUSG00000064368.1_ENSMUST00000082419.1_1_519_519,>mt-Nd6,519,519 227 | >mt-Nd5_ENSMUSG00000064367.1_ENSMUST00000082418.1_1_1824_1824,>mt-Nd5,1824,1824 228 | >mt-Nd4l_ENSMUSG00000065947.1_ENSMUST00000084013.1_1_297_297,>mt-Nd4l,297,297 229 | ... 
230 | ``` 231 | 232 | > longest_cds_trans_gencode.fa: 233 | 234 | ``` 235 | >Xkr4_ENSMUSG00000051951.6_ENSMUST00000070533.5_151_2094_3634 236 | GCGGCGGCGGGCGAGCGGGCGCTGGAGTAGGAGCTGGGGAGCGGCGCGGCCGGGGAAGGA 237 | AGCCAGGGCGAGGCGAGGAGGTGGCGGGAGGAGGAGACAGCAGGGACAGGTGTCAGATAA 238 | AGGAGTGCTCTCCTCCGCTGCCGAGGCATCATGGCCGCTAAGTCAGACGGGAGGCTGAAG 239 | ... 240 | ``` 241 | 242 | --- 243 | 244 | ## 4. Extract longest CDS region with longest transcript from gtf format annotation file based on ensembl/ucsc database. 245 | 246 | ### help information: 247 | 248 | ```shell 249 | $ GetCDSLongestFromGTF -h 250 | usage: GetCDSLongestFromGTF --database ensembl --gtffile Homo_sapiens.GRCh38.101.gtf.gz --genome Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz --outfile longest_cds_trans.fa 251 | 252 | Extract longest CDS regeion with longest transcript from gtf format annotation file based on ensembl/ucsc database. 253 | 254 | optional arguments: 255 | -h, --help show this help message and exit 256 | -v, --version show program's version number and exit 257 | -d databse, --database databse 258 | which annotation database you choose. (default="ensembl", ucsc/ensembl) 259 | -g gtffile, --gtffile gtffile 260 | input your GTF file with ".gz" format. 261 | -fa genome, --genome genome 262 | your genome fasta file matched with your GTF file with ".gz" format. (Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz) 263 | -o cdslongestfile, --outfile cdslongestfile 264 | output your longest transcript file. (longest_cds_trans.fa) 265 | 266 | Thank your for your support, if you have any questions or suggestions please contact me: 3219030654@stu.cpu.edu.cn. 267 | ``` 268 | 269 | ### usage: 270 | 271 | ```shell 272 | $ GetCDSLongestFromGTF --database ensembl --gtffile Homo_sapiens.GRCh38.103.gtf.gz --genome Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz --outfile longest_cds_trans_ensembl.fa 273 | Your job is running, please wait... 274 | Your job is done! 275 | Running with 152.38 seconds!
276 | ``` 277 | 278 | there will be four files produced including **CDS_longest_trans.gtf**, **All_transcripts_cds_info.csv**, **longest_cds_transcripts_info.csv**, **longest_cds_trans_ensembl.fa**. 279 | 280 | > CDS_longest_trans.gtf: 281 | 282 | ``` 283 | 1 ensembl_havana gene 65419 71585 . + . gene_id "ENSG00000186092"; gene_version "6"; gene_name "OR4F5"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; 284 | 1 havana transcript 65419 71585 . + . gene_id "ENSG00000186092"; gene_version "6"; transcript_id "ENST00000641515"; transcript_version "2"; gene_name "OR4F5"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F5-202"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; 285 | 1 havana exon 65419 65433 . + . gene_id "ENSG00000186092"; gene_version "6"; transcript_id "ENST00000641515"; transcript_version "2"; exon_number "1"; gene_name "OR4F5"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F5-202"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00003812156"; exon_version "1"; tag "basic"; 286 | 1 havana exon 65520 65573 . + . gene_id "ENSG00000186092"; gene_version "6"; transcript_id "ENST00000641515"; transcript_version "2"; exon_number "2"; gene_name "OR4F5"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F5-202"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00003813641"; exon_version "1"; tag "basic"; 287 | 1 havana CDS 65565 65573 . + 0 gene_id "ENSG00000186092"; gene_version "6"; transcript_id "ENST00000641515"; transcript_version "2"; exon_number "2"; gene_name "OR4F5"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F5-202"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000493376"; protein_version "2"; tag "basic"; 288 | 1 havana start_codon 65565 65567 . 
+ 0 gene_id "ENSG00000186092"; gene_version "6"; transcript_id "ENST00000641515"; transcript_version "2"; exon_number "2"; gene_name "OR4F5"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F5-202"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; 289 | 1 havana exon 69037 71585 . + . gene_id "ENSG00000186092"; gene_version "6"; transcript_id "ENST00000641515"; transcript_version "2"; exon_number "3"; gene_name "OR4F5"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F5-202"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00003813949"; exon_version "1"; tag "basic"; 290 | 1 havana CDS 69037 70005 . + 0 gene_id "ENSG00000186092"; gene_version "6"; transcript_id "ENST00000641515"; transcript_version "2"; exon_number "3"; gene_name "OR4F5"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F5-202"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000493376"; protein_version "2"; tag "basic"; 291 | 1 havana stop_codon 70006 70008 . + 0 gene_id "ENSG00000186092"; gene_version "6"; transcript_id "ENST00000641515"; transcript_version "2"; exon_number "3"; gene_name "OR4F5"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F5-202"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; 292 | 1 havana five_prime_utr 65419 65433 . + . gene_id "ENSG00000186092"; gene_version "6"; transcript_id "ENST00000641515"; transcript_version "2"; gene_name "OR4F5"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F5-202"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; 293 | 1 havana five_prime_utr 65520 65564 . + . 
gene_id "ENSG00000186092"; gene_version "6"; transcript_id "ENST00000641515"; transcript_version "2"; gene_name "OR4F5"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F5-202"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; 294 | 1 havana three_prime_utr 70009 71585 . + . gene_id "ENSG00000186092"; gene_version "6"; transcript_id "ENST00000641515"; transcript_version "2"; gene_name "OR4F5"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F5-202"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; 295 | 1 ensembl_havana gene 450740 451678 . - . gene_id "ENSG00000284733"; gene_version "2"; gene_name "OR4F29"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; 296 | ... 297 | ``` 298 | 299 | > All_transcripts_cds_info.csv: 300 | 301 | this is the all transcripts cds and exon length information. 302 | 303 | ``` 304 | cdslength,ID,translength,utr5length,gene_name 305 | 2709,ZZZ3_ENSG00000036549_ENST00000370801,6412,476,ZZZ3 306 | 1227,ZZZ3_ENSG00000036549_ENST00000370798,2468,486,ZZZ3 307 | 173,ZZZ3_ENSG00000036549_ENST00000433749,603,430,ZZZ3 308 | ... 309 | ``` 310 | 311 | > longest_cds_transcripts_info.csv: 312 | 313 | ``` 314 | cdslength,ID,translength,utr5length,gene_name 315 | 2709,ZZZ3_ENSG00000036549_ENST00000370801,6412,476,ZZZ3 316 | 8883,ZZEF1_ENSG00000074755_ENST00000381638,11466,135,ZZEF1 317 | 1716,ZYX_ENSG00000159840_ENST00000322764,2228,80,ZYX 318 | ... 319 | ``` 320 | 321 | > longest_cds_trans_gencode.fa: 322 | 323 | ``` 324 | >OR4F5_ENSG00000186092_ENST00000641515_61_1038_2618 325 | CCCAGATCTCTTCAGTTTTTATGCCTCATTCTGTGAAAATTGCTGTAGTCTCTTCCAGTT 326 | ATGAAGAAGGTAACTGCAGAGGCTATTTCCTGGAATGAATCAACGAGTGAAACGAATAAC 327 | TCTATGGTGACTGAATTCATTTTTCTGGGTCTCTCTGATTCTCAGGAACTCCAGACCTTC 328 | ... 
329 | ``` 330 | 331 | for ucsc: 332 | 333 | ```shell 334 | $ GetCDSLongestFromGTF --database ucsc --gtffile hg19.ncbiRefSeq.gtf.gz --genome hg19.fa.gz --outfile longest_cds_trans_ucsc.fa 335 | ``` 336 | 337 | --- 338 | 339 | ## END 340 | 341 | > Thank you for your support, if you have any questions or suggestions please contact me: 3219030654@stu.cpu.edu.cn. -------------------------------------------------------------------------------- /dist/GetTransTool-0.0.3-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junjunlab/GetTransTool/a7ce0db99821457e258f2b754c2ca4fef0e11dfa/dist/GetTransTool-0.0.3-py3-none-any.whl -------------------------------------------------------------------------------- /dist/GetTransTool-0.0.3.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junjunlab/GetTransTool/a7ce0db99821457e258f2b754c2ca4fef0e11dfa/dist/GetTransTool-0.0.3.tar.gz -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools>=42", 4 | "wheel" 5 | ] 6 | build-backend = "setuptools.build_meta" -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = GetTransTool 3 | version = 0.0.3 4 | author = laojunjun 5 | author_email = 3219030654@stu.cpu.edu.cn 6 | description = Extract longest transcript or longest CDS transcript from GTF annotation file or gencode transcripts fasta file.
7 | long_description = file: README.md 8 | long_description_content_type = text/markdown 9 | url = https://github.com/junjunlab/GetTransTool 10 | project_urls = 11 | Bug Tracker = https://github.com/junjunlab/GetTransTool/issues 12 | classifiers = 13 | Programming Language :: Python :: 3 14 | License :: OSI Approved :: MIT License 15 | Operating System :: OS Independent 16 | 17 | [options] 18 | package_dir = 19 | = src 20 | packages = find: 21 | python_requires = >=3.6 22 | 23 | [options.packages.find] 24 | where = src 25 | 26 | [options.entry_points] 27 | console_scripts = 28 | GetLongestTransFromGencode = GetTransTool.GetLongestTransFromGencode:main 29 | GetLongestTransFromGTF = GetTransTool.GetLongestTransFromGTF:main 30 | GetCDSLongestFromGencode = GetTransTool.GetCDSLongestFromGencode:main 31 | GetCDSLongestFromGTF = GetTransTool.GetCDSLongestFromGTF:main -------------------------------------------------------------------------------- /src/GetTransTool/GetCDSLongestFromGTF.py: -------------------------------------------------------------------------------- 1 | def main(): 2 | """ 3 | Extract longest CDS regeion with longest transcript from gtf format annotation file based on ensembl/ucsc database. 
4 | """ 5 | 6 | # 导入模块 7 | import pandas as pd 8 | import gzip 9 | import time 10 | import warnings 11 | import argparse 12 | from pyfaidx import Fasta 13 | 14 | warnings.filterwarnings('ignore') 15 | 16 | parser = argparse.ArgumentParser(usage="GetCDSLongestFromGTF --database ensembl --gtffile Homo_sapiens.GRCh38.101.gtf.gz --genome Homo_sapiens.GRCh38.dna.primary_assembly.fa --outfile longest_cds_trans.fa", 17 | description="Extract longest CDS regeion with longest transcript from gtf format annotation file based on ensembl/ucsc database.", 18 | epilog="Thank your for your support, if you have any questions or suggestions please contact me: 3219030654@stu.cpu.edu.cn.") 19 | # version 20 | parser.add_argument('-v','--version', action='version', version='%(prog)s 0.0.7') 21 | # 读取注释类型文件 22 | parser.add_argument('-d','--database',type=str,action="store",dest="database",metavar="databse",choices=['ucsc','ensembl'],default="ensembl", 23 | help='which annotation database you choose. (default="ensembl", ucsc/ensembl)') 24 | # 读取gtf文件 25 | parser.add_argument('-g','--gtffile', type=str,action="store",dest="gtffile",metavar="gtffile", 26 | help='input your GTF file with ".gz" format.') 27 | # 读取基因组fasta文件 28 | parser.add_argument('-fa','--genome',type=str,action="store",dest="genome",metavar="genome", 29 | help='your genome fasta file matched with your GTF file with ".fa/.fatsa" format. (Homo_sapiens.GRCh38.dna.primary_assembly.fa)') 30 | # 导出文件名称 31 | parser.add_argument('-o','--outfile', type=str,action="store",dest="cdslongestfile",metavar="cdslongestfile", 32 | help='output your longest transcript file. 
(longest_cds_trans.fa)') 33 | # 解析参数 34 | args = parser.parse_args() 35 | 36 | # 获取参数 37 | db = args.database 38 | gtffile = args.gtffile 39 | genomefile = args.genome 40 | outfile = args.cdslongestfile 41 | 42 | # main fuction 43 | print("Your job is running, please wait...") 44 | ###################################################################### 45 | job_start = time.time() 46 | ################################################################################################ 47 | if db == 'ensembl': 48 | # 打开 gtf 文件 49 | with gzip.open(gtffile,'rt') as gtf: 50 | # 信息保存在字典里 51 | trans_len = {} 52 | utr5_len = {} 53 | cds_len = {} 54 | for line in gtf: 55 | # 跳过注释行 56 | if line.startswith('#'): 57 | continue 58 | # 分割 59 | fields = line.split() 60 | # 类型 61 | type = fields[2] 62 | if len(fields) > 24: 63 | biotype = fields[23].replace('"','').replace(';','') 64 | utr_biotype = fields[21].replace('"','').replace(';','') 65 | if biotype == 'protein_coding' and type == 'exon': 66 | # 名称 67 | gene_name = fields[19].replace('"','').replace(';','') 68 | gene_id = fields[9].replace('"','').replace(';','') 69 | trans_id = fields[13].replace('"','').replace(';','') 70 | # 连接名称 71 | key = '|'.join([gene_name,gene_id,trans_id]) 72 | # 计算多个外显子长度 73 | length = int(fields[4]) - int(fields[3]) + 1 74 | # 累计求和 75 | trans_len.setdefault(key,0) 76 | trans_len[key] += length 77 | elif biotype == 'protein_coding' and type == 'CDS': 78 | # 名称 79 | gene_name = fields[19].replace('"','').replace(';','') 80 | gene_id = fields[9].replace('"','').replace(';','') 81 | trans_id = fields[13].replace('"','').replace(';','') 82 | # 连接名称 83 | key = '|'.join([gene_name,gene_id,trans_id]) 84 | # 计算多个CDS长度 85 | length = int(fields[4]) - int(fields[3]) + 1 86 | # 累计求和 87 | cds_len.setdefault(key,0) 88 | cds_len[key] += length 89 | elif utr_biotype == 'protein_coding': 90 | # 名称 91 | gene_name = fields[17].replace('"','').replace(';','') 92 | gene_id = fields[9].replace('"','').replace(';','') 93 | 
trans_id = fields[13].replace('"','').replace(';','') 94 | # 连接名称 95 | key = '|'.join([gene_name,gene_id,trans_id]) 96 | if type == 'five_prime_utr': 97 | # 计算多个5'UTR长度 98 | length = int(fields[4]) - int(fields[3]) + 1 99 | # 累计求和 100 | utr5_len.setdefault(key,0) 101 | utr5_len[key] += length 102 | else: 103 | # 若无则为 0 104 | utr5_len.setdefault(key,0) 105 | else: 106 | pass 107 | 108 | # transorm into dataframe and merge by id 109 | df_tran = pd.DataFrame.from_dict(trans_len,orient='index',columns=['translength']) 110 | df_tran['ID'] = df_tran.index 111 | 112 | df_cds = pd.DataFrame.from_dict(cds_len,orient='index',columns=['cdslength']) 113 | df_cds['ID'] = df_cds.index 114 | 115 | df_5utr = pd.DataFrame.from_dict(utr5_len,orient='index',columns=['utr5length']) 116 | df_5utr['ID'] = df_5utr.index 117 | 118 | # 按id合并表格 119 | data_info = df_cds.merge(df_tran.merge(df_5utr,on='ID'),on='ID') 120 | 121 | # 添加基因名列 122 | data_info['gene_name'] = [i.split(sep='|')[0] for i in data_info['ID']] 123 | 124 | # 按gen_name cdslength translength 降序排序 125 | data_infonew = data_info.sort_values(by = ['gene_name','cdslength','translength'],ascending = False,inplace=False) 126 | 127 | # order columns 128 | data_infonew = data_infonew[['ID','gene_name','translength','utr5length','cdslength']] 129 | 130 | # 保存 131 | data_infonew.to_csv(r'All_transcripts_cds_info.csv', index=False) 132 | 133 | ############################ 134 | # 筛选最长转录本id 135 | longest_id = list(data_infonew.drop_duplicates(subset=['gene_name'],keep='first')['ID']) 136 | 137 | # 筛选最长转录本表格 138 | longest_data = data_infonew.loc[data_infonew.ID.isin(longest_id)] 139 | 140 | # 保存 141 | longest_data.to_csv(r'longest_cds_transcripts_info.csv', index=False) 142 | 143 | # 给 ID 添加 CDS 位置信息和转录本长度信息 144 | longest_data['ID'] = longest_data.ID + '|' + (longest_data.utr5length + 1).map(str) + '|' + \ 145 | (longest_data.utr5length + longest_data.cdslength).map(str) + '|' + \ 146 | (longest_data.translength).map(str) 147 | 148 | # 
order columns 149 | longest_data = longest_data[['ID','gene_name','translength','utr5length','cdslength']] 150 | 151 | # 储存最长转录本id 152 | transid = {line.split(sep='|')[2]:line for line in list(longest_data.ID)} 153 | 154 | infolist = [] 155 | with gzip.open(gtffile,'rt') as gtf: 156 | for line in gtf: 157 | # skip 158 | if line.startswith('#'): 159 | continue 160 | # split 161 | fields = line.split() 162 | # feature type 163 | type = fields[2] 164 | if type == 'exon': 165 | # pos 166 | chr = fields[0] 167 | start = fields[3] 168 | end = fields[4] 169 | strand = fields[6] 170 | # name 171 | gene_name = fields[19].replace('"','').replace(';','') 172 | gene_id = fields[9].replace('"','').replace(';','') 173 | trans_id = fields[13].replace('"','').replace(';','') 174 | if trans_id in transid: 175 | infolist.append([chr,start,end,strand,type,gene_name,gene_id,trans_id,transid[trans_id]]) 176 | else: 177 | pass 178 | else: 179 | pass 180 | 181 | # to dataframe 182 | dfinfo = pd.DataFrame(infolist,columns=['chr','start','end','strand','type','gene_name','gene_id','trans_id','id']) 183 | dfinfo_1_strand = dfinfo[dfinfo['strand'] == '+'] 184 | 185 | # descrese coord by - strand gene 186 | dfinfo_2_strand = dfinfo[dfinfo['strand'] == '-'] 187 | dfinfo_2_strand = dfinfo_2_strand.sort_values(by = ['trans_id','start','end'],ascending = False) 188 | 189 | # merge 190 | df_fianl = pd.concat([dfinfo_1_strand,dfinfo_2_strand],axis=0) 191 | 192 | ############################################################# 193 | # extact sequnece from genome 194 | 195 | # load genome 196 | genome = Fasta(genomefile) 197 | 198 | # chrmosome info 199 | chrmosome_list = genome.keys() 200 | 201 | # save in dict 202 | res = {} 203 | for line in range(0,df_fianl.shape[0]): 204 | 205 | # chromosome strand 206 | fileds = df_fianl.iloc[line] 207 | chrom = fileds['chr'] 208 | strand = fileds['strand'] 209 | start = int(fileds['start']) 210 | end = int(fileds['end']) 211 | # key 212 | key = fileds['id'] 213 | 
# filter chromoseome 214 | if chrom in chrmosome_list: 215 | # extarct sequence 216 | if strand == '+': 217 | seq = genome[chrom][(start-1):end].seq 218 | elif strand == '-': 219 | seq = genome[chrom][(start-1):end].complement.reverse.seq 220 | else: 221 | pass 222 | # save in dict 223 | res.setdefault(key,'') 224 | res[key] += seq 225 | else: 226 | pass 227 | 228 | ############################################################# 229 | # 输出序列 230 | outputfile = open(outfile,'w') 231 | 232 | # fasta序列分割长度 233 | my_length = 60 234 | 235 | # 输出 236 | for key,val in res.items(): 237 | outputfile.write('>' + key + '\n') 238 | while len(val) > my_length: 239 | outputfile.write(val[0:my_length] + '\n') 240 | val = val[my_length:len(val)] 241 | outputfile.write(val + '\n') 242 | 243 | # 关闭文件 244 | outputfile.close() 245 | 246 | ################################################################################################ 247 | elif db == 'ucsc': 248 | # 打开 gtf 文件 249 | with gzip.open(gtffile,'rt') as gtf: 250 | # 信息保存在字典里 251 | trans_len = {} 252 | utr5_len = {} 253 | cds_len = {} 254 | for line in gtf: 255 | # 跳过注释行 256 | if line.startswith('#'): 257 | continue 258 | # 分割 259 | fields = line.split() 260 | # 类型 261 | type = fields[2] 262 | if type == 'exon': 263 | # 名称 264 | gene_name = fields[17].replace('"','').replace(';','') 265 | gene_id = fields[9].replace('"','').replace(';','') 266 | trans_id = fields[11].replace('"','').replace(';','') 267 | # 连接名称 268 | key = '|'.join([gene_name,gene_id,trans_id]) 269 | # 计算多个外显子长度 270 | length = int(fields[4]) - int(fields[3]) + 1 271 | # 累计求和 272 | trans_len.setdefault(key,0) 273 | trans_len[key] += length 274 | elif type == 'CDS': 275 | # 名称 276 | gene_name = fields[17].replace('"','').replace(';','') 277 | gene_id = fields[9].replace('"','').replace(';','') 278 | trans_id = fields[11].replace('"','').replace(';','') 279 | # 连接名称 280 | key = '|'.join([gene_name,gene_id,trans_id]) 281 | # 计算多个CDS长度 282 | length = int(fields[4]) 
- int(fields[3]) + 1 283 | # 累计求和 284 | cds_len.setdefault(key,0) 285 | cds_len[key] += length 286 | elif type == '5UTR': 287 | # 名称 288 | gene_name = fields[17].replace('"','').replace(';','') 289 | gene_id = fields[9].replace('"','').replace(';','') 290 | trans_id = fields[11].replace('"','').replace(';','') 291 | # 连接名称 292 | key = '|'.join([gene_name,gene_id,trans_id]) 293 | # 计算多个5'UTR长度 294 | length = int(fields[4]) - int(fields[3]) + 1 295 | # 累计求和 296 | utr5_len.setdefault(key,0) 297 | utr5_len[key] += length 298 | else: 299 | pass 300 | else: 301 | pass 302 | 303 | # fillwith no 5UTR genes 304 | new_utr5_len = {key:utr5_len.get(key,0) for key,val in cds_len.items()} 305 | 306 | # transorm into dataframe and merge by id 307 | df_tran = pd.DataFrame.from_dict(trans_len,orient='index',columns=['translength']) 308 | df_tran['ID'] = df_tran.index 309 | 310 | df_cds = pd.DataFrame.from_dict(cds_len,orient='index',columns=['cdslength']) 311 | df_cds['ID'] = df_cds.index 312 | 313 | df_5utr = pd.DataFrame.from_dict(new_utr5_len,orient='index',columns=['utr5length']) 314 | df_5utr['ID'] = df_5utr.index 315 | 316 | # 按id合并表格 317 | data_info = df_cds.merge(df_tran.merge(df_5utr,on='ID'),on='ID') 318 | 319 | # 添加基因名列 320 | data_info['gene_name'] = [i.split(sep='|')[0] for i in data_info['ID']] 321 | 322 | # 按gen_name cdslength translength 降序排序 323 | data_infonew = data_info.sort_values(by = ['gene_name','cdslength','translength'],ascending = False,inplace=False) 324 | 325 | # order columns 326 | data_infonew = data_infonew[['ID','gene_name','translength','utr5length','cdslength']] 327 | 328 | # 保存 329 | data_infonew.to_csv(r'All_transcripts_cds_info.csv', index=False) 330 | 331 | ############################ 332 | # 筛选最长转录本id 333 | longest_id = list(data_infonew.drop_duplicates(subset=['gene_name'],keep='first')['ID']) 334 | 335 | # 筛选最长转录本表格 336 | longest_data = data_infonew.loc[data_infonew.ID.isin(longest_id)] 337 | 338 | # 保存 339 | 
longest_data.to_csv(r'longest_cds_transcripts_info.csv', index=False) 340 | 341 | # 给 ID 添加 CDS 位置信息和转录本长度信息 342 | longest_data['ID'] = longest_data.ID + '|' + (longest_data.utr5length + 1).map(str) + '|' + \ 343 | (longest_data.utr5length + longest_data.cdslength).map(str) + '|' + \ 344 | (longest_data.translength).map(str) 345 | 346 | # order columns 347 | longest_data = longest_data[['ID','gene_name','translength','utr5length','cdslength']] 348 | 349 | ############################################################# 350 | # 储存最长转录本id 351 | transid = {line.split(sep='_')[2]:line for line in list(longest_data.ID)} 352 | 353 | infolist = [] 354 | with gzip.open('Homo_sapiens.GRCh38.103.gtf.gz','rt') as gtf: 355 | for line in gtf: 356 | # skip 357 | if line.startswith('#'): 358 | continue 359 | # split 360 | fields = line.split() 361 | # feature type 362 | type = fields[2] 363 | if type == 'exon': 364 | # pos 365 | chr = fields[0] 366 | start = fields[3] 367 | end = fields[4] 368 | strand = fields[6] 369 | # name 370 | gene_name = fields[17].replace('"','').replace(';','') 371 | gene_id = fields[9].replace('"','').replace(';','') 372 | trans_id = fields[11].replace('"','').replace(';','') 373 | if trans_id in transid: 374 | infolist.append([chr,start,end,strand,type,gene_name,gene_id,trans_id,transid[trans_id]]) 375 | else: 376 | pass 377 | else: 378 | pass 379 | 380 | # to dataframe 381 | dfinfo = pd.DataFrame(infolist,columns=['chr','start','end','strand','type','gene_name','gene_id','trans_id','id']) 382 | dfinfo_1_strand = dfinfo[dfinfo['strand'] == '+'] 383 | 384 | # descrese coord by - strand gene 385 | dfinfo_2_strand = dfinfo[dfinfo['strand'] == '-'] 386 | dfinfo_2_strand = dfinfo_2_strand.sort_values(by = ['trans_id','start','end'],ascending = False) 387 | 388 | # merge 389 | df_fianl = pd.concat([dfinfo_1_strand,dfinfo_2_strand],axis=0) 390 | 391 | ############################################################# 392 | # extact sequnece from genome 393 | 394 | # 
load genome 395 | genome = Fasta(genomefile) 396 | 397 | # chrmosome info 398 | chrmosome_list = genome.keys() 399 | 400 | # save in dict 401 | res = {} 402 | for line in range(0,df_fianl.shape[0]): 403 | 404 | # chromosome strand 405 | fileds = df_fianl.iloc[line] 406 | chrom = fileds['chr'] 407 | strand = fileds['strand'] 408 | start = int(fileds['start']) 409 | end = int(fileds['end']) 410 | # key 411 | key = fileds['id'] 412 | # filter chromoseome 413 | if chrom in chrmosome_list: 414 | # extarct sequence 415 | if strand == '+': 416 | seq = genome[chrom][(start-1):end].seq 417 | elif strand == '-': 418 | seq = genome[chrom][(start-1):end].complement.reverse.seq 419 | else: 420 | pass 421 | # save in dict 422 | res.setdefault(key,'') 423 | res[key] += seq 424 | else: 425 | pass 426 | 427 | ############################################################# 428 | # 输出序列 429 | outputfile = open(outfile,'w') 430 | 431 | # fasta序列分割长度 432 | my_length = 60 433 | 434 | # 输出 435 | for key,val in res.items(): 436 | outputfile.write('>' + key + '\n') 437 | while len(val) > my_length: 438 | outputfile.write(val[0:my_length] + '\n') 439 | val = val[my_length:len(val)] 440 | outputfile.write(val + '\n') 441 | 442 | # 关闭文件 443 | outputfile.close() 444 | 445 | #################################################################################### 446 | job_stop = time.time() 447 | print("Your job is done! ") 448 | print("Running with " + str(round(job_stop - job_start,2)) + " seconds!") 449 | 450 | if __name__=="__main__": 451 | main() -------------------------------------------------------------------------------- /src/GetTransTool/GetCDSLongestFromGencode.py: -------------------------------------------------------------------------------- 1 | def main(): 2 | """ 3 | Extract longest CDS regeion with longest transcript from gencode database transcripts fasta file. 4 | Only for human and mouse transcripts fasta file. 
5 | """ 6 | 7 | # 导入模块 8 | import pandas as pd 9 | import gzip 10 | import time 11 | import argparse 12 | 13 | parser = argparse.ArgumentParser(usage="GetCDSLongestFromGencode --file gencode.vM28.pc_transcripts.fa.gz --outfile longest_cds_trans.fa", 14 | description="Extract longest CDS regeion with longest transcript from gencode database transcripts fasta file.", 15 | epilog="Thank your for your support, if you have any questions or suggestions please contact me: 3219030654@stu.cpu.edu.cn.") 16 | parser.add_argument('-v','--version', action='version', version='%(prog)s 0.0.3') 17 | 18 | # 读取转录本文件 19 | parser.add_argument('-f','--file', type=str,action="store",dest="transfile",metavar="transfile",help='input your protein-coding transcripts file with ".gz" format. (gencode.vM28.pc_transcripts.fa.gz)') 20 | # 导出文件名称 21 | parser.add_argument('-o','--outfile', type=str,action="store",dest="longestfile",metavar="longestfile",help='output your longest transcript file. (longest_cds_trans.fa)') 22 | # 解析参数 23 | args = parser.parse_args() 24 | 25 | # 获取参数 26 | inputfile = args.transfile 27 | outfile = args.longestfile 28 | 29 | # main fuction 30 | print("Your job is running, please wait...") 31 | ###################################################################### 32 | job_start = time.time() 33 | ####################### 34 | # 储存id 35 | cdsinfo = {} 36 | 37 | # 储存改名文件 38 | tmpfile = open('name_changed.fa','w') 39 | 40 | # main code 41 | with gzip.open(inputfile,'rt') as pc: 42 | for line in pc: 43 | if line.startswith('>'): 44 | # split id 45 | fileds = line.split(sep='|') 46 | gene_name = '>' + fileds[5] 47 | gene_id = fileds[1] 48 | trans_id = fileds[0].replace('>','') 49 | trans_length = fileds[6] 50 | # include 5UTR+CDS+3UTR or 5UTR+CDS/ 51 | if fileds[8].startswith('CDS'): 52 | cds_range = fileds[8].split(sep=':')[1].split(sep='-') 53 | cds_start = cds_range[0] 54 | cds_end = cds_range[1] 55 | # cds length 56 | cds_len = int(cds_end) - int(cds_start) + 1 57 | 
fullname = '|'.join([gene_name,gene_id,trans_id,cds_start,cds_end,trans_length]) 58 | # save 59 | tmpfile.write(fullname + '\n') 60 | # save in dict 61 | cdsinfo[fullname] = str(cds_len) 62 | # include CDS+3UTR or only CDS 63 | else: 64 | cds_range = fileds[7].split(sep=':')[1].split(sep='-') 65 | cds_start = cds_range[0] 66 | cds_end = cds_range[1] 67 | # cds length 68 | cds_len = int(cds_end) - int(cds_start) + 1 69 | fullname = '|'.join([gene_name,gene_id,trans_id,cds_start,cds_end,trans_length]) 70 | # save 71 | tmpfile.write(fullname + '\n') 72 | # save in dict 73 | cdsinfo[fullname] = str(cds_len) 74 | else: 75 | # write seq 76 | tmpfile.write(line) 77 | 78 | # close file 79 | tmpfile.close() 80 | 81 | ################################################# 82 | # transform into datafarme 83 | tmp = [[key,key.split(sep='|')[0],int(key.split(sep='|')[5]),int(val)] for key,val in cdsinfo.items()] 84 | 85 | # 转为数据框 86 | data_info = pd.DataFrame(tmp,columns=['fullname','gene_name','translength','cdslength']) 87 | 88 | # 按gen_name cdslength translength 降序排序 89 | data_infonew = data_info.sort_values(by = ['gene_name','cdslength','translength'],ascending = False,inplace=False) 90 | 91 | # 保存 92 | data_infonew.to_csv(r'All_transcripts_cds_info.csv', index=False) 93 | 94 | # 筛选最长转录本id 95 | longest_id = list(data_infonew.drop_duplicates(subset=['gene_name'],keep='first')['fullname']) 96 | 97 | # 筛选最长转录本表格 98 | longest_data = data_infonew.loc[data_infonew.fullname.isin(longest_id)] 99 | 100 | # 保存 101 | longest_data.to_csv(r'longest_cds_transcripts_info.csv', index=False) 102 | 103 | ################################################# 104 | # prepare filter id 105 | filter_id = {id:0 for id in list(longest_data.fullname)} 106 | 107 | # 读取 fasta 文件保存为字典 108 | with open('name_changed.fa') as fa: 109 | fa_dict = {} 110 | for line in fa: 111 | if line.startswith('>'): 112 | seq_name = line.strip() 113 | fa_dict[seq_name] = '' 114 | else: 115 | # 序列 116 | fa_dict[seq_name] += 
line.replace('\n','') 117 | 118 | # 新建输出结果文件 119 | output_fa = open(outfile,'w') 120 | 121 | # fasta序列分割长度 122 | my_length = 60 123 | 124 | # 输出 125 | for key,val in fa_dict.items(): 126 | if key in filter_id: 127 | output_fa.write(key + '\n') 128 | while len(val) > my_length: 129 | output_fa.write(val[0:my_length] + '\n') 130 | val = val[my_length:len(val)] 131 | output_fa.write(val + '\n') 132 | 133 | # 关闭文件 134 | output_fa.close() 135 | 136 | #################################################################################### 137 | job_stop = time.time() 138 | print("Your job is done! ") 139 | print("Running with " + str(round(job_stop - job_start,2)) + " seconds!") 140 | 141 | if __name__=="__main__": 142 | main() -------------------------------------------------------------------------------- /src/GetTransTool/GetLongestTransFromGTF.py: -------------------------------------------------------------------------------- 1 | def main(): 2 | """ 3 | Extract longest transcript from gtf format annotation file based on gencode/ensembl/ucsc database. 4 | """ 5 | 6 | # 导入模块 7 | import pandas as pd 8 | import gzip 9 | import time 10 | import argparse 11 | from pyfaidx import Fasta 12 | 13 | parser = argparse.ArgumentParser(usage="GetLongestTransFromGTF --database ensembl --gtffile Homo_sapiens.GRCh38.101.gtf.gz --genome Homo_sapiens.GRCh38.dna.primary_assembly.fa --outfile longest_trans.fa", 14 | description="Extract longest transcript from gtf format annotation file based on gencode/ensembl/ucsc database.", 15 | epilog="Thank your for your support, if you have any questions or suggestions please contact me: 3219030654@stu.cpu.edu.cn.") 16 | # version 17 | parser.add_argument('-v','--version', action='version', version='%(prog)s 0.0.7') 18 | # 读取注释类型文件 19 | parser.add_argument('-d','--database',type=str,action="store",dest="database",metavar="databse",choices=['ucsc','ensembl','gencode'],default="ensembl", 20 | help='which annotation database you choose. 
(default="ensembl", ucsc/ensembl/gencode)') 21 | # 读取gtf文件 22 | parser.add_argument('-g','--gtffile', type=str,action="store",dest="gtffile",metavar="gtffile", 23 | help='input your GTF file with ".gz" format.') 24 | # 读取基因组fasta文件 25 | parser.add_argument('-fa','--genome',type=str,action="store",dest="genome",metavar="genome", 26 | help='your genome fasta file matched with your GTF file with ".fa/.fasta" format. (Homo_sapiens.GRCh38.dna.primary_assembly.fa)') 27 | # 导出文件名称 28 | parser.add_argument('-o','--outfile', type=str,action="store",dest="longestfile",metavar="longestfile", 29 | help='output your longest transcript file. (longest_trans.fa)') 30 | # 解析参数 31 | args = parser.parse_args() 32 | 33 | # 获取参数 34 | db = args.database 35 | gtffile = args.gtffile 36 | genomefile = args.genome 37 | outfile = args.longestfile 38 | 39 | # main fuction 40 | print("Your job is running, please wait...") 41 | ###################################################################### 42 | job_start = time.time() 43 | ####################### 44 | if db == 'ensembl': 45 | #################################################################################### 46 | # 信息保存在字典里 47 | info = {} 48 | # 打开测试 gtf 文件 49 | with gzip.open(gtffile,'rt') as gtf: 50 | for line in gtf: 51 | # 跳过注释行 52 | if line.startswith('#'): 53 | continue 54 | # 分割 55 | fields = line.split() 56 | # 类型 57 | type = fields[2] 58 | if type == 'exon': 59 | # 名称 60 | gene_name = fields[19].replace('"','').replace(';','') 61 | gene_id = fields[9].replace('"','').replace(';','') 62 | trans_id = fields[13].replace('"','').replace(';','') 63 | biotype = fields[23].replace('"','').replace(';','') 64 | # 连接名称 65 | key = '|'.join([gene_name,gene_id,trans_id,biotype]) 66 | # 计算多个外显子长度 67 | start = int(fields[3]) 68 | end = int(fields[4]) 69 | length = end - start + 1 70 | # 累计求和 71 | info.setdefault(key,0) 72 | info[key] += length 73 | ###################################### 74 | # 转为数据框 75 | res = pd.DataFrame(pd.Series(info), 
columns = ['transcript_length']) 76 | 77 | # 添加基因名列 78 | res['gene_name'] = [line.split(sep='|')[0] for line in list(res.index[:])] 79 | 80 | # 排序 81 | res_sorted = res.sort_values(by = ['gene_name','transcript_length'],ascending=False) 82 | 83 | # 筛选最长转录本id 84 | longest_id = res_sorted.drop_duplicates(subset=['gene_name'],keep='first').index.values.tolist() 85 | 86 | # 筛选最长转录本表格 87 | longest_data = res_sorted.loc[res_sorted.index.isin(longest_id)] 88 | 89 | longest_data['ID'] = longest_data.index 90 | # order columns 91 | longest_data = longest_data[['ID','gene_name','transcript_length']] 92 | 93 | # 保存 94 | longest_data.to_csv(r'longest_transcripts_info.csv', index=False) 95 | 96 | ########################################################### 97 | # 储存最长转录本id 98 | transid = {line.split(sep='_')[2]:line for line in list(longest_data.ID)} 99 | 100 | infolist = [] 101 | with gzip.open(gtffile) as gtf: 102 | for line in gtf: 103 | # skip 104 | if line.startswith('#'): 105 | continue 106 | # split 107 | fields = line.split() 108 | # feature type 109 | type = fields[2] 110 | if type == 'exon': 111 | # pos 112 | chr = fields[0] 113 | start = fields[3] 114 | end = fields[4] 115 | strand = fields[6] 116 | # name 117 | gene_name = fields[19].replace('"','').replace(';','') 118 | gene_id = fields[9].replace('"','').replace(';','') 119 | trans_id = fields[13].replace('"','').replace(';','') 120 | if trans_id in transid: 121 | infolist.append([chr,start,end,strand,type,gene_name,gene_id,trans_id,transid[trans_id]]) 122 | else: 123 | pass 124 | else: 125 | pass 126 | 127 | # to dataframe 128 | dfinfo = pd.DataFrame(infolist,columns=['chr','start','end','strand','type','gene_name','gene_id','trans_id','id']) 129 | dfinfo_1_strand = dfinfo[dfinfo['strand'] == '+'] 130 | 131 | # descrese coord by - strand gene 132 | dfinfo_2_strand = dfinfo[dfinfo['strand'] == '-'] 133 | dfinfo_2_strand = dfinfo_2_strand.sort_values(by = ['trans_id','start','end'],ascending = False) 134 | 135 | # 
merge 136 | df_fianl = pd.concat([dfinfo_1_strand,dfinfo_2_strand],axis=0) 137 | 138 | ########################################################### 139 | # extact sequnece from genome 140 | 141 | # load genome 142 | genome = Fasta(genomefile) 143 | 144 | # chrmosome info 145 | chrmosome_list = genome.keys() 146 | 147 | # save in dict 148 | res = {} 149 | for line in range(0,df_fianl.shape[0]): 150 | 151 | # chromosome strand 152 | fileds = df_fianl.iloc[line] 153 | chrom = fileds['chr'] 154 | strand = fileds['strand'] 155 | start = int(fileds['start']) 156 | end = int(fileds['end']) 157 | # key 158 | key = fileds['id'] 159 | # filter chromoseome 160 | if chrom in chrmosome_list: 161 | # extarct sequence 162 | if strand == '+': 163 | seq = genome[chrom][(start-1):end].seq 164 | elif strand == '-': 165 | seq = genome[chrom][(start-1):end].complement.reverse.seq 166 | else: 167 | pass 168 | # save in dict 169 | res.setdefault(key,'') 170 | res[key] += seq 171 | else: 172 | pass 173 | 174 | ########################################################### 175 | # 输出序列 176 | outputfile = open(outfile,'w') 177 | 178 | # fasta序列分割长度 179 | my_length = 60 180 | ########################################################### 181 | # 输出 182 | for key,val in res.items(): 183 | outputfile.write('>' + key + '\n') 184 | while len(val) > my_length: 185 | outputfile.write(val[0:my_length] + '\n') 186 | val = val[my_length:len(val)] 187 | outputfile.write(val + '\n') 188 | 189 | # 关闭文件 190 | outputfile.close() 191 | #################################################################################### 192 | elif db == 'gencode': 193 | # 信息保存在字典里 194 | info = {} 195 | # 打开测试 gtf 文件 196 | with gzip.open(gtffile,'rt') as gtf: 197 | for line in gtf: 198 | # 跳过注释行 199 | if line.startswith('#'): 200 | continue 201 | # 分割 202 | fields = line.split() 203 | # 类型 204 | type = fields[2] 205 | if type == 'exon': 206 | # 名称 207 | gene_name = fields[15].replace('"','').replace(';','') 208 | gene_id = 
fields[9].replace('"','').replace(';','') 209 | trans_id = fields[11].replace('"','').replace(';','') 210 | biotype = fields[13].replace('"','').replace(';','') 211 | # 连接名称 212 | key = '|'.join([gene_name,gene_id,trans_id,biotype]) 213 | # 计算多个外显子长度 214 | start = int(fields[3]) 215 | end = int(fields[4]) 216 | length = end - start + 1 217 | # 累计求和 218 | info.setdefault(key,0) 219 | info[key] += length 220 | 221 | ###################################### 222 | # 转为数据框 223 | res = pd.DataFrame(pd.Series(info), columns = ['transcript_length']) 224 | 225 | # 添加基因名列 226 | res['gene_name'] = [line.split(sep='|')[0] for line in list(res.index[:])] 227 | 228 | # 排序 229 | res_sorted = res.sort_values(by = ['gene_name','transcript_length'],ascending=False) 230 | 231 | # 筛选最长转录本id 232 | longest_id = res_sorted.drop_duplicates(subset=['gene_name'],keep='first').index.values.tolist() 233 | 234 | # 筛选最长转录本表格 235 | longest_data = res_sorted.loc[res_sorted.index.isin(longest_id)] 236 | 237 | longest_data['ID'] = longest_data.index 238 | # order columns 239 | longest_data = longest_data[['ID','gene_name','transcript_length']] 240 | 241 | # 保存 242 | longest_data.to_csv(r'longest_transcripts_info.csv', index=True) 243 | 244 | ########################################################### 245 | # 储存最长转录本id 246 | transid = {line.split(sep='|')[2]:line for line in list(longest_data.ID)} 247 | 248 | infolist = [] 249 | with gzip.open(gtffile,'rt') as gtf: 250 | for line in gtf: 251 | # skip 252 | if line.startswith('#'): 253 | continue 254 | # split 255 | fields = line.split() 256 | # feature type 257 | type = fields[2] 258 | if type == 'exon': 259 | # pos 260 | chr = fields[0] 261 | start = fields[3] 262 | end = fields[4] 263 | strand = fields[6] 264 | # name 265 | gene_name = fields[15].replace('"','').replace(';','') 266 | gene_id = fields[9].replace('"','').replace(';','') 267 | trans_id = fields[11].replace('"','').replace(';','') 268 | if trans_id in transid: 269 | 
infolist.append([chr,start,end,strand,type,gene_name,gene_id,trans_id,transid[trans_id]]) 270 | else: 271 | pass 272 | else: 273 | pass 274 | 275 | # to dataframe 276 | dfinfo = pd.DataFrame(infolist,columns=['chr','start','end','strand','type','gene_name','gene_id','trans_id','id']) 277 | dfinfo_1_strand = dfinfo[dfinfo['strand'] == '+'] 278 | 279 | # descrese coord by - strand gene 280 | dfinfo_2_strand = dfinfo[dfinfo['strand'] == '-'] 281 | dfinfo_2_strand = dfinfo_2_strand.sort_values(by = ['trans_id','start','end'],ascending = False) 282 | 283 | # merge 284 | df_fianl = pd.concat([dfinfo_1_strand,dfinfo_2_strand],axis=0) 285 | ########################################################### 286 | 287 | # extact sequnece from genome 288 | 289 | # load genome 290 | genome = Fasta(genomefile) 291 | 292 | # chrmosome info 293 | chrmosome_list = genome.keys() 294 | 295 | # save in dict 296 | res = {} 297 | for line in range(0,df_fianl.shape[0]): 298 | 299 | # chromosome strand 300 | fileds = df_fianl.iloc[line] 301 | chrom = fileds['chr'] 302 | strand = fileds['strand'] 303 | start = int(fileds['start']) 304 | end = int(fileds['end']) 305 | # key 306 | key = fileds['id'] 307 | # filter chromoseome 308 | if chrom in chrmosome_list: 309 | # extarct sequence 310 | if strand == '+': 311 | seq = genome[chrom][(start-1):end].seq 312 | elif strand == '-': 313 | seq = genome[chrom][(start-1):end].complement.reverse.seq 314 | else: 315 | pass 316 | # save in dict 317 | res.setdefault(key,'') 318 | res[key] += seq 319 | else: 320 | pass 321 | 322 | ########################################################### 323 | # 输出序列 324 | outputfile = open(outfile,'w') 325 | 326 | # fasta序列分割长度 327 | my_length = 60 328 | 329 | # 输出 330 | for key,val in res.items(): 331 | outputfile.write('>' + key + '\n') 332 | while len(val) > my_length: 333 | outputfile.write(val[0:my_length] + '\n') 334 | val = val[my_length:len(val)] 335 | outputfile.write(val + '\n') 336 | 337 | # 关闭文件 338 | 
outputfile.close() 339 | #################################################################################### 340 | elif db == 'ucsc': 341 | # 信息保存在字典里 342 | info = {} 343 | # 打开测试 gtf 文件 344 | with gzip.open(gtffile,'rt') as gtf: 345 | for line in gtf: 346 | # 跳过注释行 347 | if line.startswith('#'): 348 | continue 349 | # 分割 350 | fields = line.split() 351 | # 类型 352 | type = fields[2] 353 | if type == 'exon': 354 | # 名称 355 | gene_name = fields[17].replace('"','').replace(';','') 356 | gene_id = fields[9].replace('"','').replace(';','') 357 | trans_id = fields[11].replace('"','').replace(';','') 358 | # 连接名称 359 | key = '|'.join([gene_name,gene_id,trans_id]) 360 | # 计算多个外显子长度 361 | start = int(fields[3]) 362 | end = int(fields[4]) 363 | length = end - start + 1 364 | # 累计求和 365 | info.setdefault(key,0) 366 | info[key] += length 367 | ###################################### 368 | # 转为数据框 369 | res = pd.DataFrame(pd.Series(info), columns = ['transcript_length']) 370 | 371 | # 添加基因名列 372 | res['gene_name'] = [line.split(sep='|')[0] for line in list(res.index[:])] 373 | 374 | # 排序 375 | res_sorted = res.sort_values(by = ['gene_name','transcript_length'],ascending=False) 376 | 377 | # 筛选最长转录本id 378 | longest_id = res_sorted.drop_duplicates(subset=['gene_name'],keep='first').index.values.tolist() 379 | 380 | # 筛选最长转录本表格 381 | longest_data = res_sorted.loc[res_sorted.index.isin(longest_id)] 382 | 383 | longest_data['ID'] = longest_data.index 384 | # order columns 385 | longest_data = longest_data[['ID','gene_name','transcript_length']] 386 | 387 | # 保存 388 | longest_data.to_csv(r'longest_transcripts_info.csv', index=True) 389 | 390 | ########################################################### 391 | # 储存最长转录本id 392 | transid = {line.split(sep='|')[2]:line for line in list(longest_data.ID)} 393 | 394 | infolist = [] 395 | with gzip.open(gtffile) as gtf: 396 | for line in gtf: 397 | # skip 398 | if line.startswith('#'): 399 | continue 400 | # split 401 | fields = line.split() 
402 | # feature type 403 | type = fields[2] 404 | if type == 'exon': 405 | # pos 406 | chr = fields[0] 407 | start = fields[3] 408 | end = fields[4] 409 | strand = fields[6] 410 | # name 411 | gene_name = fields[17].replace('"','').replace(';','') 412 | gene_id = fields[9].replace('"','').replace(';','') 413 | trans_id = fields[11].replace('"','').replace(';','') 414 | if trans_id in transid: 415 | infolist.append([chr,start,end,strand,type,gene_name,gene_id,trans_id,transid[trans_id]]) 416 | else: 417 | pass 418 | else: 419 | pass 420 | 421 | # to dataframe 422 | dfinfo = pd.DataFrame(infolist,columns=['chr','start','end','strand','type','gene_name','gene_id','trans_id','id']) 423 | dfinfo_1_strand = dfinfo[dfinfo['strand'] == '+'] 424 | 425 | # descrese coord by - strand gene 426 | dfinfo_2_strand = dfinfo[dfinfo['strand'] == '-'] 427 | dfinfo_2_strand = dfinfo_2_strand.sort_values(by = ['trans_id','start','end'],ascending = False) 428 | 429 | # merge 430 | df_fianl = pd.concat([dfinfo_1_strand,dfinfo_2_strand],axis=0) 431 | 432 | ########################################################### 433 | # extact sequnece from genome 434 | 435 | # load genome 436 | genome = Fasta(genomefile) 437 | 438 | # chrmosome info 439 | chrmosome_list = genome.keys() 440 | 441 | # save in dict 442 | res = {} 443 | for line in range(0,df_fianl.shape[0]): 444 | 445 | # chromosome strand 446 | fileds = df_fianl.iloc[line] 447 | chrom = fileds['chr'] 448 | strand = fileds['strand'] 449 | start = int(fileds['start']) 450 | end = int(fileds['end']) 451 | # key 452 | key = fileds['id'] 453 | # filter chromoseome 454 | if chrom in chrmosome_list: 455 | # extarct sequence 456 | if strand == '+': 457 | seq = genome[chrom][(start-1):end].seq 458 | elif strand == '-': 459 | seq = genome[chrom][(start-1):end].complement.reverse.seq 460 | else: 461 | pass 462 | # save in dict 463 | res.setdefault(key,'') 464 | res[key] += seq 465 | else: 466 | pass 467 | 468 | 
########################################################### 469 | 470 | # 输出序列 471 | outputfile = open(outfile,'w') 472 | 473 | # fasta序列分割长度 474 | my_length = 60 475 | 476 | # 输出 477 | for key,val in res.items(): 478 | outputfile.write('>' + key + '\n') 479 | while len(val) > my_length: 480 | outputfile.write(val[0:my_length] + '\n') 481 | val = val[my_length:len(val)] 482 | outputfile.write(val + '\n') 483 | 484 | # 关闭文件 485 | outputfile.close() 486 | 487 | #################################################################################### 488 | job_stop = time.time() 489 | print("Your job is done! ") 490 | print("Running with " + str(round(job_stop - job_start,2)) + " seconds!") 491 | 492 | if __name__=="__main__": 493 | main() -------------------------------------------------------------------------------- /src/GetTransTool/GetLongestTransFromGencode.py: -------------------------------------------------------------------------------- 1 | def main(): 2 | """ 3 | Extract longest transcript from gencode database transcripts fasta file. 4 | Only for human and mouse transcripts fasta file. 5 | """ 6 | 7 | # 导入模块 8 | import pandas as pd 9 | import gzip 10 | import time 11 | import argparse 12 | 13 | parser = argparse.ArgumentParser(usage="GetLongestTransFromGencode --file gencode.vM28.transcripts.fa.gz --outfile longest_trans.fa", 14 | description="Get longest transcripts from gencode transcripts fasta file.", 15 | epilog="Thank your for your support, if you have any questions or suggestions please contact me: 3219030654@stu.cpu.edu.cn.") 16 | parser.add_argument('-v','--version', action='version', version='%(prog)s 0.0.3') 17 | 18 | # 读取转录本文件 19 | parser.add_argument('-f','--file', type=str,action="store",dest="transfile",metavar="transfile",help='input your transcripts file with ".gz" format. 
(gencode.vM28.transcripts.fa.gz)') 20 | # 导出文件名称 21 | parser.add_argument('-o','--outfile', type=str,action="store",dest="longestfile",metavar="longestfile",help='output your longest transcript file. (longest_trans.fa)') 22 | # 解析参数 23 | args = parser.parse_args() 24 | 25 | # 获取参数 26 | inputfile = args.transfile 27 | outfile = args.longestfile 28 | 29 | # main fuction 30 | print("Your job is running, please wait...") 31 | ###################################################################### 32 | job_start = time.time() 33 | ####################### 34 | # 保存ID列表 35 | info = [] 36 | with gzip.open(inputfile,'rt') as tx: 37 | for line in tx: 38 | if line.startswith('>'): 39 | info_name = line.replace('\n','') 40 | info.append(info_name) 41 | 42 | # 更改名称 43 | comb = [] 44 | for line in info: 45 | # 分割取出需要信息 46 | gene_name = line.split('|')[5] 47 | gene_id = line.split('|')[1] 48 | transcript_id = line.split('|')[0].replace('>','') 49 | length = line.split('|')[6] 50 | 51 | # 连接完整名称 52 | # finame = gene_name + '_' + gene_id + '_' + transcript_id + '_' + length 53 | finame = '|'.join([gene_name,gene_id,transcript_id,length]) 54 | # 保存到列表 55 | comb.append([finame,gene_name,int(length)]) 56 | 57 | # 转为数据框 58 | data_info = pd.DataFrame(comb,columns=['fullname','gene_name','translength']) 59 | 60 | ###################################################################### 61 | # 新建输出改名结果文件 62 | output_fa = open('name_changed.fa','w') 63 | 64 | # loop change ID 65 | with gzip.open(inputfile,'rt') as tx: 66 | for line in tx: 67 | if line.startswith('>'): 68 | info_name = line.replace('\n','') 69 | # 分割取出需要信息 70 | gene_name = info_name.split('|')[5] 71 | gene_id = info_name.split('|')[1] 72 | transcript_id = info_name.split('|')[0].replace('>','') 73 | length = info_name.split('|')[6] 74 | # 连接完整名称 75 | finame = '>' + gene_name + '|' + gene_id + '|' + transcript_id + '|' + length + '\n' 76 | # 命名 77 | output_fa.write(finame) 78 | else: 79 | seq = line 80 | output_fa.write(seq) 81 
| 82 | # 关闭文件 83 | output_fa.close() 84 | 85 | ###################################################################### 86 | # 按转录本序列降序排序 87 | data_infonew = data_info.sort_values(by = ['gene_name','translength'],ascending = False,inplace=False) 88 | 89 | # 筛选最长转录本id 90 | longest_id = list('>' + data_infonew.drop_duplicates(subset=['gene_name'],keep='first')['fullname']) 91 | 92 | ####################### save csv 93 | # remove '>' 94 | longest_idClean = [id.replace('>','') for id in longest_id] 95 | 96 | # 筛选最长转录本表格 97 | longest_data = data_infonew.loc[data_infonew.fullname.isin(longest_idClean)] 98 | 99 | # 保存 100 | longest_data.to_csv(r'longest_transcripts_info.csv', index=False) 101 | 102 | ###################################################################### 103 | # 保存筛选的id为字典 104 | filter_id = {id:0 for id in longest_id} 105 | 106 | # 读取 fasta 文件保存为字典 107 | with open('name_changed.fa') as fa: 108 | fa_dict = {} 109 | for line in fa: 110 | if line.startswith('>'): 111 | seq_name = line.strip() 112 | fa_dict[seq_name] = '' 113 | else: 114 | # 序列 115 | fa_dict[seq_name] += line.replace('\n','') 116 | 117 | # 新建输出结果文件 118 | output_fa = open(outfile,'w') 119 | 120 | # fasta序列分割长度 121 | my_length = 60 122 | 123 | # 输出 124 | for key,val in fa_dict.items(): 125 | if key in filter_id: 126 | output_fa.write(key + '\n') 127 | while len(val) > my_length: 128 | output_fa.write(val[0:my_length] + '\n') 129 | val = val[my_length:len(val)] 130 | output_fa.write(val + '\n') 131 | 132 | # 关闭文件 133 | output_fa.close() 134 | 135 | #################################################################################### 136 | job_stop = time.time() 137 | print("Your job is done! ") 138 | print("Running with " + str(round(job_stop - job_start,2)) + " seconds!") 139 | 140 | if __name__=="__main__": 141 | main() --------------------------------------------------------------------------------