├── .gitignore ├── LICENSE ├── README.md ├── dist ├── GetTss-0.0.2-py3-none-any.whl └── GetTss-0.0.2.tar.gz ├── pyproject.toml ├── setup.cfg └── src └── GetTss └── GetTss.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2018 The Python Packaging Authority 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GetTss python Package 2 | 3 | ### Extract gene TSS sites from ucsc/ensembl/gencode database GTF files and export them as a bed format file. 4 | 5 | ## Install 6 | 7 | ```shell 8 | $ pip install GetTss 9 | ``` 10 | 11 | ## Usage 12 | 13 | help information: 14 | 15 | ```shell 16 | $ GetTss -h 17 | usage: GetTss --database ucsc --gtffile hg19.ncbiRefSeq.gtf --tssfile testTSS.bed 18 | 19 | Get gene TSS site and export bed format from GTF annotation file. 20 | 21 | optional arguments: 22 | -h, --help show this help message and exit 23 | -v, --version show program's version number and exit 24 | -d {ucsc,ensembl,gencode}, --database {ucsc,ensembl,gencode} 25 | which annotation database you choose. (default="ensembl") 26 | -g GTFFILE, --gtffile GTFFILE 27 | input your GTF file. (ucsc/ensembl/gencode) 28 | -t TSSFILE, --tssfile TSSFILE 29 | output your TSS file. (test-TSS.bed) 30 | 31 | Thank your for your support, if you have any questions or suggestions please contact me: 3219030654@stu.cpu.edu.cn. 32 | ``` 33 | 34 | for ucsc gtf file: 35 | 36 | ```shell 37 | $ GetTss -d ucsc -g hg19.ncbiRefSeq.gtf -t ucsc-TSS.bed 38 | Your job is starting, please wait! 39 | You GTF file have: 104178 transcripts. 40 | 41 | Your task has done! 42 | 43 | $ head -n 3 ucsc-TSS.bed 44 | chrMT 16023 16024 TRNP . - 45 | chrMT 15887 15888 TRNT . + 46 | chrMT 14746 14747 CYTB . + 47 | ``` 48 | 49 | for gencode/ensembl gtf file: 50 | 51 | ```shell 52 | $ GetTss -d gencode -g gencode.v19.annotation.gtf -t test-TSS.bed 53 | Your job is starting, please wait! 
54 | You GTF file have: 57820 genes. 55 | 56 | Your task has done! 57 | 58 | $ head -n 3 test-TSS.bed 59 | chr1 11868 11869 ENSG00000223972.4 . + 60 | chr1 29806 29807 ENSG00000227232.4 . - 61 | chr1 29553 29554 ENSG00000243485.2 . + 62 | ``` 63 | 64 | ## plot peaks density around TSS 65 | 66 | compute matrix: 67 | 68 | ```shell 69 | $ computeMatrix reference-point -S normal.bw treat.bw \ 70 | -R myTSS.bed \ 71 | --referencePoint center \ 72 | -a 3000 -b 3000 -p 25 \ 73 | -out matrix.tab.gz 74 | ``` 75 | 76 | plot Profile: 77 | 78 | ```shell 79 | $ plotProfile -m matrix.tab.gz \ 80 | -out profile.pdf \ 81 | --perGroup \ 82 | --plotTitle 'test profile' 83 | ``` -------------------------------------------------------------------------------- /dist/GetTss-0.0.2-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junjunlab/GetTss/3476772dd1fa83208ccacf1c1ec9278d212fbc29/dist/GetTss-0.0.2-py3-none-any.whl -------------------------------------------------------------------------------- /dist/GetTss-0.0.2.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junjunlab/GetTss/3476772dd1fa83208ccacf1c1ec9278d212fbc29/dist/GetTss-0.0.2.tar.gz -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools>=42", 4 | "wheel" 5 | ] 6 | build-backend = "setuptools.build_meta" -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = GetTss 3 | version = 0.0.2 4 | author = laojunjun 5 | author_email = 3219030654@stu.cpu.edu.cn 6 | description = Get gene TSS site and export bed format from GTF annotation file 7 | long_description 
def _extract_gene_id(fields):
    """Return the ``gene_id`` value from a whitespace-split GTF record.

    The attribute tokens (everything after the eighth column) are scanned for
    the ``gene_id`` key, so the result does not depend on ``gene_id`` being the
    first attribute. Double quotes and semicolons are removed from the value.

    Args:
        fields: Tokens of one GTF line as produced by ``str.split()``.

    Returns:
        The bare gene identifier.

    Raises:
        IndexError: If no ``gene_id`` key is present and the record has fewer
            than ten tokens.
    """
    attributes = fields[8:]
    # Slide over adjacent token pairs: a key token is followed by its value.
    for key, value in zip(attributes, attributes[1:]):
        if key == 'gene_id':
            return value.replace('"', '').replace(';', '')
    # No explicit key found: keep the historical positional lookup.
    return fields[9].replace('"', '').replace(';', '')


def _get_tss_bed(database, gtf_path, tss_path):
    """Write one BED line per TSS found in a GTF file and report the count.

    For ``database == 'ucsc'`` every ``transcript`` record is used (the code
    treats such files as having no ``gene`` records); for any other value
    every ``gene`` record is used. GTF coordinates are 1-based while BED is
    0-based, so the TSS base ``p`` is written as the interval ``[p - 1, p)``.

    Args:
        database: Annotation source name (``'ucsc'``, ``'ensembl'`` or
            ``'gencode'``).
        gtf_path: Path of the GTF file to read.
        tss_path: Path of the BED file to create or overwrite.

    Returns:
        The number of records written.

    Raises:
        OSError: If either file cannot be opened.
        IndexError: If a non-blank, non-comment line has too few columns.
        ValueError: If the coordinate used for the TSS is not an integer.
    """
    print('Your job is starting, please wait!')

    is_ucsc = database == 'ucsc'
    feature = 'transcript' if is_ucsc else 'gene'
    target_number = 0

    # Both handles are managed together so neither leaks on an error, and the
    # GTF is opened first so a bad input path does not leave an empty output.
    with open(gtf_path, 'r') as gtf, open(tss_path, 'w') as bed:
        for line in gtf:
            # Skip header/comment lines.
            if line.startswith('#'):
                continue
            fields = line.split()
            # Skip blank lines instead of failing on the column lookup.
            if not fields:
                continue
            if fields[2] != feature:
                continue

            target_number += 1
            chrom = fields[0]
            strand = fields[6]
            gene_id = _extract_gene_id(fields)
            # '+' strand: TSS is the first base of the feature; any other
            # strand value is handled as '-', where it is the last base.
            tss = int(fields[3]) if strand == '+' else int(fields[4])
            record = (chrom, str(tss - 1), str(tss), gene_id, '.', strand)
            bed.write('\t'.join(record) + '\n')

    noun = 'transcripts' if is_ucsc else 'genes'
    print("You GTF file have: " + str(target_number) + " " + noun + "." + "\n")
    print("Your task has done!\n")
    return target_number


def main(argv=None):
    """Command-line entry point: extract TSS sites from a GTF file as BED.

    Extracts the TSS position of the genes in a GTF file and writes them in
    BED format. Supports GTF annotation files from the ensembl, gencode and
    ucsc databases.

    Note:
        GTF/GFF coordinates are 1-based; BED coordinates are 0-based.

    Args:
        argv: Optional list of command-line tokens. When ``None`` (the
            default) the arguments are taken from ``sys.argv[1:]``.

    Returns:
        None, so the console-script wrapper exits with status 0.

    Raises:
        SystemExit: Raised by argparse on ``-h``/``-v`` or invalid arguments,
            including a missing ``--gtffile`` or ``--tssfile``.
    """
    # Imported locally so the module has no import-time dependencies.
    import argparse

    parser = argparse.ArgumentParser(
        usage="GetTss --database ucsc --gtffile hg19.ncbiRefSeq.gtf --tssfile testTSS.bed",
        description="Get gene TSS site and export bed format from GTF annotation file.",
        epilog="Thank your for your support, if you have any questions or suggestions please contact me: 3219030654@stu.cpu.edu.cn.")
    parser.add_argument('-v', '--version', action='version', version='%(prog)s 0.0.2')
    # Annotation database the GTF file comes from.
    parser.add_argument('-d', '--database', type=str, action="store", dest="Database",
                        choices=['ucsc', 'ensembl', 'gencode'], default="ensembl",
                        help='which annotation database you choose. (default="ensembl")')
    # Input GTF file; required so a missing path is reported as a usage error.
    parser.add_argument('-g', '--gtffile', type=str, action="store", dest="GTFfile",
                        required=True,
                        help='input your GTF file. (ucsc/ensembl/gencode)')
    # Output BED file; required for the same reason.
    parser.add_argument('-t', '--tssfile', type=str, action="store", dest="Tssfile",
                        required=True,
                        help='output your TSS file. (test-TSS.bed)')

    args = parser.parse_args(argv)

    _get_tss_bed(args.Database, args.GTFfile, args.Tssfile)