├── .github ├── CODEOWNERS ├── pull_request_template.md └── workflows │ ├── python-black.yml │ └── python-pytest.yml ├── .gitignore ├── LICENSE ├── README.md ├── borzoi_logo.png ├── data ├── sequences_human.bed.gz ├── sequences_mouse.bed.gz ├── targets_human.txt.gz └── targets_mouse.txt.gz ├── download_models.sh ├── env_vars.sh ├── examples ├── .gitignore ├── CD99_example.gtf ├── CFHR2_example.gtf ├── GCFC2_example.gtf ├── borzoi_example_eqtl_chr10_116952944_T_C.ipynb ├── borzoi_example_eqtl_chr10_116952944_T_C_fancy.ipynb ├── borzoi_example_ipaqtl_chr10_116664061_G_A.ipynb ├── borzoi_example_paqtl_chr1_236763042_A_G.ipynb ├── borzoi_example_paqtl_chr1_236763042_A_G_fancy.ipynb ├── borzoi_example_sqtl_chr9_135548708_G_C.ipynb ├── borzoi_helpers.py ├── params.json ├── params_pred.json ├── targets_gtex.txt ├── targets_gtex_liver.txt ├── targets_human.txt ├── targets_mouse.txt └── targets_rna.txt ├── pyproject.toml ├── src ├── __init__.py ├── borzoi │ ├── __init__.py │ ├── helpers │ │ └── h5_grad_utils.py │ └── scripts │ │ └── borzoi_satg_gene_gpu.py ├── notebooks │ └── borzoi_snp.ipynb ├── scripts │ ├── _archive │ │ ├── borzoi_bench_crispr.py │ │ ├── borzoi_bench_crispr_folds.py │ │ ├── borzoi_bench_flowfish_folds.py │ │ ├── borzoi_bench_gasperini_folds.py │ │ ├── borzoi_satg_gene.py │ │ └── borzoi_satg_gene_multi.py │ ├── borzoi_bench_classify.py │ ├── borzoi_bench_gtex_folds_sad.py │ ├── borzoi_bench_gtex_folds_sed.py │ ├── borzoi_bench_ipaqtl_folds.py │ ├── borzoi_bench_paqtl_folds.py │ ├── borzoi_bench_sqtl_folds.py │ ├── borzoi_bench_trip_folds.py │ ├── borzoi_gtex_coef_sad.py │ ├── borzoi_gtex_coef_sed.py │ ├── borzoi_sad.py │ ├── borzoi_sad_folds.py │ ├── borzoi_satg_gene.py │ ├── borzoi_satg_gene_crispr_ism_shuffle.py │ ├── borzoi_satg_gene_focused_ism.py │ ├── borzoi_satg_polya.py │ ├── borzoi_satg_splice.py │ ├── borzoi_sed.py │ ├── borzoi_sed_folds.py │ ├── borzoi_sed_ipaqtl_cov.py │ ├── borzoi_sed_paqtl_cov.py │ ├── borzoi_test_apa.py │ ├── borzoi_test_apa_folds.py │ ├── borzoi_test_exons.py │ ├── borzoi_test_exons_folds.py │ ├── borzoi_test_genes.py │ ├── borzoi_test_genes_folds.py │ ├── borzoi_test_tss.py │ ├── borzoi_test_tss_folds.py │ ├── borzoi_tfmodisco.py │ ├── borzoi_tfmodisco_diff.py │ ├── borzoi_trip.py │ ├── bw_h5.py │ ├── idx_genome.py │ ├── pygene.py │ ├── slurm.py │ ├── util.py │ ├── w5_merge.py │ └── w5_qc.py └── tests │ ├── __init__.py │ └── test_dummy.py └── tutorials ├── latest ├── analyze_sv │ ├── README.md │ ├── analyze_indel.sh │ ├── analyze_vcf.py │ ├── data │ │ ├── STR.csv │ │ └── chr6_41897087_SV.vcf │ ├── download_dependencies_STR.sh │ ├── download_dependencies_SV.sh │ ├── save_STR_vcf.py │ ├── score_STRs.sh │ ├── score_tandem_repeats.py │ └── utils.py ├── interpret_sequence │ ├── HBE1_example.gtf │ ├── README.md │ ├── explore_grads_k562_HBE1.ipynb │ ├── run_gradients_expr_HBE1.sh │ └── vis_helpers.py ├── make_data │ ├── Makefile │ ├── README.md │ ├── download_bw.sh │ ├── download_dependencies.sh │ ├── process_w5.sh │ └── targets_human.txt ├── score_variants │ ├── README.md │ ├── run_variant_scripts.ipynb │ ├── score_expr_sad.sh │ ├── score_expr_sed.sh │ ├── score_polya.sh │ ├── score_splice.sh │ ├── snps_expr.vcf │ ├── snps_polya.vcf │ └── snps_splice.vcf └── train_model │ ├── README.md │ ├── params_micro.json │ ├── params_mini.json │ ├── train_micro.sh │ └── train_mini.sh └── legacy ├── interpret_sequence ├── README.md ├── explore_grads_liver_CFHR2.ipynb ├── explore_polya_grads_CD99.ipynb ├── explore_splice_grads_GCFC2.ipynb ├── 
run_gradients_expr_CFHR2.sh ├── run_gradients_polya_CD99.sh ├── run_gradients_splice_GCFC2.sh └── vis_helpers.py ├── make_data ├── Makefile ├── README.md ├── download_bw.sh ├── download_dependencies.sh ├── process_w5.sh └── targets_human.txt ├── score_variants ├── README.md ├── run_variant_scripts.ipynb ├── score_expr_sad.sh ├── score_expr_sed.sh ├── score_polya.sh ├── score_splice.sh ├── snps_expr.vcf ├── snps_polya.vcf └── snps_splice.vcf └── train_model ├── README.md ├── params_micro.json ├── params_mini.json ├── train_micro.sh └── train_mini.sh /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # By default, PRs for this repo are automatically reviewed by: 2 | * @calico/sweng-dev @calico/data-eng-dev 3 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | ### Description of your changes 2 | 3 | 4 | 5 | ### Issue ticket number and link 6 | 7 | 8 | 9 | ### Type of change 10 | 11 | - [ ] Bug fix (non-breaking change which fixes an issue) 12 | - [ ] New feature (non-breaking change which adds functionality) 13 | - [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected) 14 | - [ ] Documentation add / update 15 | 16 | ### (If applicable) How has this been tested? 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /.github/workflows/python-black.yml: -------------------------------------------------------------------------------- 1 | name: Validate Black Formatting 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | paths: 7 | - "**.py" 8 | 9 | jobs: 10 | format: 11 | runs-on: ubuntu-20.04 12 | 13 | steps: 14 | - name: Checkout base repo 15 | uses: actions/checkout@v3 16 | 17 | - name: Set up Python 3.11 18 | uses: actions/setup-python@v2 19 | with: 20 | python-version: "3.11" 21 | 22 | - name: Install dependencies 23 | run: python3 -m pip install black~=22.3.0 24 | 25 | - name: Check Black formatting 26 | run: black --check . 
27 | -------------------------------------------------------------------------------- /.github/workflows/python-pytest.yml: -------------------------------------------------------------------------------- 1 | name: Install Requirements and Run Pytest 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | paths: 7 | - "**.py" 8 | 9 | jobs: 10 | validate: 11 | runs-on: ubuntu-20.04 12 | strategy: 13 | matrix: 14 | python-version: ["3.9", "3.10"] 15 | 16 | steps: 17 | - name: Checkout base repo 18 | uses: actions/checkout@v3 19 | 20 | - name: Set up Python ${{ matrix.python-version }} 21 | uses: actions/setup-python@v4 22 | with: 23 | python-version: ${{ matrix.python-version }} 24 | 25 | - name: Install dependencies 26 | run: | 27 | python3 -m pip install --upgrade pip 28 | python3 -m pip install '.[dev]' 29 | 30 | - name: Run pytest 31 | run: python -m pytest 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # JetBrains IDE files 2 | **/.idea* 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | pip-wheel-metadata/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g.
github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | 3 | Version 2.0, January 2004 4 | 5 | http://www.apache.org/licenses/ 6 | 7 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 8 | 9 | 1. Definitions. 10 | 11 | "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. 16 | 17 | "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. 18 | 19 | "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. 20 | 21 | "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. 22 | 23 | "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). 24 | 25 | "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. 26 | 27 | "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. 
For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." 28 | 29 | "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 30 | 31 | 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 32 | 33 | 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 34 | 35 | 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: 36 | 37 | You must give any other recipients of the Work or Derivative Works a copy of this License; and 38 | You must cause any modified files to carry prominent notices stating that You changed the files; and 39 | You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and 40 | If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. 
You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. 41 | 42 | You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 43 | 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 44 | 45 | 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 46 | 47 | 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 48 | 49 | 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 50 | 51 | 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. 
52 | 53 | END OF TERMS AND CONDITIONS 54 | -------------------------------------------------------------------------------- /borzoi_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calico/borzoi/77670d9b0f940ff0a4b2b0edbdff445bbaefc6ed/borzoi_logo.png -------------------------------------------------------------------------------- /data/sequences_human.bed.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calico/borzoi/77670d9b0f940ff0a4b2b0edbdff445bbaefc6ed/data/sequences_human.bed.gz -------------------------------------------------------------------------------- /data/sequences_mouse.bed.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calico/borzoi/77670d9b0f940ff0a4b2b0edbdff445bbaefc6ed/data/sequences_mouse.bed.gz -------------------------------------------------------------------------------- /data/targets_human.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calico/borzoi/77670d9b0f940ff0a4b2b0edbdff445bbaefc6ed/data/targets_human.txt.gz -------------------------------------------------------------------------------- /data/targets_mouse.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calico/borzoi/77670d9b0f940ff0a4b2b0edbdff445bbaefc6ed/data/targets_mouse.txt.gz -------------------------------------------------------------------------------- /download_models.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # download model weights (data fold 3, 4 replicates) 4 | for rep in f3c0,f0 f3c1,f1 f3c2,f2 f3c3,f3; do IFS=","; set -- $rep; 5 | mkdir -p "examples/saved_models/$1/train" 6 | local_model="examples/saved_models/$1/train/model0_best.h5" 7 | if [ -f "$local_model" ]; then 8 | echo "$1 model already exists." 9 | else 10 | wget --progress=bar:force "https://storage.googleapis.com/seqnn-share/borzoi/$2/model0_best.h5" -O "$local_model" 11 | fi 12 | done 13 | 14 | # download and uncompress annotation files 15 | mkdir -p examples/hg38/genes/gencode41 16 | mkdir -p examples/hg38/genes/polyadb 17 | 18 | if [ -f examples/hg38/genes/gencode41/gencode41_basic_nort.gtf ]; then 19 | echo "Gene annotation already exists." 20 | else 21 | wget -O - https://storage.googleapis.com/seqnn-share/helper/gencode41_basic_nort.gtf.gz | gunzip -c > examples/hg38/genes/gencode41/gencode41_basic_nort.gtf 22 | fi 23 | 24 | if [ -f examples/hg38/genes/gencode41/gencode41_basic_nort_protein.gtf ]; then 25 | echo "Gene annotation (no read-through, protein-coding) already exists." 26 | else 27 | wget -O - https://storage.googleapis.com/seqnn-share/helper/gencode41_basic_nort_protein.gtf.gz | gunzip -c > examples/hg38/genes/gencode41/gencode41_basic_nort_protein.gtf 28 | fi 29 | 30 | if [ -f examples/hg38/genes/gencode41/gencode41_basic_protein.gtf ]; then 31 | echo "Gene annotation (protein-coding) already exists." 32 | else 33 | wget -O - https://storage.googleapis.com/seqnn-share/helper/gencode41_basic_protein.gtf.gz | gunzip -c > examples/hg38/genes/gencode41/gencode41_basic_protein.gtf 34 | fi 35 | 36 | if [ -f examples/hg38/genes/gencode41/gencode41_basic_tss2.bed ]; then 37 | echo "TSS annotation already exists."
38 | else 39 | wget -O - https://storage.googleapis.com/seqnn-share/helper/gencode41_basic_tss2.bed.gz | gunzip -c > examples/hg38/genes/gencode41/gencode41_basic_tss2.bed 40 | fi 41 | 42 | if [ -f examples/hg38/genes/gencode41/gencode41_basic_protein_splice.csv.gz ]; then 43 | echo "Splice site annotation already exists." 44 | else 45 | wget https://storage.googleapis.com/seqnn-share/helper/gencode41_basic_protein_splice.csv.gz -O examples/hg38/genes/gencode41/gencode41_basic_protein_splice.csv.gz 46 | fi 47 | 48 | if [ -f examples/hg38/genes/gencode41/gencode41_basic_protein_splice.gff ]; then 49 | echo "Splice site annotation already exists." 50 | else 51 | wget -O - https://storage.googleapis.com/seqnn-share/helper/gencode41_basic_protein_splice.gff.gz | gunzip -c > examples/hg38/genes/gencode41/gencode41_basic_protein_splice.gff 52 | fi 53 | 54 | if [ -f examples/hg38/genes/polyadb/polyadb_human_v3.csv.gz ]; then 55 | echo "PolyA site annotation already exists." 56 | else 57 | wget https://storage.googleapis.com/seqnn-share/helper/polyadb_human_v3.csv.gz -O examples/hg38/genes/polyadb/polyadb_human_v3.csv.gz 58 | fi 59 | 60 | # download and index hg38 genome 61 | mkdir -p examples/hg38/assembly/ucsc 62 | 63 | if [ -f examples/hg38/assembly/ucsc/hg38.fa ]; then 64 | echo "Human genome FASTA already exists." 65 | else 66 | wget -O - http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz | gunzip -c > examples/hg38/assembly/ucsc/hg38.fa 67 | python src/scripts/idx_genome.py examples/hg38/assembly/ucsc/hg38.fa 68 | fi 69 | -------------------------------------------------------------------------------- /env_vars.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # set these variables before running the script 4 | LOCAL_BORZOI_PATH="/home/jlinder/borzoi" 5 | LOCAL_CONDA_PATH="/home/jlinder/anaconda3/etc/profile.d/conda.sh" 6 | 7 | # create env_vars sh scripts in local conda env 8 | mkdir -p "$CONDA_PREFIX/etc/conda/activate.d" 9 | mkdir -p "$CONDA_PREFIX/etc/conda/deactivate.d" 10 | 11 | file_vars_act="$CONDA_PREFIX/etc/conda/activate.d/env_vars.sh" 12 | if ! [ -e $file_vars_act ]; then 13 | echo '#!/bin/sh' > $file_vars_act 14 | fi 15 | 16 | file_vars_deact="$CONDA_PREFIX/etc/conda/deactivate.d/env_vars.sh" 17 | if !
[ -e $file_vars_deact ]; then 18 | echo '#!/bin/sh' > $file_vars_deact 19 | fi 20 | 21 | # append env variable exports to /activate.d/env_vars.sh 22 | echo "export BORZOI_DIR=$LOCAL_BORZOI_PATH" >> $file_vars_act 23 | echo 'export PATH=$BORZOI_DIR/src/scripts:$PATH' >> $file_vars_act 24 | echo 'export PYTHONPATH=$BORZOI_DIR/src/scripts:$PYTHONPATH' >> $file_vars_act 25 | 26 | echo 'export BORZOI_HG38=$BORZOI_DIR/examples/hg38' >> $file_vars_act 27 | echo 'export BORZOI_MM10=$BORZOI_DIR/examples/mm10' >> $file_vars_act 28 | 29 | echo "export BORZOI_CONDA=$LOCAL_CONDA_PATH" >> $file_vars_act 30 | 31 | # append env variable unsets to /deactivate.d/env_vars.sh 32 | echo 'unset BORZOI_DIR' >> $file_vars_deact 33 | echo 'unset BORZOI_HG38' >> $file_vars_deact 34 | echo 'unset BORZOI_MM10' >> $file_vars_deact 35 | echo 'unset BORZOI_CONDA' >> $file_vars_deact 36 | 37 | # finally activate env variables 38 | source $file_vars_act 39 | -------------------------------------------------------------------------------- /examples/.gitignore: -------------------------------------------------------------------------------- 1 | gencode41_basic* 2 | hg38.fa* 3 | polyadb_human_v3.csv.gz 4 | saved_models/ 5 | .virtual_documents/ 6 | *.eps 7 | *.png 8 | -------------------------------------------------------------------------------- /examples/CFHR2_example.gtf: -------------------------------------------------------------------------------- 1 | chr1 HAVANA transcript 196943738 196959622 . + . gene_id "ENSG00000080910.14"; transcript_id "ENST00000367415.8"; gene_type "protein_coding"; gene_name "CFHR2"; transcript_type "protein_coding"; transcript_name "CFHR2-201"; level 2; protein_id "ENSP00000356385.4"; transcript_support_level "1"; hgnc_id "HGNC:4890"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; tag "appris_principal_2"; tag "CCDS"; ccdsid "CCDS30959.1"; havana_gene "OTTHUMG00000036518.4"; havana_transcript "OTTHUMT00000088815.4"; 2 | chr1 HAVANA exon 196943738 196943938 . + . gene_id "ENSG00000080910.14"; transcript_id "ENST00000367415.8"; gene_type "protein_coding"; gene_name "CFHR2"; transcript_type "protein_coding"; transcript_name "CFHR2-201"; exon_number 1; exon_id "ENSE00001920108.3"; level 2; protein_id "ENSP00000356385.4"; transcript_support_level "1"; hgnc_id "HGNC:4890"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; tag "appris_principal_2"; tag "CCDS"; ccdsid "CCDS30959.1"; havana_gene "OTTHUMG00000036518.4"; havana_transcript "OTTHUMT00000088815.4"; 3 | chr1 HAVANA CDS 196943881 196943938 . + 0 gene_id "ENSG00000080910.14"; transcript_id "ENST00000367415.8"; gene_type "protein_coding"; gene_name "CFHR2"; transcript_type "protein_coding"; transcript_name "CFHR2-201"; exon_number 1; exon_id "ENSE00001920108.3"; level 2; protein_id "ENSP00000356385.4"; transcript_support_level "1"; hgnc_id "HGNC:4890"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; tag "appris_principal_2"; tag "CCDS"; ccdsid "CCDS30959.1"; havana_gene "OTTHUMG00000036518.4"; havana_transcript "OTTHUMT00000088815.4"; 4 | chr1 HAVANA start_codon 196943881 196943883 . 
+ 0 gene_id "ENSG00000080910.14"; transcript_id "ENST00000367415.8"; gene_type "protein_coding"; gene_name "CFHR2"; transcript_type "protein_coding"; transcript_name "CFHR2-201"; exon_number 1; exon_id "ENSE00001920108.3"; level 2; protein_id "ENSP00000356385.4"; transcript_support_level "1"; hgnc_id "HGNC:4890"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; tag "appris_principal_2"; tag "CCDS"; ccdsid "CCDS30959.1"; havana_gene "OTTHUMG00000036518.4"; havana_transcript "OTTHUMT00000088815.4"; 5 | chr1 HAVANA exon 196949455 196949649 . + . gene_id "ENSG00000080910.14"; transcript_id "ENST00000367415.8"; gene_type "protein_coding"; gene_name "CFHR2"; transcript_type "protein_coding"; transcript_name "CFHR2-201"; exon_number 2; exon_id "ENSE00003745979.1"; level 2; protein_id "ENSP00000356385.4"; transcript_support_level "1"; hgnc_id "HGNC:4890"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; tag "appris_principal_2"; tag "CCDS"; ccdsid "CCDS30959.1"; havana_gene "OTTHUMG00000036518.4"; havana_transcript "OTTHUMT00000088815.4"; 6 | chr1 HAVANA CDS 196949455 196949649 . + 2 gene_id "ENSG00000080910.14"; transcript_id "ENST00000367415.8"; gene_type "protein_coding"; gene_name "CFHR2"; transcript_type "protein_coding"; transcript_name "CFHR2-201"; exon_number 2; exon_id "ENSE00003745979.1"; level 2; protein_id "ENSP00000356385.4"; transcript_support_level "1"; hgnc_id "HGNC:4890"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; tag "appris_principal_2"; tag "CCDS"; ccdsid "CCDS30959.1"; havana_gene "OTTHUMG00000036518.4"; havana_transcript "OTTHUMT00000088815.4"; 7 | chr1 HAVANA exon 196950852 196951028 . + . gene_id "ENSG00000080910.14"; transcript_id "ENST00000367415.8"; gene_type "protein_coding"; gene_name "CFHR2"; transcript_type "protein_coding"; transcript_name "CFHR2-201"; exon_number 3; exon_id "ENSE00003831930.1"; level 2; protein_id "ENSP00000356385.4"; transcript_support_level "1"; hgnc_id "HGNC:4890"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; tag "appris_principal_2"; tag "CCDS"; ccdsid "CCDS30959.1"; havana_gene "OTTHUMG00000036518.4"; havana_transcript "OTTHUMT00000088815.4"; 8 | chr1 HAVANA CDS 196950852 196951028 . + 2 gene_id "ENSG00000080910.14"; transcript_id "ENST00000367415.8"; gene_type "protein_coding"; gene_name "CFHR2"; transcript_type "protein_coding"; transcript_name "CFHR2-201"; exon_number 3; exon_id "ENSE00003831930.1"; level 2; protein_id "ENSP00000356385.4"; transcript_support_level "1"; hgnc_id "HGNC:4890"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; tag "appris_principal_2"; tag "CCDS"; ccdsid "CCDS30959.1"; havana_gene "OTTHUMG00000036518.4"; havana_transcript "OTTHUMT00000088815.4"; 9 | chr1 HAVANA exon 196957891 196958073 . + . gene_id "ENSG00000080910.14"; transcript_id "ENST00000367415.8"; gene_type "protein_coding"; gene_name "CFHR2"; transcript_type "protein_coding"; transcript_name "CFHR2-201"; exon_number 4; exon_id "ENSE00003836915.1"; level 2; protein_id "ENSP00000356385.4"; transcript_support_level "1"; hgnc_id "HGNC:4890"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; tag "appris_principal_2"; tag "CCDS"; ccdsid "CCDS30959.1"; havana_gene "OTTHUMG00000036518.4"; havana_transcript "OTTHUMT00000088815.4"; 10 | chr1 HAVANA CDS 196957891 196958073 . 
+ 2 gene_id "ENSG00000080910.14"; transcript_id "ENST00000367415.8"; gene_type "protein_coding"; gene_name "CFHR2"; transcript_type "protein_coding"; transcript_name "CFHR2-201"; exon_number 4; exon_id "ENSE00003836915.1"; level 2; protein_id "ENSP00000356385.4"; transcript_support_level "1"; hgnc_id "HGNC:4890"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; tag "appris_principal_2"; tag "CCDS"; ccdsid "CCDS30959.1"; havana_gene "OTTHUMG00000036518.4"; havana_transcript "OTTHUMT00000088815.4"; 11 | chr1 HAVANA exon 196958881 196959622 . + . gene_id "ENSG00000080910.14"; transcript_id "ENST00000367415.8"; gene_type "protein_coding"; gene_name "CFHR2"; transcript_type "protein_coding"; transcript_name "CFHR2-201"; exon_number 5; exon_id "ENSE00003843688.1"; level 2; protein_id "ENSP00000356385.4"; transcript_support_level "1"; hgnc_id "HGNC:4890"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; tag "appris_principal_2"; tag "CCDS"; ccdsid "CCDS30959.1"; havana_gene "OTTHUMG00000036518.4"; havana_transcript "OTTHUMT00000088815.4"; 12 | chr1 HAVANA CDS 196958881 196959077 . + 2 gene_id "ENSG00000080910.14"; transcript_id "ENST00000367415.8"; gene_type "protein_coding"; gene_name "CFHR2"; transcript_type "protein_coding"; transcript_name "CFHR2-201"; exon_number 5; exon_id "ENSE00003843688.1"; level 2; protein_id "ENSP00000356385.4"; transcript_support_level "1"; hgnc_id "HGNC:4890"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; tag "appris_principal_2"; tag "CCDS"; ccdsid "CCDS30959.1"; havana_gene "OTTHUMG00000036518.4"; havana_transcript "OTTHUMT00000088815.4"; 13 | chr1 HAVANA stop_codon 196959078 196959080 . + 0 gene_id "ENSG00000080910.14"; transcript_id "ENST00000367415.8"; gene_type "protein_coding"; gene_name "CFHR2"; transcript_type "protein_coding"; transcript_name "CFHR2-201"; exon_number 5; exon_id "ENSE00003843688.1"; level 2; protein_id "ENSP00000356385.4"; transcript_support_level "1"; hgnc_id "HGNC:4890"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; tag "appris_principal_2"; tag "CCDS"; ccdsid "CCDS30959.1"; havana_gene "OTTHUMG00000036518.4"; havana_transcript "OTTHUMT00000088815.4"; 14 | chr1 HAVANA UTR 196943738 196943880 . + . gene_id "ENSG00000080910.14"; transcript_id "ENST00000367415.8"; gene_type "protein_coding"; gene_name "CFHR2"; transcript_type "protein_coding"; transcript_name "CFHR2-201"; exon_number 1; exon_id "ENSE00001920108.3"; level 2; protein_id "ENSP00000356385.4"; transcript_support_level "1"; hgnc_id "HGNC:4890"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; tag "appris_principal_2"; tag "CCDS"; ccdsid "CCDS30959.1"; havana_gene "OTTHUMG00000036518.4"; havana_transcript "OTTHUMT00000088815.4"; 15 | chr1 HAVANA UTR 196959078 196959622 . + . 
gene_id "ENSG00000080910.14"; transcript_id "ENST00000367415.8"; gene_type "protein_coding"; gene_name "CFHR2"; transcript_type "protein_coding"; transcript_name "CFHR2-201"; exon_number 5; exon_id "ENSE00003843688.1"; level 2; protein_id "ENSP00000356385.4"; transcript_support_level "1"; hgnc_id "HGNC:4890"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; tag "appris_principal_2"; tag "CCDS"; ccdsid "CCDS30959.1"; havana_gene "OTTHUMG00000036518.4"; havana_transcript "OTTHUMT00000088815.4"; 16 | -------------------------------------------------------------------------------- /examples/params.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "batch_size": 2, 4 | "shuffle_buffer": 256, 5 | "optimizer": "adam", 6 | "learning_rate": 0.00006, 7 | "loss": "poisson_mn", 8 | "total_weight": 0.2, 9 | "warmup_steps": 20000, 10 | "global_clipnorm": 0.15, 11 | "adam_beta1": 0.9, 12 | "adam_beta2": 0.999, 13 | "patience": 30, 14 | "train_epochs_min": 130, 15 | "train_epochs_max": 180 16 | }, 17 | "model": { 18 | "seq_length": 524288, 19 | "augment_rc": true, 20 | "augment_shift": 3, 21 | "activation": "gelu", 22 | "norm_type": "batch-sync", 23 | "bn_momentum": 0.9, 24 | "kernel_initializer": "lecun_normal", 25 | "l2_scale": 2.0e-8, 26 | "trunk": [ 27 | { 28 | "name": "conv_dna", 29 | "filters": 512, 30 | "kernel_size": 15, 31 | "norm_type": null, 32 | "activation": "linear", 33 | "pool_size": 2 34 | }, 35 | { 36 | "name": "res_tower", 37 | "filters_init": 608, 38 | "filters_end": 1536, 39 | "divisible_by": 32, 40 | "kernel_size": 5, 41 | "num_convs": 1, 42 | "pool_size": 2, 43 | "repeat": 6 44 | }, 45 | { 46 | "name": "transformer_tower", 47 | "key_size": 64, 48 | "heads": 8, 49 | "num_position_features": 32, 50 | "dropout": 0.2, 51 | "mha_l2_scale": 1.0e-8, 52 | "l2_scale": 1.0e-8, 53 | "kernel_initializer": "he_normal", 54 | "repeat": 8 55 | }, 56 | { 57 | "name": "unet_conv", 58 | "kernel_size": 3, 59 | "upsample_conv": true 60 | }, 61 | { 62 | "name": "unet_conv", 63 | "kernel_size": 3, 64 | "upsample_conv": true 65 | }, 66 | { 67 | "name": "Cropping1D", 68 | "cropping": 5120 69 | }, 70 | { 71 | "name": "conv_nac", 72 | "filters": 1920, 73 | "dropout": 0.1 74 | } 75 | ], 76 | "head_human": { 77 | "name": "final", 78 | "units": 7611, 79 | "activation": "softplus" 80 | }, 81 | "head_mouse": { 82 | "name": "final", 83 | "units": 2608, 84 | "activation": "softplus" 85 | } 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /examples/params_pred.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "batch_size": 2, 4 | "shuffle_buffer": 256, 5 | "optimizer": "adam", 6 | "learning_rate": 0.00006, 7 | "loss": "poisson_mn", 8 | "total_weight": 0.2, 9 | "warmup_steps": 20000, 10 | "global_clipnorm": 0.15, 11 | "adam_beta1": 0.9, 12 | "adam_beta2": 0.999, 13 | "patience": 30, 14 | "train_epochs_min": 130, 15 | "train_epochs_max": 180 16 | }, 17 | "model": { 18 | "verbose": false, 19 | "seq_length": 524288, 20 | "augment_rc": true, 21 | "augment_shift": 3, 22 | "activation": "gelu", 23 | "norm_type": "batch-sync", 24 | "bn_momentum": 0.9, 25 | "kernel_initializer": "lecun_normal", 26 | "l2_scale": 2.0e-8, 27 | "trunk": [ 28 | { 29 | "name": "conv_dna", 30 | "filters": 512, 31 | "kernel_size": 15, 32 | "norm_type": null, 33 | "activation": "linear", 34 | "pool_size": 2 35 | }, 36 | { 37 | "name": "res_tower", 38 | "filters_init": 608, 
39 | "filters_end": 1536, 40 | "divisible_by": 32, 41 | "kernel_size": 5, 42 | "num_convs": 1, 43 | "pool_size": 2, 44 | "repeat": 6 45 | }, 46 | { 47 | "name": "transformer_tower", 48 | "key_size": 64, 49 | "heads": 8, 50 | "num_position_features": 32, 51 | "dropout": 0.2, 52 | "mha_l2_scale": 1.0e-8, 53 | "l2_scale": 1.0e-8, 54 | "kernel_initializer": "he_normal", 55 | "repeat": 8 56 | }, 57 | { 58 | "name": "unet_conv", 59 | "kernel_size": 3, 60 | "upsample_conv": true 61 | }, 62 | { 63 | "name": "unet_conv", 64 | "kernel_size": 3, 65 | "upsample_conv": true 66 | }, 67 | { 68 | "name": "Cropping1D", 69 | "cropping": 16 70 | }, 71 | { 72 | "name": "conv_nac", 73 | "filters": 1920, 74 | "dropout": 0.1 75 | } 76 | ], 77 | "head_human": { 78 | "name": "final", 79 | "units": 7611, 80 | "activation": "softplus" 81 | }, 82 | "head_mouse": { 83 | "name": "final", 84 | "units": 2608, 85 | "activation": "softplus" 86 | } 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /examples/targets_gtex_liver.txt: -------------------------------------------------------------------------------- 1 | identifier file clip clip_soft scale sum_stat strand_pair description 2 | 7563 GTEX-11EQ9-0526-SM-5A5JZ.1 /home/drk/tillage/datasets/human/rna/recount3/liver/GTEX-11EQ9-0526-SM-5A5JZ.1/coverage.w5 768 384 0.01 sum_sqrt 7563 RNA:liver 3 | 7564 GTEX-1QP66-0226-SM-DPRXS.1 /home/drk/tillage/datasets/human/rna/recount3/liver/GTEX-1QP66-0226-SM-DPRXS.1/coverage.w5 768 384 0.01 sum_sqrt 7564 RNA:liver 4 | 7565 GTEX-ZYT6-0626-SM-5E45V.1 /home/drk/tillage/datasets/human/rna/recount3/liver/GTEX-ZYT6-0626-SM-5E45V.1/coverage.w5 768 384 0.01 sum_sqrt 7565 RNA:liver 5 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=69.0.3", "setuptools_scm>=8.0.4"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "borzoi" 7 | description = "borzoi" 8 | authors = [ 9 | {name = "David Kelley", email = "drk@calicolabs.com"}, 10 | {name = "Johannes Linder", email = "jlinder@calicolabs.com"} 11 | ] 12 | readme = "README.md" 13 | classifiers = ["License :: OSI Approved :: Apache Software License"] 14 | dynamic = ["version"] 15 | 16 | requires-python = ">=3.9" 17 | dependencies = [ 18 | "h5py~=3.10.0", 19 | "intervaltree~=3.1.0", 20 | "joblib~=1.1.1", 21 | "matplotlib~=3.7.1", 22 | "google-cloud-storage~=2.0.0", 23 | "natsort~=7.1.1", 24 | "networkx~=2.8.4", 25 | "numpy~=1.24.3", 26 | "pandas~=1.5.3", 27 | "pybigwig~=0.3.18", 28 | "pybedtools~=0.10.0", 29 | "pysam~=0.22.0", 30 | "qnorm~=0.8.1", 31 | "seaborn~=0.12.2", 32 | "scikit-learn~=1.2.2", 33 | "scipy~=1.9.1", 34 | "tensorflow~=2.15.0", 35 | "tqdm~=4.65.0", 36 | "pyfaidx~=0.7.1", 37 | "pyranges~=0.0.129", 38 | ] 39 | 40 | [project.optional-dependencies] 41 | dev = [ 42 | "black~=23.12.1", 43 | "pytest~=7.4.4", 44 | "ruff~=0.1.11", 45 | ] 46 | 47 | [project.urls] 48 | Homepage = "https://github.com/calico/borzoi" 49 | 50 | [tool.setuptools_scm] -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calico/borzoi/77670d9b0f940ff0a4b2b0edbdff445bbaefc6ed/src/__init__.py -------------------------------------------------------------------------------- /src/borzoi/__init__.py:
-------------------------------------------------------------------------------- 1 | from importlib.metadata import version, PackageNotFoundError 2 | 3 | __version__ = "0.0.0" 4 | 5 | try: 6 | __version__ = version("calicolabs-$PYTHON_PACKAGE_NAME") 7 | except PackageNotFoundError: 8 | pass 9 | -------------------------------------------------------------------------------- /src/borzoi/helpers/h5_grad_utils.py: -------------------------------------------------------------------------------- 1 | import h5py 2 | import numpy as np 3 | import argparse 4 | import subprocess 5 | import tempfile 6 | import os 7 | from baskerville.helpers.gcs_utils import download_from_gcs, upload_file_gcs 8 | 9 | 10 | def collect_h5_borzoi(out_dir, num_procs, sad_stat) -> None: 11 | h5_file = "scores_f0c0.h5" 12 | 13 | # count sequences 14 | num_seqs = 0 15 | for pi in range(num_procs): 16 | # open job 17 | job_h5_file = "%s/job%d/%s" % (out_dir, pi, h5_file) 18 | job_h5_open = h5py.File(job_h5_file, "r") 19 | num_seqs += job_h5_open[sad_stat].shape[0] 20 | seq_len = job_h5_open[sad_stat].shape[1] 21 | num_targets = job_h5_open[sad_stat].shape[-1] 22 | job_h5_open.close() 23 | 24 | # initialize final h5 25 | final_h5_file = "%s/%s" % (out_dir, h5_file) 26 | final_h5_open = h5py.File(final_h5_file, "w") 27 | 28 | # keep dict for string values 29 | final_strings = {} 30 | 31 | job0_h5_file = "%s/job0/%s" % (out_dir, h5_file) 32 | job0_h5_open = h5py.File(job0_h5_file, "r") 33 | for key in job0_h5_open.keys(): 34 | key_shape = list(job0_h5_open[key].shape) 35 | key_shape[0] = num_seqs 36 | key_shape = tuple(key_shape) 37 | if job0_h5_open[key].dtype.char == "S": 38 | final_strings[key] = [] 39 | else: 40 | final_h5_open.create_dataset( 41 | key, shape=key_shape, dtype=job0_h5_open[key].dtype 42 | ) 43 | 44 | # set values 45 | si = 0 46 | for pi in range(num_procs): 47 | # open job 48 | job_h5_file = "%s/job%d/%s" % (out_dir, pi, h5_file) 49 | job_h5_open = h5py.File(job_h5_file, "r") 50 | 51 | # append to final 52 | for key in job_h5_open.keys(): 53 | job_seqs = job_h5_open[key].shape[0] 54 | if job_h5_open[key].dtype.char == "S": 55 | final_strings[key] += list(job_h5_open[key]) 56 | else: 57 | final_h5_open[key][si : si + job_seqs] = job_h5_open[key] 58 | 59 | job_h5_open.close() 60 | si += job_seqs 61 | 62 | # create final string datasets 63 | for key in final_strings: 64 | final_h5_open.create_dataset(key, data=np.array(final_strings[key], dtype="S")) 65 | 66 | final_h5_open.close() 67 | 68 | 69 | def download_h5_gcs(output_gcs_dir, num_processes) -> str: 70 | temp_dir = tempfile.mkdtemp() # create a temp dir for output 71 | print(f"temp_dir is {temp_dir}") 72 | out_dir = temp_dir + "/" + output_gcs_dir.split("/")[-1] 73 | if not os.path.isdir(out_dir): 74 | os.mkdir(out_dir) 75 | # download each job's output from GCS into the temp dir 76 | for pi in range(num_processes): 77 | if not os.path.isdir(f"{out_dir}/job{pi}"): 78 | os.mkdir(f"{out_dir}/job{pi}") 79 | download_from_gcs( 80 | f"{output_gcs_dir}/job{pi}/scores_f0c0.h5", 81 | f"{out_dir}/job{pi}/scores_f0c0.h5", 82 | ) 83 | print(f"Done downloading {pi} partition") 84 | # download all of the files in the folder 85 | # Use gsutil to copy the contents recursively 86 | # subprocess.check_call(["gsutil", "-m", "cp", "-r", output_gcs_dir, temp_dir]) 87 | print(f"outdir is {out_dir}") 88 | print(f"gcs_out_dir is {output_gcs_dir}") 89 | print(f"Done downloading") 90 | return out_dir 91 | 92 | 93 | def main(): 94 | parser = argparse.ArgumentParser(description="Process and collect h5
files.") 95 | 96 | parser.add_argument( 97 | "out_dir", type=str, help="Output directory for processed data." 98 | ) 99 | parser.add_argument("num_procs", type=int, help="Number of processes to use.") 100 | parser.add_argument("sad_stat", type=str, help="Stats to concatenate, e.g. grads") 101 | parser.add_argument( 102 | "--gcs", 103 | action="store_true", 104 | help="Flag indicating if the file is on Google Cloud Storage.", 105 | ) 106 | 107 | args = parser.parse_args() 108 | if args.gcs: 109 | # download files to tempdir 110 | local_out_dir = download_h5_gcs(args.out_dir, args.num_procs) 111 | collect_h5_borzoi(local_out_dir, args.num_procs, args.sad_stat) 112 | # upload to gcs 113 | print(f"is there such a file? {local_out_dir}/scores_f0c0.h5") 114 | print(os.path.isfile(f"{local_out_dir}/scores_f0c0.h5")) 115 | upload_file_gcs(f"{local_out_dir}/scores_f0c0.h5", args.out_dir) 116 | 117 | 118 | if __name__ == "__main__": 119 | main() 120 | -------------------------------------------------------------------------------- /src/scripts/_archive/borzoi_satg_gene_multi.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2022 Calico LLC 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | 8 | # https://www.apache.org/licenses/LICENSE-2.0 9 | 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ========================================================================= 16 | from optparse import OptionParser 17 | 18 | import os 19 | import pickle 20 | import sys 21 | 22 | import h5py 23 | import numpy as np 24 | 25 | import slurm 26 | 27 | """ 28 | borzoi_satg_gene_multi.py 29 | 30 | Perform a gradient saliency analysis for genes specified in a GTF file, 31 | using multiple processes.
32 | """ 33 | 34 | ################################################################################ 35 | # main 36 | ################################################################################ 37 | def main(): 38 | usage = "usage: %prog [options] " 39 | parser = OptionParser(usage) 40 | 41 | # borzoi_satg_gene.py options 42 | parser.add_option( 43 | "-f", 44 | dest="genome_fasta", 45 | default="%s/assembly/ucsc/hg38.fa" % os.environ["HG38"], 46 | help="Genome FASTA for sequences [Default: %default]", 47 | ) 48 | parser.add_option( 49 | "-o", 50 | dest="out_dir", 51 | default="satg_out", 52 | help="Output directory [Default: %default]", 53 | ) 54 | parser.add_option( 55 | "--rc", 56 | dest="rc", 57 | default=False, 58 | action="store_true", 59 | help="Ensemble forward and reverse complement predictions [Default: %default]", 60 | ) 61 | parser.add_option( 62 | "--shifts", 63 | dest="shifts", 64 | default="0", 65 | type="str", 66 | help="Ensemble prediction shifts [Default: %default]", 67 | ) 68 | parser.add_option( 69 | "--span", 70 | dest="span", 71 | default=False, 72 | action="store_true", 73 | help="Aggregate entire gene span [Default: %default]", 74 | ) 75 | parser.add_option( 76 | "--sum", 77 | dest="sum_targets", 78 | default=False, 79 | action="store_true", 80 | help="Sum targets for single output [Default: %default]", 81 | ) 82 | parser.add_option( 83 | "-t", 84 | dest="targets_file", 85 | default=None, 86 | type="str", 87 | help="File specifying target indexes and labels in table format", 88 | ) 89 | 90 | # _multi.py options 91 | parser.add_option( 92 | "-e", 93 | dest="conda_env", 94 | default="tf28", 95 | help="Anaconda environment [Default: %default]", 96 | ) 97 | parser.add_option( 98 | "--max_proc", 99 | dest="max_proc", 100 | default=None, 101 | type="int", 102 | help="Maximum concurrent processes [Default: %default]", 103 | ) 104 | parser.add_option( 105 | "-n", 106 | dest="name", 107 | default="satg", 108 | help="SLURM job name prefix [Default: %default]", 109 | ) 110 | parser.add_option( 111 | "-p", 112 | dest="processes", 113 | default=None, 114 | type="int", 115 | help="Number of processes, passed by multi script", 116 | ) 117 | parser.add_option( 118 | "-q", 119 | dest="queue", 120 | default="standard", 121 | help="SLURM queue on which to run the jobs [Default: %default]", 122 | ) 123 | parser.add_option( 124 | "-r", 125 | "--restart", 126 | dest="restart", 127 | default=False, 128 | action="store_true", 129 | help="Restart a partially completed job [Default: %default]", 130 | ) 131 | (options, args) = parser.parse_args() 132 | 133 | ####################################################### 134 | # prep work 135 | 136 | # output directory 137 | if not options.restart: 138 | if os.path.isdir(options.out_dir): 139 | print("Please remove %s" % options.out_dir, file=sys.stderr) 140 | exit(1) 141 | os.mkdir(options.out_dir) 142 | 143 | # pickle options 144 | options_pkl_file = "%s/options.pkl" % options.out_dir 145 | options_pkl = open(options_pkl_file, "wb") 146 | pickle.dump(options, options_pkl) 147 | options_pkl.close() 148 | 149 | if options.queue == "standard": 150 | num_gpu = 0 151 | num_cpu = 8 152 | else: 153 | num_gpu = 1 154 | num_cpu = 2 155 | 156 | ####################################################### 157 | # launch worker threads 158 | jobs = [] 159 | for pi in range(options.processes): 160 | if not options.restart or not job_completed(options, pi): 161 | cmd = ". 
/home/drk/anaconda3/etc/profile.d/conda.sh;" 162 | cmd += " conda activate %s;" % options.conda_env 163 | 164 | cmd += " borzoi_satg_gene.py %s %s %d" % ( 165 | options_pkl_file, 166 | " ".join(args), 167 | pi, 168 | ) 169 | name = "%s_p%d" % (options.name, pi) 170 | outf = "%s/job%d.out" % (options.out_dir, pi) 171 | errf = "%s/job%d.err" % (options.out_dir, pi) 172 | j = slurm.Job( 173 | cmd, 174 | name, 175 | outf, 176 | errf, 177 | queue=options.queue, 178 | cpu=num_cpu, 179 | gpu=num_gpu, 180 | mem=120000, 181 | time="14-0:0:0", 182 | ) 183 | jobs.append(j) 184 | 185 | slurm.multi_run( 186 | jobs, max_proc=options.max_proc, verbose=True, launch_sleep=10, update_sleep=60 187 | ) 188 | 189 | ####################################################### 190 | # collect output 191 | 192 | collect_h5(options.out_dir, options.processes, "grads") 193 | 194 | # for pi in range(options.processes): 195 | # shutil.rmtree('%s/job%d' % (options.out_dir,pi)) 196 | 197 | 198 | def collect_h5(out_dir, num_procs, sad_stat): 199 | h5_file = "scores.h5" 200 | 201 | # count sequences 202 | num_seqs = 0 203 | for pi in range(num_procs): 204 | # open job 205 | job_h5_file = "%s/job%d/%s" % (out_dir, pi, h5_file) 206 | job_h5_open = h5py.File(job_h5_file, "r") 207 | num_seqs += job_h5_open[sad_stat].shape[0] 208 | seq_len = job_h5_open[sad_stat].shape[1] 209 | num_targets = job_h5_open[sad_stat].shape[-1] 210 | job_h5_open.close() 211 | 212 | # initialize final h5 213 | final_h5_file = "%s/%s" % (out_dir, h5_file) 214 | final_h5_open = h5py.File(final_h5_file, "w") 215 | 216 | # keep dict for string values 217 | final_strings = {} 218 | 219 | job0_h5_file = "%s/job0/%s" % (out_dir, h5_file) 220 | job0_h5_open = h5py.File(job0_h5_file, "r") 221 | for key in job0_h5_open.keys(): 222 | key_shape = list(job0_h5_open[key].shape) 223 | key_shape[0] = num_seqs 224 | key_shape = tuple(key_shape) 225 | if job0_h5_open[key].dtype.char == "S": 226 | final_strings[key] = [] 227 | else: 228 | final_h5_open.create_dataset( 229 | key, shape=key_shape, dtype=job0_h5_open[key].dtype 230 | ) 231 | 232 | # set values 233 | si = 0 234 | for pi in range(num_procs): 235 | # open job 236 | job_h5_file = "%s/job%d/%s" % (out_dir, pi, h5_file) 237 | job_h5_open = h5py.File(job_h5_file, "r") 238 | 239 | # append to final 240 | for key in job_h5_open.keys(): 241 | job_seqs = job_h5_open[key].shape[0] 242 | if job_h5_open[key].dtype.char == "S": 243 | final_strings[key] += list(job_h5_open[key]) 244 | else: 245 | final_h5_open[key][si : si + job_seqs] = job_h5_open[key] 246 | 247 | job_h5_open.close() 248 | si += job_seqs 249 | 250 | # create final string datasets 251 | for key in final_strings: 252 | final_h5_open.create_dataset(key, data=np.array(final_strings[key], dtype="S")) 253 | 254 | final_h5_open.close() 255 | 256 | 257 | def job_completed(options, pi): 258 | """Check whether a specific job has generated its 259 | output file.""" 260 | out_file = "%s/job%d/scores.h5" % (options.out_dir, pi) 261 | valid_file = True 262 | if not os.path.isfile(out_file): 263 | valid_file = False 264 | else: 265 | try: 266 | out_open = h5py.File(out_file, "r") 267 | except OSError: 268 | valid_file = False 269 | return valid_file 270 | 271 | 272 | ################################################################################ 273 | # __main__ 274 | ################################################################################ 275 | if __name__ == "__main__": 276 | main() 277 | 
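The _multi driver above pickles its parsed options to options.pkl in the output directory and launches one borzoi_satg_gene.py worker per process, appending that worker's process index as the final command-line argument. A minimal sketch, assuming this positional layout (inferred from the "borzoi_satg_gene.py %s %s %d" command string built in main() above; borzoi_satg_gene.py itself is not shown in this dump), of how a worker could recover its state:

import pickle
import sys

def load_worker_state(argv):
    # Positional layout is an assumption read off the driver's command string:
    # options.pkl first, the original positional arguments in the middle,
    # and the worker's process index (pi) last.
    options_pkl_file = argv[0]
    worker_args = argv[1:-1]
    worker_index = int(argv[-1])
    with open(options_pkl_file, "rb") as options_pkl:
        options = pickle.load(options_pkl)
    return options, worker_args, worker_index

if __name__ == "__main__":
    options, worker_args, worker_index = load_worker_state(sys.argv[1:])
    print("worker %d options: %s" % (worker_index, options))

Each worker then writes its own job{pi}/scores.h5, which collect_h5 above concatenates along the sequence axis into a single scores.h5.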
-------------------------------------------------------------------------------- /src/scripts/borzoi_bench_trip_folds.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2019 Calico LLC 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | 8 | # https://www.apache.org/licenses/LICENSE-2.0 9 | 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ========================================================================= 16 | from optparse import OptionParser, OptionGroup 17 | import os 18 | 19 | import slurm 20 | 21 | """ 22 | borzoi_bench_trip_folds.py 23 | 24 | Benchmark Borzoi model replicates on the TRIP prediction task. 25 | """ 26 | 27 | ################################################################################ 28 | # main 29 | ################################################################################ 30 | def main(): 31 | usage = "usage: %prog [options] " 32 | parser = OptionParser(usage) 33 | 34 | # trip 35 | trip_options = OptionGroup(parser, "borzoi_trip.py options") 36 | trip_options.add_option( 37 | "-f", 38 | dest="genome_fasta", 39 | default="%s/assembly/ucsc/hg38.fa" % os.environ.get('BORZOI_HG38', 'hg38'), 40 | help="Genome FASTA for sequences [Default: %default]", 41 | ) 42 | trip_options.add_option( 43 | "-o", 44 | dest="out_dir", 45 | default="trip", 46 | help="Output directory for tables and plots [Default: %default]", 47 | ) 48 | trip_options.add_option( 49 | "--site", 50 | dest="site", 51 | default=False, 52 | action="store_true", 53 | help="Return the insertion site without the promoter [Default: %default]", 54 | ) 55 | trip_options.add_option( 56 | "--reporter", 57 | dest="reporter", 58 | default=False, 59 | action="store_true", 60 | help="Insert the flanking piggyback reporter with the promoter [Default: %default]", 61 | ) 62 | trip_options.add_option( 63 | "--reporter_bare", 64 | dest="reporter_bare", 65 | default=False, 66 | action="store_true", 67 | help="Insert the flanking piggyback reporter with the promoter (no terminal repeats) [Default: %default]", 68 | ) 69 | trip_options.add_option( 70 | "--rc", 71 | dest="rc", 72 | default=False, 73 | action="store_true", 74 | help="Average forward and reverse complement predictions [Default: %default]", 75 | ) 76 | trip_options.add_option( 77 | "--shifts", 78 | dest="shifts", 79 | default="0", 80 | type="str", 81 | help="Ensemble prediction shifts [Default: %default]", 82 | ) 83 | trip_options.add_option( 84 | "-t", 85 | dest="targets_file", 86 | default=None, 87 | type="str", 88 | help="File specifying target indexes and labels in table format", 89 | ) 90 | parser.add_option_group(trip_options) 91 | 92 | # cross-fold 93 | fold_options = OptionGroup(parser, "cross-fold options") 94 | fold_options.add_option( 95 | "-c", 96 | dest="crosses", 97 | default=1, 98 | type="int", 99 | help="Number of cross-fold rounds [Default: %default]", 100 | ) 101 | fold_options.add_option( 102 | "--folds", 103 | dest="fold_subset", 104 | default=1, 105 | type="int", 106 | help="Run a subset of folds [Default: %default]", 107 | ) 108
| fold_options.add_option( 109 | "--f_list", 110 | dest="fold_subset_list", 111 | default=None, 112 | help="Run a subset of folds (encoded as comma-separated string) [Default:%default]", 113 | ) 114 | fold_options.add_option( 115 | "-d", 116 | dest="data_head", 117 | default=None, 118 | type="int", 119 | help="Index for dataset/head [Default: %default]", 120 | ) 121 | fold_options.add_option( 122 | "-e", 123 | dest="conda_env", 124 | default="tf210", 125 | help="Anaconda environment [Default: %default]", 126 | ) 127 | fold_options.add_option( 128 | "--name", 129 | dest="name", 130 | default="trip", 131 | help="SLURM name prefix [Default: %default]", 132 | ) 133 | fold_options.add_option( 134 | "--max_proc", 135 | dest="max_proc", 136 | default=None, 137 | type="int", 138 | help="Maximum concurrent processes [Default: %default]", 139 | ) 140 | fold_options.add_option( 141 | "-q", 142 | dest="queue", 143 | default="geforce", 144 | help="SLURM queue on which to run the jobs [Default: %default]", 145 | ) 146 | fold_options.add_option( 147 | "-r", 148 | dest="restart", 149 | default=False, 150 | action="store_true", 151 | help="Restart a partially completed job [Default: %default]", 152 | ) 153 | parser.add_option_group(fold_options) 154 | 155 | (options, args) = parser.parse_args() 156 | 157 | if len(args) != 4: 158 | print(len(args)) 159 | print(args) 160 | parser.error( 161 | "Must provide parameters file, cross-fold directory, TRIP promoter sequences, and TRIP insertion sites" 162 | ) 163 | else: 164 | params_file = args[0] 165 | exp_dir = args[1] 166 | promoters_file = args[2] 167 | insertions_file = args[3] 168 | 169 | ####################################################### 170 | # prep work 171 | 172 | # set folds 173 | num_folds = 1 174 | if options.fold_subset is not None: 175 | num_folds = options.fold_subset 176 | 177 | fold_index = [fold_i for fold_i in range(num_folds)] 178 | 179 | # subset folds (list) 180 | if options.fold_subset_list is not None: 181 | fold_index = [int(fold_str) for fold_str in options.fold_subset_list.split(",")] 182 | 183 | ################################################################ 184 | # TRIP prediction jobs 185 | 186 | # command base 187 | cmd_base = ('. 
%s; ' % os.environ['BORZOI_CONDA']) if 'BORZOI_CONDA' in os.environ else '' 188 | cmd_base += "conda activate %s;" % options.conda_env 189 | cmd_base += " echo $HOSTNAME;" 190 | 191 | jobs = [] 192 | 193 | for ci in range(options.crosses): 194 | for fi in fold_index: 195 | it_dir = "%s/f%dc%d" % (exp_dir, fi, ci) 196 | name = "%s-f%dc%d" % (options.name, fi, ci) 197 | 198 | # update output directory 199 | it_out_dir = "%s/%s" % (it_dir, options.out_dir) 200 | os.makedirs(it_out_dir, exist_ok=True) 201 | 202 | model_file = "%s/train/model_best.h5" % it_dir 203 | if options.data_head is not None: 204 | model_file = "%s/train/model%d_best.h5" % (it_dir, options.data_head) 205 | 206 | cmd_fold = "%s time borzoi_trip.py %s %s %s %s" % ( 207 | cmd_base, 208 | params_file, 209 | model_file, 210 | promoters_file, 211 | insertions_file, 212 | ) 213 | 214 | # TRIP job 215 | job_out_dir = it_out_dir 216 | if not options.restart or not os.path.isfile("%s/preds.h5" % job_out_dir): 217 | cmd_job = cmd_fold 218 | cmd_job += " %s" % options_string(options, trip_options, job_out_dir) 219 | j = slurm.Job( 220 | cmd_job, 221 | name, 222 | "%s.out" % job_out_dir, 223 | "%s.err" % job_out_dir, 224 | queue=options.queue, 225 | gpu=1, 226 | mem=60000, 227 | time="7-0:0:0", 228 | ) 229 | jobs.append(j) 230 | 231 | slurm.multi_run( 232 | jobs, max_proc=options.max_proc, verbose=True, launch_sleep=10, update_sleep=60 233 | ) 234 | 235 | 236 | def options_string(options, group_options, rep_dir): 237 | options_str = "" 238 | 239 | for opt in group_options.option_list: 240 | opt_str = opt.get_opt_string() 241 | opt_value = options.__dict__[opt.dest] 242 | 243 | # wrap asterisks in "" 244 | if type(opt_value) == str and opt_value.find("*") != -1: 245 | opt_value = '"%s"' % opt_value 246 | 247 | # no value for bools 248 | elif type(opt_value) == bool: 249 | if not opt_value: 250 | opt_str = "" 251 | opt_value = "" 252 | 253 | # skip Nones 254 | elif opt_value is None: 255 | opt_str = "" 256 | opt_value = "" 257 | 258 | # modify 259 | elif opt.dest == "out_dir": 260 | opt_value = rep_dir 261 | 262 | options_str += " %s %s" % (opt_str, opt_value) 263 | 264 | return options_str 265 | 266 | 267 | ################################################################################ 268 | # __main__ 269 | ################################################################################ 270 | if __name__ == "__main__": 271 | main() 272 | -------------------------------------------------------------------------------- /src/scripts/borzoi_sad_folds.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2019 Calico LLC 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | 8 | # https://www.apache.org/licenses/LICENSE-2.0 9 | 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # ========================================================================= 16 | from optparse import OptionParser, OptionGroup 17 | import glob 18 | import h5py 19 | import json 20 | import pdb 21 | import os 22 | import sys 23 | 24 | import numpy as np 25 | import pandas as pd 26 | 27 | import slurm 28 | 29 | """ 30 | borzoi_sad_folds.py 31 | 32 | Compute SAD scores across model folds. 33 | """ 34 | 35 | ################################################################################ 36 | # main 37 | ################################################################################ 38 | def main(): 39 | usage = 'usage: %prog [options] <params_file> <exp_dir>' 40 | parser = OptionParser(usage) 41 | 42 | # sad 43 | sad_options = OptionGroup(parser, 'borzoi_sad.py options') 44 | sad_options.add_option( 45 | '-f', 46 | dest='genome_fasta', 47 | default='%s/assembly/ucsc/hg38.fa' % os.environ.get('BORZOI_HG38', 'hg38'), 48 | help='Genome FASTA for sequences [Default: %default]', 49 | ) 50 | sad_options.add_option( 51 | '-o', 52 | dest='out_dir', 53 | default='sad', 54 | help='Output directory for tables and plots [Default: %default]' 55 | ) 56 | sad_options.add_option( 57 | '-p', 58 | dest='processes', 59 | default=None, 60 | type='int', 61 | help='Number of processes, passed by multi script' 62 | ) 63 | sad_options.add_option( 64 | '--rc', 65 | dest='rc', 66 | default=False, 67 | action='store_true', 68 | help='Average forward and reverse complement predictions [Default: %default]' 69 | ) 70 | sad_options.add_option( 71 | '--shifts', dest='shifts', 72 | default='0', 73 | type='str', 74 | help='Ensemble prediction shifts [Default: %default]' 75 | ) 76 | sad_options.add_option( 77 | '--stats', 78 | dest='sad_stats', 79 | default='SAD', 80 | help='Comma-separated list of stats to save. 
[Default: %default]' 81 | ) 82 | sad_options.add_option( 83 | '-t', 84 | dest='targets_file', 85 | default=None, 86 | type='str', 87 | help='File specifying target indexes and labels in table format' 88 | ) 89 | sad_options.add_option( 90 | '-u', 91 | dest='untransform_old', 92 | default=False, 93 | action='store_true', 94 | ) 95 | sad_options.add_option( 96 | '--no_untransform', 97 | dest='no_untransform', 98 | default=False, 99 | action='store_true', 100 | ) 101 | parser.add_option_group(sad_options) 102 | 103 | # cross-fold 104 | fold_options = OptionGroup(parser, 'cross-fold options') 105 | fold_options.add_option( 106 | '-c', 107 | dest='crosses', 108 | default=1, 109 | type='int', 110 | help='Number of cross-fold rounds [Default:%default]', 111 | ) 112 | fold_options.add_option( 113 | '--folds', 114 | dest='fold_subset', 115 | default=1, 116 | type='int', 117 | help='Run a subset of folds [Default:%default]', 118 | ) 119 | fold_options.add_option( 120 | '--f_list', 121 | dest='fold_subset_list', 122 | default=None, 123 | help='Run a subset of folds (encoded as comma-separated string) [Default:%default]', 124 | ) 125 | fold_options.add_option( 126 | '-d', 127 | dest='data_head', 128 | default=None, 129 | type='int', 130 | help='Index for dataset/head [Default: %default]' 131 | ) 132 | fold_options.add_option( 133 | '-e', 134 | dest='conda_env', 135 | default='tf210', 136 | help='Anaconda environment [Default: %default]' 137 | ) 138 | fold_options.add_option( 139 | '--name', 140 | dest='name', 141 | default='sad', 142 | help='SLURM name prefix [Default: %default]' 143 | ) 144 | fold_options.add_option( 145 | '--max_proc', 146 | dest='max_proc', 147 | default=None, 148 | type='int', 149 | help='Maximum concurrent processes [Default: %default]' 150 | ) 151 | fold_options.add_option( 152 | '-q', 153 | dest='queue', 154 | default='geforce', 155 | help='SLURM queue on which to run the jobs [Default: %default]' 156 | ) 157 | fold_options.add_option( 158 | '-r', 159 | dest='restart', 160 | default=False, 161 | action='store_true', 162 | help='Restart a partially completed job [Default: %default]' 163 | ) 164 | fold_options.add_option( 165 | '--vcf', 166 | dest='vcf_file', 167 | default='/home/jlinder/seqnn/data/satmutmpra/satmutmpra_v1.vcf' 168 | ) 169 | parser.add_option_group(fold_options) 170 | 171 | (options, args) = parser.parse_args() 172 | 173 | if len(args) != 2: 174 | parser.error('Must provide parameters file and cross-fold directory') 175 | else: 176 | params_file = args[0] 177 | exp_dir = args[1] 178 | 179 | ####################################################### 180 | # prep work 181 | 182 | # set folds 183 | num_folds = 1 184 | if options.fold_subset is not None: 185 | num_folds = options.fold_subset 186 | 187 | fold_index = [fold_i for fold_i in range(num_folds)] 188 | 189 | # subset folds (list) 190 | if options.fold_subset_list is not None: 191 | fold_index = [int(fold_str) for fold_str in options.fold_subset_list.split(",")] 192 | 193 | ################################################################ 194 | # SNP scores 195 | 196 | # command base 197 | cmd_base = ('. 
%s; ' % os.environ['BORZOI_CONDA']) if 'BORZOI_CONDA' in os.environ else '' 198 | cmd_base += 'conda activate %s;' % options.conda_env 199 | cmd_base += ' echo $HOSTNAME;' 200 | 201 | jobs = [] 202 | 203 | for ci in range(options.crosses): 204 | for fi in fold_index: 205 | it_dir = '%s/f%dc%d' % (exp_dir, fi, ci) 206 | name = '%s-f%dc%d' % (options.name, fi, ci) 207 | 208 | # update output directory 209 | it_out_dir = '%s/%s' % (it_dir, options.out_dir) 210 | os.makedirs(it_out_dir, exist_ok=True) 211 | 212 | model_file = '%s/train/model_best.h5' % it_dir 213 | if options.data_head is not None: 214 | model_file = '%s/train/model%d_best.h5' % (it_dir, options.data_head) 215 | 216 | cmd_fold = '%s time borzoi_sad.py %s %s' % (cmd_base, params_file, model_file) 217 | 218 | # variant scoring job 219 | job_out_dir = it_out_dir 220 | if not options.restart or not os.path.isfile('%s/sad.h5'%job_out_dir): 221 | cmd_job = '%s %s' % (cmd_fold, options.vcf_file) 222 | cmd_job += ' %s' % options_string(options, sad_options, job_out_dir) 223 | j = slurm.Job(cmd_job, '%s' % name, 224 | '%s.out'%job_out_dir, '%s.err'%job_out_dir, '%s.sb'%job_out_dir, 225 | queue=options.queue, gpu=1, 226 | mem=45000, time='30-0:0:0') 227 | jobs.append(j) 228 | 229 | slurm.multi_run(jobs, max_proc=options.max_proc, verbose=True, 230 | launch_sleep=10, update_sleep=60) 231 | 232 | def options_string(options, group_options, rep_dir): 233 | options_str = '' 234 | 235 | for opt in group_options.option_list: 236 | opt_str = opt.get_opt_string() 237 | opt_value = options.__dict__[opt.dest] 238 | 239 | # wrap asterisks in "" 240 | if type(opt_value) == str and opt_value.find('*') != -1: 241 | opt_value = '"%s"' % opt_value 242 | 243 | # no value for bools 244 | elif type(opt_value) == bool: 245 | if not opt_value: 246 | opt_str = '' 247 | opt_value = '' 248 | 249 | # skip Nones 250 | elif opt_value is None: 251 | opt_str = '' 252 | opt_value = '' 253 | 254 | # modify 255 | elif opt.dest == 'out_dir': 256 | opt_value = rep_dir 257 | 258 | options_str += ' %s %s' % (opt_str, opt_value) 259 | 260 | return options_str 261 | 262 | ################################################################################ 263 | # __main__ 264 | ################################################################################ 265 | if __name__ == '__main__': 266 | main() 267 | -------------------------------------------------------------------------------- /src/scripts/borzoi_sed_folds.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2019 Calico LLC 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | 8 | # https://www.apache.org/licenses/LICENSE-2.0 9 | 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # ========================================================================= 16 | from optparse import OptionParser, OptionGroup 17 | import glob 18 | import h5py 19 | import json 20 | import pdb 21 | import os 22 | import sys 23 | 24 | import numpy as np 25 | import pandas as pd 26 | 27 | import slurm 28 | 29 | """ 30 | borzoi_sed_folds.py 31 | 32 | Compute SED scores across model folds. 33 | """ 34 | 35 | ################################################################################ 36 | # main 37 | ################################################################################ 38 | def main(): 39 | usage = 'usage: %prog [options] <params_file> <exp_dir>' 40 | parser = OptionParser(usage) 41 | 42 | # sed 43 | sed_options = OptionGroup(parser, 'borzoi_sed.py options') 44 | sed_options.add_option( 45 | '-f', 46 | dest='genome_fasta', 47 | default='%s/assembly/ucsc/hg38.fa' % os.environ.get('BORZOI_HG38', 'hg38'), 48 | help='Genome FASTA for sequences [Default: %default]', 49 | ) 50 | sed_options.add_option( 51 | '-g', 52 | dest='genes_gtf', 53 | default='%s/genes/gencode41/gencode41_basic_nort.gtf' % os.environ.get('BORZOI_HG38', 'hg38'), 54 | help='GTF for gene definition [Default: %default]', 55 | ) 56 | sed_options.add_option( 57 | '-o', 58 | dest='out_dir', 59 | default='sed', 60 | help='Output directory for tables and plots [Default: %default]', 61 | ) 62 | sed_options.add_option( 63 | '-p', 64 | dest='processes', 65 | default=None, 66 | type='int', 67 | help='Number of processes, passed by multi script', 68 | ) 69 | sed_options.add_option( 70 | '--rc', 71 | dest='rc', 72 | default=False, 73 | action='store_true', 74 | help='Average forward and reverse complement predictions [Default: %default]', 75 | ) 76 | sed_options.add_option( 77 | '--shifts', 78 | dest='shifts', 79 | default='0', 80 | type='str', 81 | help='Ensemble prediction shifts [Default: %default]', 82 | ) 83 | sed_options.add_option( 84 | '--span', 85 | dest='span', 86 | default=False, 87 | action='store_true', 88 | help='Aggregate entire gene span [Default: %default]', 89 | ) 90 | sed_options.add_option( 91 | '--stats', 92 | dest='sed_stats', 93 | default='SED', 94 | help='Comma-separated list of stats to save. 
[Default: %default]', 95 | ) 96 | sed_options.add_option( 97 | '-t', 98 | dest='targets_file', 99 | default=None, 100 | type='str', 101 | help='File specifying target indexes and labels in table format', 102 | ) 103 | sed_options.add_option( 104 | '-u', 105 | dest='untransform_old', 106 | default=False, 107 | action='store_true', 108 | ) 109 | sed_options.add_option( 110 | '--no_untransform', 111 | dest='no_untransform', 112 | default=False, 113 | action='store_true', 114 | ) 115 | parser.add_option_group(sed_options) 116 | 117 | # cross-fold 118 | fold_options = OptionGroup(parser, 'cross-fold options') 119 | fold_options.add_option( 120 | '-c', 121 | dest='crosses', 122 | default=1, 123 | type='int', 124 | help='Number of cross-fold rounds [Default:%default]', 125 | ) 126 | fold_options.add_option( 127 | '--folds', 128 | dest='fold_subset', 129 | default=1, 130 | type='int', 131 | help='Run a subset of folds [Default:%default]', 132 | ) 133 | fold_options.add_option( 134 | '--f_list', 135 | dest='fold_subset_list', 136 | default=None, 137 | help='Run a subset of folds (encoded as comma-separated string) [Default:%default]', 138 | ) 139 | fold_options.add_option( 140 | '-d', 141 | dest='data_head', 142 | default=None, 143 | type='int', 144 | help='Index for dataset/head [Default: %default]', 145 | ) 146 | fold_options.add_option( 147 | '-e', 148 | dest='conda_env', 149 | default='tf210', 150 | help='Anaconda environment [Default: %default]', 151 | ) 152 | fold_options.add_option( 153 | '--name', 154 | dest='name', 155 | default='sed', 156 | help='SLURM name prefix [Default: %default]', 157 | ) 158 | fold_options.add_option( 159 | '--max_proc', 160 | dest='max_proc', 161 | default=None, 162 | type='int', 163 | help='Maximum concurrent processes [Default: %default]', 164 | ) 165 | fold_options.add_option( 166 | '-q', 167 | dest='queue', 168 | default='geforce', 169 | help='SLURM queue on which to run the jobs [Default: %default]', 170 | ) 171 | fold_options.add_option( 172 | '-r', 173 | dest='restart', 174 | default=False, 175 | action='store_true', 176 | help='Restart a partially completed job [Default: %default]', 177 | ) 178 | fold_options.add_option( 179 | '--vcf', 180 | dest='vcf_file', 181 | default='/home/drk/seqnn/data/gtex_fine/susie_pip90/pos_merge.vcf', 182 | ) 183 | parser.add_option_group(fold_options) 184 | 185 | (options, args) = parser.parse_args() 186 | 187 | if len(args) != 2: 188 | parser.error('Must provide parameters file and cross-fold directory') 189 | else: 190 | params_file = args[0] 191 | exp_dir = args[1] 192 | 193 | ####################################################### 194 | # prep work 195 | 196 | # set folds 197 | num_folds = 1 198 | if options.fold_subset is not None: 199 | num_folds = options.fold_subset 200 | 201 | fold_index = [fold_i for fold_i in range(num_folds)] 202 | 203 | # subset folds (list) 204 | if options.fold_subset_list is not None: 205 | fold_index = [int(fold_str) for fold_str in options.fold_subset_list.split(",")] 206 | 207 | ################################################################ 208 | # SNP scores 209 | 210 | # command base 211 | cmd_base = ('. 
%s; ' % os.environ['BORZOI_CONDA']) if 'BORZOI_CONDA' in os.environ else '' 212 | cmd_base += 'conda activate %s;' % options.conda_env 213 | cmd_base += ' echo $HOSTNAME;' 214 | 215 | jobs = [] 216 | 217 | for ci in range(options.crosses): 218 | for fi in fold_index: 219 | it_dir = '%s/f%dc%d' % (exp_dir, fi, ci) 220 | name = '%s-f%dc%d' % (options.name, fi, ci) 221 | 222 | # update output directory 223 | it_out_dir = '%s/%s' % (it_dir, options.out_dir) 224 | os.makedirs(it_out_dir, exist_ok=True) 225 | 226 | model_file = '%s/train/model_best.h5' % it_dir 227 | if options.data_head is not None: 228 | model_file = '%s/train/model%d_best.h5' % (it_dir, options.data_head) 229 | 230 | cmd_fold = '%s time borzoi_sed.py %s %s' % (cmd_base, params_file, model_file) 231 | 232 | # variant scoring job 233 | job_out_dir = it_out_dir 234 | if not options.restart or not os.path.isfile('%s/sed.h5'%job_out_dir): 235 | cmd_job = '%s %s' % (cmd_fold, options.vcf_file) 236 | cmd_job += ' %s' % options_string(options, sed_options, job_out_dir) 237 | j = slurm.Job(cmd_job, '%s' % name, 238 | '%s.out'%job_out_dir, '%s.err'%job_out_dir, '%s.sb'%job_out_dir, 239 | queue=options.queue, gpu=1, 240 | mem=60000, time='30-0:0:0') 241 | jobs.append(j) 242 | 243 | slurm.multi_run(jobs, max_proc=options.max_proc, verbose=True, 244 | launch_sleep=10, update_sleep=60) 245 | 246 | def options_string(options, group_options, rep_dir): 247 | options_str = '' 248 | 249 | for opt in group_options.option_list: 250 | opt_str = opt.get_opt_string() 251 | opt_value = options.__dict__[opt.dest] 252 | 253 | # wrap asterisks in "" 254 | if type(opt_value) == str and opt_value.find('*') != -1: 255 | opt_value = '"%s"' % opt_value 256 | 257 | # no value for bools 258 | elif type(opt_value) == bool: 259 | if not opt_value: 260 | opt_str = '' 261 | opt_value = '' 262 | 263 | # skip Nones 264 | elif opt_value is None: 265 | opt_str = '' 266 | opt_value = '' 267 | 268 | # modify 269 | elif opt.dest == 'out_dir': 270 | opt_value = rep_dir 271 | 272 | options_str += ' %s %s' % (opt_str, opt_value) 273 | 274 | return options_str 275 | 276 | ################################################################################ 277 | # __main__ 278 | ################################################################################ 279 | if __name__ == '__main__': 280 | main() 281 | -------------------------------------------------------------------------------- /src/scripts/borzoi_test_apa_folds.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2019 Calico LLC 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | 8 | # https://www.apache.org/licenses/LICENSE-2.0 9 | 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ========================================================================= 16 | from optparse import OptionParser 17 | import json 18 | import os 19 | 20 | import slurm 21 | 22 | """ 23 | borzoi_test_apa_folds.py 24 | 25 | Measure accuracy at the polyadenylation-site level for multiple model replicates. 
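Example invocation (illustrative paths): borzoi_test_apa_folds.py -f 4 -o models/exp params.json data/apa 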
26 | """ 27 | 28 | ################################################################################ 29 | # main 30 | ################################################################################ 31 | def main(): 32 | usage = "usage: %prog [options] ..." 33 | parser = OptionParser(usage) 34 | parser.add_option( 35 | "-c", 36 | dest="crosses", 37 | default=1, 38 | type="int", 39 | help="Number of cross-fold rounds [Default:%default]", 40 | ) 41 | parser.add_option( 42 | "-d", 43 | dest="dataset_i", 44 | default=None, 45 | type="int", 46 | help="Dataset index [Default:%default]", 47 | ) 48 | parser.add_option( 49 | "-e", 50 | dest="conda_env", 51 | default="tf210", 52 | help="Anaconda environment [Default: %default]", 53 | ) 54 | parser.add_option( 55 | "-f", 56 | dest="fold_subset", 57 | default=None, 58 | type="int", 59 | help="Run a subset of folds [Default:%default]", 60 | ) 61 | parser.add_option( 62 | "--f_list", 63 | dest="fold_subset_list", 64 | default=None, 65 | help="Run a subset of folds (encoded as comma-separated string) [Default:%default]", 66 | ) 67 | parser.add_option( 68 | "-g", 69 | dest="apa_file", 70 | default="%s/genes/polyadb/polyadb_human_v3.csv.gz" % os.environ.get('BORZOI_HG38', 'hg38'), 71 | help="Csv for polya site definition [Default %default]", 72 | ) 73 | parser.add_option( 74 | "--name", 75 | dest="name", 76 | default="teste", 77 | help="SLURM name prefix [Default: %default]", 78 | ) 79 | parser.add_option( 80 | "-o", 81 | dest="exp_dir", 82 | default=None, 83 | help="Output experiment directory [Default: %default]", 84 | ) 85 | parser.add_option( 86 | "-q", 87 | dest="queue", 88 | default="geforce" 89 | ) 90 | parser.add_option( 91 | "--rc", 92 | dest="rc", 93 | default=False, 94 | action="store_true", 95 | help="Average forward and reverse complement predictions [Default: %default]", 96 | ) 97 | parser.add_option( 98 | "--shifts", 99 | dest="shifts", 100 | default="0", 101 | type="str", 102 | help="Ensemble prediction shifts [Default: %default]", 103 | ) 104 | parser.add_option( 105 | "-t", 106 | dest="targets_file", 107 | default=None, 108 | type="str", 109 | help="File specifying target indexes and labels in table format", 110 | ) 111 | parser.add_option( 112 | "-u", 113 | dest="untransform_old", 114 | default=False, 115 | action="store_true", 116 | help="Untransform old models [Default: %default]", 117 | ) 118 | (options, args) = parser.parse_args() 119 | 120 | if len(args) < 2: 121 | parser.error("Must provide parameters file and data directory") 122 | else: 123 | params_file = args[0] 124 | data_dirs = [os.path.abspath(arg) for arg in args[1:]] 125 | 126 | # using -o for required argument for compatibility with the training script 127 | assert options.exp_dir is not None 128 | 129 | # read data parameters 130 | data_stats_file = "%s/statistics.json" % data_dirs[0] 131 | with open(data_stats_file) as data_stats_open: 132 | data_stats = json.load(data_stats_open) 133 | 134 | if options.dataset_i is None: 135 | head_i = 0 136 | else: 137 | head_i = options.dataset_i 138 | 139 | # count folds 140 | num_folds = len([dkey for dkey in data_stats if dkey.startswith("fold")]) 141 | 142 | # subset folds 143 | if options.fold_subset is not None: 144 | num_folds = min(options.fold_subset, num_folds) 145 | 146 | fold_index = [fold_i for fold_i in range(num_folds)] 147 | 148 | # subset folds (list) 149 | if options.fold_subset_list is not None: 150 | fold_index = [int(fold_str) for fold_str in options.fold_subset_list.split(",")] 151 | 152 | if options.queue == 
"standard": 153 | num_cpu = 4 154 | num_gpu = 0 155 | else: 156 | num_cpu = 2 157 | num_gpu = 1 158 | 159 | ################################################################ 160 | # test best 161 | ################################################################ 162 | jobs = [] 163 | 164 | for ci in range(options.crosses): 165 | for fi in fold_index: 166 | it_dir = "%s/f%dc%d" % (options.exp_dir, fi, ci) 167 | 168 | if options.dataset_i is None: 169 | out_dir = "%s/teste" % it_dir 170 | model_file = "%s/train/model_best.h5" % it_dir 171 | else: 172 | out_dir = "%s/teste%d" % (it_dir, options.dataset_i) 173 | model_file = "%s/train/model%d_best.h5" % (it_dir, options.dataset_i) 174 | 175 | # check if done 176 | acc_file = "%s/apa_preds.tsv.gz" % out_dir 177 | if os.path.isfile(acc_file): 178 | # print('%s already generated.' % acc_file) 179 | pass 180 | else: 181 | # evaluate 182 | cmd = ('. %s; ' % os.environ['BORZOI_CONDA']) if 'BORZOI_CONDA' in os.environ else '' 183 | cmd += "conda activate %s;" % options.conda_env 184 | cmd += " time borzoi_test_apa.py" 185 | cmd += " --head %d" % head_i 186 | cmd += " -o %s" % out_dir 187 | if options.rc: 188 | cmd += " --rc" 189 | if options.shifts: 190 | cmd += " --shifts %s" % options.shifts 191 | if options.targets_file is not None: 192 | cmd += " -t %s" % options.targets_file 193 | if options.untransform_old: 194 | cmd += " -u" 195 | cmd += " %s" % params_file 196 | cmd += " %s" % model_file 197 | cmd += " %s/data%d" % (it_dir, head_i) 198 | cmd += " %s" % options.apa_file 199 | 200 | name = "%s-f%dc%d" % (options.name, fi, ci) 201 | j = slurm.Job( 202 | cmd, 203 | name=name, 204 | out_file="%s.out" % out_dir, 205 | err_file="%s.err" % out_dir, 206 | queue=options.queue, 207 | cpu=num_cpu, 208 | gpu=num_gpu, 209 | mem=45000, 210 | time="2-00:00:00", 211 | ) 212 | jobs.append(j) 213 | 214 | slurm.multi_run(jobs, verbose=True) 215 | 216 | 217 | ################################################################################ 218 | # __main__ 219 | ################################################################################ 220 | if __name__ == "__main__": 221 | main() 222 | -------------------------------------------------------------------------------- /src/scripts/borzoi_test_tss_folds.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2019 Calico LLC 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | 8 | # https://www.apache.org/licenses/LICENSE-2.0 9 | 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ========================================================================= 16 | from optparse import OptionParser 17 | import json 18 | import os 19 | 20 | import slurm 21 | 22 | """ 23 | borzoi_test_tss_folds.py 24 | 25 | Measure accuracy at TSS-level for multiple model replicates. 26 | """ 27 | 28 | ################################################################################ 29 | # main 30 | ################################################################################ 31 | def main(): 32 | usage = 'usage: %prog [options] ...' 
33 | parser = OptionParser(usage) 34 | parser.add_option( 35 | '-c', 36 | dest='crosses', 37 | default=1, 38 | type='int', 39 | help='Number of cross-fold rounds [Default:%default]', 40 | ) 41 | parser.add_option( 42 | '-d', 43 | dest='dataset_i', 44 | default=None, 45 | type='int', 46 | help='Dataset index [Default:%default]', 47 | ) 48 | parser.add_option( 49 | '-e', 50 | dest='conda_env', 51 | default='tf210', 52 | help='Anaconda environment [Default: %default]', 53 | ) 54 | parser.add_option( 55 | '-f', 56 | dest='fold_subset', 57 | default=None, 58 | type='int', 59 | help='Run a subset of folds [Default:%default]', 60 | ) 61 | parser.add_option( 62 | '--f_list', 63 | dest='fold_subset_list', 64 | default=None, 65 | help='Run a subset of folds (encoded as comma-separated string) [Default:%default]', 66 | ) 67 | parser.add_option( 68 | '-g', 69 | dest='tss_file', 70 | default='%s/genes/gencode41/gencode41_basic_tss2.bed' % os.environ.get('BORZOI_HG38', 'hg38'), 71 | help='BED file for TSS definition [Default: %default]', 72 | ) 73 | parser.add_option( 74 | '--name', 75 | dest='name', 76 | default='teste', 77 | help='SLURM name prefix [Default: %default]', 78 | ) 79 | parser.add_option( 80 | '-o', 81 | dest='exp_dir', 82 | default=None, 83 | help='Output experiment directory [Default: %default]', 84 | ) 85 | parser.add_option( 86 | '-q', 87 | dest='queue', 88 | default='geforce', 89 | ) 90 | parser.add_option( 91 | '--rc', 92 | dest='rc', 93 | default=False, 94 | action='store_true', 95 | help='Average forward and reverse complement predictions [Default: %default]', 96 | ) 97 | parser.add_option( 98 | '--shifts', 99 | dest='shifts', 100 | default='0', 101 | type='str', 102 | help='Ensemble prediction shifts [Default: %default]', 103 | ) 104 | parser.add_option( 105 | '--windowcov', 106 | dest='windowcov', 107 | default=4, 108 | type='int', 109 | help='Coverage bin window size [Default: %default]', 110 | ) 111 | parser.add_option( 112 | '--maxcov', 113 | dest='maxcov', 114 | default=False, 115 | action='store_true', 116 | help='Store max instead of avg bin value in local window [Default: %default]', 117 | ) 118 | parser.add_option( 119 | '-t', 120 | dest='targets_file', 121 | default=None, 122 | type='str', 123 | help='File specifying target indexes and labels in table format', 124 | ) 125 | parser.add_option( 126 | '-u', 127 | dest='untransform_old', 128 | default=False, 129 | action='store_true', 130 | help='Untransform old models [Default: %default]', 131 | ) 132 | (options, args) = parser.parse_args() 133 | 134 | if len(args) < 2: 135 | parser.error('Must provide parameters file and data directory') 136 | else: 137 | params_file = args[0] 138 | data_dirs = [os.path.abspath(arg) for arg in args[1:]] 139 | 140 | # using -o as a required argument for compatibility with the training script 141 | assert(options.exp_dir is not None) 142 | 143 | # read data parameters 144 | data_stats_file = '%s/statistics.json' % data_dirs[0] 145 | with open(data_stats_file) as data_stats_open: 146 | data_stats = json.load(data_stats_open) 147 | 148 | if options.dataset_i is None: 149 | head_i = 0 150 | else: 151 | head_i = options.dataset_i 152 | 153 | # count folds 154 | num_folds = len([dkey for dkey in data_stats if dkey.startswith("fold")]) 155 | 156 | # subset folds 157 | if options.fold_subset is not None: 158 | num_folds = min(options.fold_subset, num_folds) 159 | 160 | fold_index = [fold_i for fold_i in range(num_folds)] 161 | 162 | # subset folds (list) 163 | if options.fold_subset_list is not None: 164 
| fold_index = [int(fold_str) for fold_str in options.fold_subset_list.split(",")] 165 | 166 | if options.queue == 'standard': 167 | num_cpu = 4 168 | num_gpu = 0 169 | else: 170 | num_cpu = 2 171 | num_gpu = 1 172 | 173 | ################################################################ 174 | # test best 175 | ################################################################ 176 | jobs = [] 177 | 178 | for ci in range(options.crosses): 179 | for fi in fold_index: 180 | it_dir = '%s/f%dc%d' % (options.exp_dir, fi, ci) 181 | 182 | max_str = '' 183 | if options.maxcov: 184 | max_str = 'max' 185 | 186 | windowcov_str = '' 187 | if options.windowcov != 4: 188 | windowcov_str = 'w' + str(options.windowcov) 189 | 190 | if options.dataset_i is None: 191 | out_dir = '%s/testetss%s%s' % (it_dir, max_str, windowcov_str) 192 | model_file = '%s/train/model_best.h5' % it_dir 193 | else: 194 | out_dir = '%s/testetss%s%s%d' % (it_dir, max_str, windowcov_str, options.dataset_i) 195 | model_file = '%s/train/model%d_best.h5' % (it_dir, options.dataset_i) 196 | 197 | # check if done 198 | acc_file = '%s/tss_preds.tsv.gz' % out_dir 199 | if os.path.isfile(acc_file): 200 | # print('%s already generated.' % acc_file) 201 | pass 202 | else: 203 | # evaluate 204 | cmd = ('. %s; ' % os.environ['BORZOI_CONDA']) if 'BORZOI_CONDA' in os.environ else '' 205 | cmd += 'conda activate %s;' % options.conda_env 206 | cmd += ' time borzoi_test_tss.py' 207 | cmd += ' --head %d' % head_i 208 | cmd += ' -o %s' % out_dir 209 | if options.rc: 210 | cmd += ' --rc' 211 | if options.shifts: 212 | cmd += ' --shifts %s' % options.shifts 213 | if options.windowcov != 4: 214 | cmd += ' --windowcov %d' % options.windowcov 215 | if options.maxcov: 216 | cmd += ' --maxcov' 217 | if options.targets_file is not None: 218 | cmd += ' -t %s' % options.targets_file 219 | if options.untransform_old: 220 | cmd += ' -u' 221 | cmd += ' %s' % params_file 222 | cmd += ' %s' % model_file 223 | cmd += ' %s/data%d' % (it_dir, head_i) 224 | cmd += ' %s' % options.tss_file 225 | 226 | name = '%s-f%dc%d' % (options.name, fi, ci) 227 | j = slurm.Job(cmd, 228 | name=name, 229 | out_file='%s.out'%out_dir, 230 | err_file='%s.err'%out_dir, 231 | queue=options.queue, 232 | cpu=num_cpu, gpu=num_gpu, 233 | mem=45000, 234 | time='2-00:00:00') 235 | jobs.append(j) 236 | 237 | slurm.multi_run(jobs, verbose=True) 238 | 239 | ################################################################################ 240 | # __main__ 241 | ################################################################################ 242 | if __name__ == '__main__': 243 | main() 244 | -------------------------------------------------------------------------------- /src/scripts/bw_h5.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import sys 4 | 5 | import h5py 6 | import numpy as np 7 | import pyBigWig 8 | import scipy.interpolate 9 | 10 | ''' 11 | bw_h5.py 12 | 13 | Convert a BigWig to HDF5. 
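Example invocation (illustrative paths): bw_h5.py -i -z sample.bw sample.w5 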
14 | ''' 15 | 16 | ################################################################################ 17 | # main 18 | ################################################################################ 19 | def main(): 20 | usage = 'usage: %prog [options] ' 21 | parser = OptionParser(usage) 22 | parser.add_option('-c', '--chr_strip', dest='chr_strip', 23 | default=False, action='store_true') 24 | parser.add_option('-i', dest='interp_nan', 25 | default=False, action='store_true', 26 | help='Interpolate NaNs [Default: %default]') 27 | parser.add_option('-m', dest='min_norm', 28 | default=False, action='store_true', 29 | help='Normalize the minimum nonzero value to 1 [Default: %default]') 30 | # parser.add_option('--mode_max', dest='mode_norm_max', 31 | # default=10, type='float', 32 | # help='Maximum norm scale value determined by mode [Default: %default]') 33 | parser.add_option('-s', dest='scale', 34 | default=1.0, type='float', 35 | help='Scale all values (e.g. to undo normalization) [Default: %default]') 36 | parser.add_option('-v', dest='verbose', 37 | default=False, action='store_true') 38 | parser.add_option('-z', dest='clip_zero', 39 | default=False, action='store_true', 40 | help='Clip negative values at zero [Default: %default]') 41 | (options,args) = parser.parse_args() 42 | 43 | if len(args) != 2: 44 | parser.error('Must provide input BigWig and output HDF5.') 45 | else: 46 | bw_file = args[0] 47 | hdf5_file = args[1] 48 | 49 | # open files 50 | bw_in = pyBigWig.open(bw_file) 51 | h5_out = h5py.File(hdf5_file, 'w') 52 | 53 | # process chromosomes in length order 54 | chrom_lengths = bw_in.chroms() 55 | chroms = sorted(chrom_lengths.keys()) 56 | length_chroms = [(chrom_lengths[chrm],chrm) for chrm in chroms] 57 | length_chroms = sorted(length_chroms)[::-1] 58 | min_factor = None 59 | 60 | # for each chromosome 61 | for clength, chrom in length_chroms: 62 | if options.verbose: 63 | print(chrom) 64 | 65 | # read values 66 | x = bw_in.values(chrom, 0, chrom_lengths[chrom], numpy=True) 67 | 68 | # scale 69 | if options.scale != 1: 70 | x = x*options.scale 71 | 72 | if options.min_norm: 73 | if min_factor is None: 74 | min_factor = x[x>0].min() 75 | # vals, counts = np.unique(x[x>0], return_counts=True) 76 | # mode_factor = vals[0] 77 | # mode_factor = np.clip(vals[0], 1/options.mode_norm_max, options.mode_norm_max) 78 | print('Min normalization factor: %f' % min_factor, file=sys.stderr) 79 | x /= min_factor 80 | 81 | # interpolate NaN 82 | if options.interp_nan: 83 | x = interp_nan(x) 84 | else: 85 | x = np.nan_to_num(x) 86 | 87 | # clip negative values 88 | if options.clip_zero: 89 | x = np.clip(x, 0, np.inf) 90 | 91 | # clip float16 min/max 92 | x = np.clip(x, np.finfo(np.float16).min, np.finfo(np.float16).max) 93 | 94 | # strip "chr" 95 | if options.chr_strip: 96 | chrom = chrom.replace('chr','') 97 | 98 | # write gzipped into HDF5 99 | x = x.astype('float16') 100 | h5_out.create_dataset(chrom, data=x, dtype='float16', compression='gzip', shuffle=True) 101 | 102 | # close files 103 | h5_out.close() 104 | bw_in.close() 105 | 106 | 107 | def interp_nan(x, kind='linear'): 108 | '''Linearly interpolate to fill NaN.''' 109 | 110 | # pad zeroes 111 | xp = np.zeros(len(x)+2) 112 | xp[1:-1] = x 113 | 114 | # find NaN 115 | x_nan = np.isnan(xp) 116 | 117 | if np.sum(x_nan) == 0: 118 | # unnecessary 119 | return x 120 | 121 | else: 122 | # interpolate 123 | inds = np.arange(len(xp)) 124 | interpolator = scipy.interpolate.interp1d( 125 | inds[~x_nan], 126 | xp[~x_nan], 127 | kind=kind, 128 | 
bounds_error=False) 129 | 130 | loc = np.where(x_nan) 131 | xp[loc] = interpolator(loc) 132 | 133 | # slice off pad 134 | return xp[1:-1] 135 | 136 | ################################################################################ 137 | # __main__ 138 | ################################################################################ 139 | if __name__ == '__main__': 140 | main() 141 | -------------------------------------------------------------------------------- /src/scripts/idx_genome.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import os 4 | import sys 5 | import pyfaidx 6 | 7 | ''' 8 | idx_genome.py 9 | 10 | Create .fai index file for input .fa. 11 | ''' 12 | 13 | ################################################################################ 14 | # main 15 | ################################################################################ 16 | def main(): 17 | usage = 'usage: %prog [options] <genome_fa>' 18 | parser = OptionParser(usage) 19 | (options, args) = parser.parse_args() 20 | 21 | if len(args) != 1: 22 | parser.error('Must provide input fasta file') 23 | else: 24 | genome_fa = args[0] 25 | 26 | pyfaidx.Faidx(genome_fa) 27 | 28 | ################################################################################ 29 | # __main__ 30 | ################################################################################ 31 | if __name__ == '__main__': 32 | main() 33 | -------------------------------------------------------------------------------- /src/scripts/util.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | #import pdb 4 | import operator, os, sys, subprocess, time 5 | 6 | ############################################################ 7 | # util 8 | # 9 | # Helpful methods that are difficult to categorize. 10 | ############################################################ 11 | 12 | ############################################################ 13 | # condorify 14 | ############################################################ 15 | def condorify(cmds): 16 | return ['runCmd -c "%s"' % c for c in cmds] 17 | 18 | ############################################################ 19 | # slurmify 20 | ############################################################ 21 | def slurmify(cmds, mem_mb=None): 22 | if mem_mb != None: 23 | mem_str = '--mem %d' % mem_mb 24 | else: 25 | mem_str = '' 26 | 27 | return ['srun -p general -n 1 %s "%s"' % (mem_str,c) for c in cmds] 28 | 29 | ############################################################ 30 | # exec_par 31 | # 32 | # Execute the commands in the list 'cmds' in parallel, but 33 | # only running 'max_proc' at a time. 
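# Example (illustrative): exec_par(['echo a', 'echo b', 'echo c'], max_proc=2, verbose=True) 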
34 | ############################################################ 35 | def exec_par(cmds, max_proc=None, verbose=False): 36 | total = len(cmds) 37 | finished = 0 38 | running = 0 39 | p = [] 40 | 41 | if max_proc == None: 42 | max_proc = len(cmds) 43 | 44 | if max_proc == 1: 45 | while finished < total: 46 | if verbose: 47 | print(cmds[finished], file=sys.stderr) 48 | op = subprocess.Popen(cmds[finished], shell=True) 49 | os.waitpid(op.pid, 0) 50 | finished += 1 51 | 52 | else: 53 | while finished + running < total: 54 | # launch jobs up to max 55 | while running < max_proc and finished+running < total: 56 | if verbose: 57 | print(cmds[finished+running], file=sys.stderr) 58 | p.append(subprocess.Popen(cmds[finished+running], shell=True)) 59 | # print('Running %d' % p[running].pid) 60 | running += 1 61 | 62 | # are any jobs finished 63 | new_p = [] 64 | for i in range(len(p)): 65 | # print('POLLING', i, p[i].poll()) 66 | if p[i].poll() != None: 67 | running -= 1 68 | finished += 1 69 | else: 70 | new_p.append(p[i]) 71 | 72 | # if none finished, sleep 73 | if len(new_p) == len(p): 74 | time.sleep(1) 75 | p = new_p 76 | 77 | # wait for all to finish 78 | for i in range(len(p)): 79 | p[i].wait() 80 | 81 | ############################################################ 82 | # slurm_par 83 | # 84 | # Execute the commands in the list 'cmds' in parallel on 85 | # SLURM, but only running 'max_proc' at a time. 86 | # 87 | # Doesn't work. Jobs are allocated resources, but won't run. 88 | # Also, I'd have to screen into login nodes, which 89 | # isn't great because I can't get back to them. 90 | ############################################################ 91 | def slurm_par(cmds, max_proc, queue='general', cpu=1, mem=None, out_files=None, err_files=None): 92 | # preprocess cmds 93 | if mem != None: 94 | mem_str = '--mem %d' % mem 95 | else: 96 | mem_str = '' 97 | 98 | if out_files != None: 99 | out_strs = ['-o %s' % of for of in out_files] 100 | else: 101 | out_strs = ['']*len(cmds) 102 | 103 | if err_files != None: 104 | err_strs = ['-e %s' % ef for ef in err_files] 105 | else: 106 | err_strs = ['']*len(cmds) 107 | 108 | slurm_cmds = ['srun -p %s -n %d %s %s %s "%s"' % (queue, cpu, mem_str, out_strs[i], err_strs[i], cmds[i]) for i in range(len(cmds))] 109 | 110 | exec_par(slurm_cmds, max_proc, verbose=True) 111 | 112 | 113 | ############################################################ 114 | # sort_dict 115 | # 116 | # Sort a dict by the values, returning a list of tuples 117 | ############################################################ 118 | def sort_dict(hash, reverse=False): 119 | return sorted(hash.items(), key=operator.itemgetter(1), reverse=reverse) 120 | 121 | -------------------------------------------------------------------------------- /src/scripts/w5_merge.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import os 4 | import sys 5 | 6 | import h5py 7 | import numpy as np 8 | 9 | ''' 10 | w5_merge.py 11 | 12 | Merge wig5 files using a specified summary statistic. 13 | ''' 14 | 15 | ################################################################################ 16 | # main 17 | ################################################################################ 18 | def main(): 19 | usage = 'usage: %prog [options] <out_w5> <in1_w5> <in2_w5> ...' 
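    # Example (illustrative): w5_merge.py -s mean -z merged.w5 rep1.w5 rep2.w5 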
20 | parser = OptionParser(usage) 21 | parser.add_option('-s', dest='sum_stat', 22 | default='sum', help='Summary statistic [Default: %default]') 23 | parser.add_option('-v', dest='verbose', 24 | default=False, action='store_true') 25 | parser.add_option('-w', dest='overwrite', 26 | default=False, action='store_true') 27 | parser.add_option('-z', dest='gzip', 28 | default=False, action='store_true') 29 | (options,args) = parser.parse_args() 30 | 31 | if len(args) < 3: 32 | parser.error('Must provide output and two or more input wig5.') 33 | else: 34 | out_w5_file = args[0] 35 | in_w5_files = args[1:] 36 | 37 | compression_args = {} 38 | if options.gzip: 39 | compression_args['compression'] = 'gzip' 40 | compression_args['shuffle'] = True 41 | 42 | # open input wig5 43 | in_w5_opens = [h5py.File(iwf, 'r') for iwf in in_w5_files] 44 | in_num = len(in_w5_opens) 45 | 46 | # take keys union 47 | in_keys = set() 48 | for in_w5_open in in_w5_opens: 49 | in_keys |= in_w5_open.keys() 50 | 51 | # open output file 52 | if os.path.isfile(out_w5_file) and not options.overwrite: 53 | parser.error('%s exists. Please remove.' % out_w5_file) 54 | out_w5_open = h5py.File(out_w5_file, 'w') 55 | 56 | for out_key in in_keys: 57 | if options.verbose: 58 | print(out_key) 59 | 60 | # initialize array 61 | for i in range(in_num): 62 | if out_key in in_w5_opens[i]: 63 | in_key_len = len(in_w5_opens[i][out_key]) 64 | break 65 | in_key_data = np.zeros((in_num,in_key_len), dtype='float32') 66 | 67 | # read data 68 | for i in range(in_num): 69 | if out_key in in_w5_opens[i]: 70 | in_key_data[i] = np.array(in_w5_opens[i][out_key]) 71 | else: 72 | print('%s missing %s' % (in_w5_files[i], out_key), file=sys.stderr) 73 | 74 | # summarize 75 | if options.sum_stat == 'sum': 76 | out_key_data = in_key_data.sum(axis=0) 77 | 78 | elif options.sum_stat == 'mean': 79 | out_key_data = in_key_data.mean(axis=0) 80 | 81 | elif options.sum_stat == 'geo-mean': 82 | in_key_data_log = np.log(in_key_data) 83 | in_key_data_log_mean = in_key_data_log.mean(axis=0) 84 | out_key_data = np.exp(in_key_data_log_mean) 85 | 86 | elif options.sum_stat == 'sqrt-mean': 87 | in_key_data_sqrt = in_key_data**0.5 88 | in_key_data_sqrt_mean = in_key_data_sqrt.mean(axis=0) 89 | out_key_data = in_key_data_sqrt_mean**2 90 | 91 | else: 92 | parser.error('Cannot identify summary statistic %s' % options.sum_stat) 93 | 94 | # carefully decrease resolution 95 | out_key_data = np.clip(out_key_data, np.finfo(np.float16).min, np.finfo(np.float16).max) 96 | out_key_data = out_key_data.astype('float16') 97 | 98 | # write 99 | out_w5_open.create_dataset(out_key, data=out_key_data, 100 | dtype='float16', **compression_args) 101 | 102 | out_w5_open.close() 103 | 104 | 105 | 106 | ################################################################################ 107 | # __main__ 108 | ################################################################################ 109 | if __name__ == '__main__': 110 | main() 111 | -------------------------------------------------------------------------------- /src/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calico/borzoi/77670d9b0f940ff0a4b2b0edbdff445bbaefc6ed/src/tests/__init__.py -------------------------------------------------------------------------------- /src/tests/test_dummy.py: -------------------------------------------------------------------------------- 1 | def test_dummy(): 2 | pass 3 | 
-------------------------------------------------------------------------------- /tutorials/latest/analyze_sv/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Shift augmentation for improved indel scoring in DNA sequence-based ML models 3 | This repository contains example analyses related to indels, structural variants, and tandem repeats. The manuscript is available here:
4 | 5 | "Shift augmentation for improved indel scoring in DNA sequence-based ML models" - bioRxiv link. 6 | 7 | Contact *drk (at) calicolabs.com* or *anya (at) calicolabs.com* for questions. 8 | 9 | ## Indel / structural variant effect visualization 10 | 11 | Please follow the installation steps on the main page. This code depends on the [baskerville](https://github.com/calico/baskerville.git) library and on plotly. 12 | Install plotly into the working environment: 13 | 14 | ```sh 15 | pip install plotly 16 | ``` 17 | 18 | After you've installed baskerville, download the dependencies for the SV visualization example and run the example script: 19 | 20 | ```sh 21 | bash download_dependencies_SV.sh 22 | bash analyze_indel.sh 23 | ``` 24 | 25 | This will plot one indel/SV provided in .vcf format. The script currently handles only one variant per run, so make sure your .vcf contains exactly one variant. 26 | Interactive plots for each available GTEx tissue and across all GTEx tissues will be put in the specified output directory. 27 | 28 | ## Tandem repeat scoring 29 | 30 | This script will analyze the effect of tandem repeats by reducing and extending the specified short tandem repeat in the reference genome, then performing linear 31 | regression over the log2FC of the gene expression of interest (see the illustrative regression sketch after the STR table below). A tiny STR table (subset of the result obtained in [this paper](https://www.nature.com/articles/s41588-019-0521-9)) 32 | is provided in the data folder. 33 | 34 | ```sh 35 | bash download_dependencies_STR.sh 36 | bash score_STRs.sh 37 | ``` -------------------------------------------------------------------------------- /tutorials/latest/analyze_sv/analyze_indel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python analyze_vcf.py --vcf data/chr6_41897087_SV.vcf \ 4 | --fasta data/hg38.fa \ 5 | --model data/model \ 6 | --params data/params.json \ 7 | --targets data/targets.txt \ 8 | --gencode data/gencode41_basic_exons.bed \ 9 | --output_dir temp \ 10 | --fig_width 1000 11 | -------------------------------------------------------------------------------- /tutorials/latest/analyze_sv/data/STR.csv: -------------------------------------------------------------------------------- 1 | chrom,str.start,str.end,gene,gene.name,num.e,beta,tissue_info,pval,score,str.motif.forward,str.motif.reverse,tissue_list,score_concord,max_tissue,num_tissues,num_motifs,motif_coords_0,start_partial,end_partial,tissues,vcf,repeats 2 | chr1,28250549,28250559,ENSG00000130768,SMPDL3B,4,0.702792765,Adipose-Subcutaneous_0.29_0.01;Esophagus-Mucosa_0.53_0.51;Esophagus-Muscularis_0.24_0.02;Lung_0.70_1.00,1.0100000000000001e-35,1.0,A,T,"['Adipose-Subcutaneous', 'Esophagus-Mucosa', 'Esophagus-Muscularis', 'Lung']",True,Lung,4,11,"[(28250548, 28250549), (28250549, 28250550), (28250550, 28250551), (28250551, 28250552), (28250552, 28250553), (28250553, 28250554), (28250554, 28250555), (28250555, 28250556), (28250556, 28250557), (28250557, 28250558), (28250558, 28250559)]",False,False,"Adipose-Subcutaneous,Esophagus-Mucosa,Esophagus-Muscularis,Lung",chr1_28250548,"7,8,9,10,12,13,14,15,16,17,18,19" 3 | chr10,71984970,71984992,ENSG00000042286,AIFM2,1,-0.444886046,Esophagus-Mucosa_-0.44_1.00,1.19e-13,1.0,ATTT,AAAT,['Esophagus-Mucosa'],True,Esophagus-Mucosa,1,5,"[(71984971, 71984975), (71984975, 71984979), (71984979, 71984983), (71984983, 71984987), (71984987, 71984991)]",True,True,Esophagus-Mucosa,chr10_71984971,"1,2,3,4,6,7,8,9,10,11,12,13" 4 | 
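The regression step referenced in the README above can be made concrete with a minimal sketch: fit a line through predicted log2 fold-changes of gene expression as a function of STR repeat count. This is illustrative only (the full pipeline lives in score_tandem_repeats.py); the repeat counts below mirror the `repeats` column of the STR table, and the log2FC values are hypothetical stand-ins for model scores.

```python
# Minimal, illustrative sketch of the STR regression idea -- not the
# score_tandem_repeats.py implementation. The repeat counts mirror the
# 'repeats' column above; the log2FC values are hypothetical model scores.
import numpy as np
from scipy import stats

repeat_counts = np.array([1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13])
log2_fc = np.array([-0.41, -0.30, -0.22, -0.17, 0.02, 0.10,
                    0.13, 0.25, 0.28, 0.40, 0.41, 0.55])

# the slope quantifies the expression response to repeat expansion/contraction
slope, intercept, r_value, p_value, stderr = stats.linregress(repeat_counts, log2_fc)
print("slope=%.3f r=%.3f p=%.2e" % (slope, r_value, p_value))
```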
-------------------------------------------------------------------------------- /tutorials/latest/analyze_sv/data/chr6_41897087_SV.vcf: -------------------------------------------------------------------------------- 1 | chr6 41897088 chr6_41897088_GTTGGAGGTTGCAGTGAGCTGAGATCGTGCCACAGCACTCCAGCCTGGCAACGGAGTGAGACTCCATCTCAAAAAAAAAAAAAAAAAAAAAAAAGTGTCGCCTGGAAAGGCCTAGGGATCTCTGAGACCCTTTGGGCTGGGGGGATAGTGGGGTGCCTGAGATCAAAACGATTTTCCTAATAATACTGAGACATATCTGCATTGTCACTGTGATGATATTTGCACAATGATACAAAAGTAGCAATGGGTAAAACTGCTGCCTTAGCACAAATCAAGGCAACTGCACCAAGTTGTGCTAGAGGTCAAGGTATTCTTCACTGCTACAGTAAAAAAACACCTGTTTCAGGCCGGATGGGTGCAGTGGCTCACACCTGTAATCCCAACACTTTGGGAGGCCAAGGCAGGTGGATCACTTGAGGTCAGGAATTCGAGACCAGCCTGGCCAACATGGTGAAACCCCTCTCTCTACTAAAAATACAGAAATTAGCTGGGCGTGGTGGCACGCACCTGTAATCCCAGCTACTCGGGAAGCTGAGGCA_G_b38 GTTGGAGGTTGCAGTGAGCTGAGATCGTGCCACAGCACTCCAGCCTGGCAACGGAGTGAGACTCCATCTCAAAAAAAAAAAAAAAAAAAAAAAAGTGTCGCCTGGAAAGGCCTAGGGATCTCTGAGACCCTTTGGGCTGGGGGGATAGTGGGGTGCCTGAGATCAAAACGATTTTCCTAATAATACTGAGACATATCTGCATTGTCACTGTGATGATATTTGCACAATGATACAAAAGTAGCAATGGGTAAAACTGCTGCCTTAGCACAAATCAAGGCAACTGCACCAAGTTGTGCTAGAGGTCAAGGTATTCTTCACTGCTACAGTAAAAAAACACCTGTTTCAGGCCGGATGGGTGCAGTGGCTCACACCTGTAATCCCAACACTTTGGGAGGCCAAGGCAGGTGGATCACTTGAGGTCAGGAATTCGAGACCAGCCTGGCCAACATGGTGAAACCCCTCTCTCTACTAAAAATACAGAAATTAGCTGGGCGTGGTGGCACGCACCTGTAATCCCAGCTACTCGGGAAGCTGAGGCA G . . 2 | -------------------------------------------------------------------------------- /tutorials/latest/analyze_sv/download_dependencies_STR.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # create additional folder in borzoi data folders 4 | mkdir -p "data/model" 5 | mkdir -p "data/model/f0" 6 | mkdir -p "data/model/f1" 7 | mkdir -p "data/model/f2" 8 | mkdir -p "data/model/f3" 9 | 10 | # download dependencies and the model 11 | if [ -f "data/hg19.fa" ]; then 12 | echo "hg19.fa already exists." 13 | else 14 | wget -O - "ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/bigZips/hg19.fa.gz" | gunzip -c > "data/hg19.fa" 15 | fi 16 | 17 | if [ -f "data/gencode41_lift37_exons.bed" ]; then 18 | echo "gencode41_lift37_exons.bed already exists." 19 | else 20 | wget -O - "https://storage.googleapis.com/seqnn-share/helper/gencode41_lift37_exons.bed.gz" | gunzip -c > "data/gencode41_lift37_exons.bed" 21 | fi 22 | 23 | if [ -f "data/model/f0/model0_best.h5" ]; then 24 | echo "f0/model0_best.h5 already exists." 25 | else 26 | wget "https://storage.googleapis.com/seqnn-share/borzoi/f0/model0_best.h5" -O "data/model/f0/model0_best.h5" 27 | fi 28 | 29 | if [ -f "data/model/f1/model0_best.h5" ]; then 30 | echo "f1/model0_best.h5 already exists." 31 | else 32 | wget "https://storage.googleapis.com/seqnn-share/borzoi/f1/model0_best.h5" -O "data/model/f1/model0_best.h5" 33 | fi 34 | 35 | if [ -f "data/model/f2/model0_best.h5" ]; then 36 | echo "f2/model0_best.h5 already exists." 37 | else 38 | wget "https://storage.googleapis.com/seqnn-share/borzoi/f2/model0_best.h5" -O "data/model/f2/model0_best.h5" 39 | fi 40 | 41 | if [ -f "data/model/f3/model0_best.h5" ]; then 42 | echo "f3/model0_best.h5 already exists." 43 | else 44 | wget "https://storage.googleapis.com/seqnn-share/borzoi/f3/model0_best.h5" -O "data/model/f3/model0_best.h5" 45 | fi 46 | 47 | if [ -f "data/targets.txt" ]; then 48 | echo "targets.txt already exists." 49 | else 50 | wget "https://storage.googleapis.com/seqnn-share/borzoi/hg38/targets.txt" -O "data/targets.txt" 51 | fi 52 | 53 | if [ -f "data/params.json" ]; then 54 | echo "params.json already exists." 
55 | else 56 | wget "https://storage.googleapis.com/seqnn-share/borzoi/params.json" -O "data/params.json" 57 | fi 58 | -------------------------------------------------------------------------------- /tutorials/latest/analyze_sv/download_dependencies_SV.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # create additional folder in borzoi data folders 4 | mkdir -p "data/model" 5 | mkdir -p "data/model/f0" 6 | mkdir -p "data/model/f1" 7 | mkdir -p "data/model/f2" 8 | mkdir -p "data/model/f3" 9 | 10 | # download dependencies and the model 11 | if [ -f "data/hg38.fa" ]; then 12 | echo "hg38.fa already exists." 13 | else 14 | wget -O - "ftp://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz" | gunzip -c > "data/hg38.fa" 15 | fi 16 | 17 | if [ -f "data/gencode41_basic_exons.bed" ]; then 18 | echo "gencode41_basic_exons.bed already exists." 19 | else 20 | wget -O - "https://storage.googleapis.com/seqnn-share/helper/gencode41_basic_exons.bed.gz" | gunzip -c > "data/gencode41_basic_exons.bed" 21 | fi 22 | 23 | if [ -f "data/model/f0/model0_best.h5" ]; then 24 | echo "f0/model0_best.h5 already exists." 25 | else 26 | wget "https://storage.googleapis.com/seqnn-share/borzoi/f0/model0_best.h5" -O "data/model/f0/model0_best.h5" 27 | fi 28 | 29 | if [ -f "data/model/f1/model0_best.h5" ]; then 30 | echo "f1/model0_best.h5 already exists." 31 | else 32 | wget "https://storage.googleapis.com/seqnn-share/borzoi/f1/model0_best.h5" -O "data/model/f1/model0_best.h5" 33 | fi 34 | 35 | if [ -f "data/model/f2/model0_best.h5" ]; then 36 | echo "f2/model0_best.h5 already exists." 37 | else 38 | wget "https://storage.googleapis.com/seqnn-share/borzoi/f2/model0_best.h5" -O "data/model/f2/model0_best.h5" 39 | fi 40 | 41 | if [ -f "data/model/f3/model0_best.h5" ]; then 42 | echo "f3/model0_best.h5 already exists." 43 | else 44 | wget "https://storage.googleapis.com/seqnn-share/borzoi/f3/model0_best.h5" -O "data/model/f3/model0_best.h5" 45 | fi 46 | 47 | if [ -f "data/targets.txt" ]; then 48 | echo "targets.txt already exists." 49 | else 50 | wget "https://storage.googleapis.com/seqnn-share/borzoi/hg38/targets.txt" -O "data/targets.txt" 51 | fi 52 | 53 | if [ -f "data/params.json" ]; then 54 | echo "params.json already exists." 
55 | else 56 | wget "https://storage.googleapis.com/seqnn-share/borzoi/params.json" -O "data/params.json" 57 | fi 58 | -------------------------------------------------------------------------------- /tutorials/latest/analyze_sv/save_STR_vcf.py: -------------------------------------------------------------------------------- 1 | 2 | import re 3 | import os 4 | import argparse 5 | import numpy as np 6 | import pandas as pd 7 | from Bio import SeqIO 8 | 9 | 10 | def strip_tissue(tissues): 11 | tissue_list = [] 12 | for tissue in tissues: 13 | tissue_new = tissue.split("_")[0] 14 | tissue_list.append(tissue_new) 15 | return tissue_list 16 | 17 | def strip_score(tissues): 18 | score_list = [] 19 | for tissue in tissues: 20 | score = tissue.split("_")[1] 21 | score_list.append(float(score)) 22 | if len(score_list) == 1: 23 | return True 24 | else: 25 | # check if signs of all scores are the same 26 | if all(x > 0 for x in score_list) or all(x < 0 for x in score_list): 27 | return True 28 | else: 29 | return False 30 | 31 | # find tissue with the highest score 32 | def max_tissue(tissues): 33 | score_list = [] 34 | for tissue in tissues: 35 | score = tissue.split("_")[1] 36 | score_list.append(float(score)) 37 | max_index = score_list.index(max(score_list)) 38 | tissue_clean = tissues[max_index].split("_")[0] 39 | return tissue_clean 40 | 41 | # find motif occurence numbers with regex 42 | def find_motif(seq_dict, coords, motif): 43 | seq_to_search = seq_dict[coords[0]][coords[1]:coords[2]].upper() 44 | motif_dict = [] 45 | if len(motif)>1: 46 | matches = re.finditer(motif, seq_to_search) 47 | for match in matches: 48 | start = match.start() 49 | end = match.end() 50 | motif_dict.append((coords[1]+start, coords[1]+end)) 51 | else: 52 | if seq_to_search==motif*len(seq_to_search): 53 | for i in range(len(seq_to_search)): 54 | motif_dict.append((coords[1]+i, coords[1]+i+1)) 55 | 56 | return motif_dict 57 | 58 | 59 | def save_to_vcf(df, seq_dict, args): 60 | 61 | reduce_motifs = args.reduce 62 | extend_motifs = args.extend 63 | 64 | if not os.path.exists(args.output_dir): 65 | os.makedirs(args.output_dir) 66 | 67 | names_vcf = [] 68 | arr_repeats = [] 69 | 70 | for index, row in df.iterrows(): 71 | 72 | chrom = row['chrom'] 73 | start = row['str.start']-1 74 | end = row['str.end'] 75 | num_motifs = row['num_motifs'] 76 | first_start = row['motif_coords_0'][0][0] 77 | first_end = row['motif_coords_0'][0][1] 78 | last_end = row['motif_coords_0'][-1][1] 79 | motif_coords = row['motif_coords_0'] 80 | partial_start = row['start_partial'] 81 | partial_end = row['end_partial'] 82 | 83 | ref_allele_full = seq_dict[chrom][start:end].upper() 84 | motif = row['str.motif.forward'].upper() 85 | 86 | range_repeats = [] 87 | if num_motifs-reduce_motifs>1: 88 | range_repeats.extend(np.arange(num_motifs-reduce_motifs, num_motifs)) 89 | else: 90 | range_repeats.extend(np.arange(1, num_motifs)) 91 | range_repeats.extend(np.arange(num_motifs+1, num_motifs+extend_motifs)) 92 | 93 | for repeat in range_repeats: 94 | # if number of repeats is less than num_motifs, it's a deletion 95 | if repeat0.25 and betas concordant between tissues 143 | df = df[df['score']>0.25] 144 | df = df[df['score_concord']==True] 145 | 146 | # dictionary to store hg19 sequences 147 | seq_dict = {} 148 | 149 | with open(args.fasta, mode="r") as handle: 150 | # process each record in .fa file if there's more than one 151 | for record in SeqIO.parse(handle, "fasta"): 152 | identifier = record.id 153 | sequence = record.seq 154 | 
seq_dict[identifier] = str(sequence)
155 | 
156 |     # parse sequences chrom:start-end from hg19
157 |     num_motifs, motif_coords, start_partial, end_partial = [], [], [], []
158 | 
159 |     for index, row in df.iterrows():
160 |         chrom = row['chrom']
161 |         start = row['str.start']-1
162 |         end = row['str.end']
163 |         coords = (chrom, start, end)
164 |         motif = row['str.motif.forward'].upper()
165 |         motif_dict = find_motif(seq_dict, coords, motif)
166 |         if len(motif_dict)>0:
167 |             if motif_dict[0][0]==start:
168 |                 start_partial.append(False)
169 |             else:
170 |                 start_partial.append(True)
171 |             if motif_dict[-1][1]==end:
172 |                 end_partial.append(False)
173 |             else:
174 |                 end_partial.append(True)
175 |         else:
176 |             start_partial.append(False)
177 |             end_partial.append(False)
178 |         num_motifs.append(len(motif_dict))
179 |         motif_coords.append(motif_dict)
180 | 
181 |     df['num_motifs'], df['motif_coords_0'], df['start_partial'], df['end_partial'] = num_motifs, motif_coords, start_partial, end_partial
182 | 
183 |     # filter and retain only rows with >0 motifs
184 |     df = df[df['num_motifs']>0]
185 |     df['tissues'] = [','.join(x) for x in df['tissue_list']]
186 | 
187 |     # save to vcf
188 |     save_to_vcf(df, seq_dict, args)
189 | 
190 | 
191 | if __name__ == "__main__":
192 |     main()
-------------------------------------------------------------------------------- /tutorials/latest/analyze_sv/score_STRs.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | #python save_STR_vcf.py --input data/STR.csv \
4 | #    --fasta data/hg19.fa \
5 | #    --output_dir data/vcfs_STR
6 | 
7 | python score_tandem_repeats.py --table data/STR.csv \
8 |     --input data/vcfs_STR \
9 |     --fasta data/hg19.fa \
10 |     --model data/model \
11 |     --params data/params.json \
12 |     --targets data/targets.txt \
13 |     --gencode data/gencode41_lift37_exons.bed \
14 |     --output_dir out_STR
15 | 
-------------------------------------------------------------------------------- /tutorials/latest/interpret_sequence/README.md: --------------------------------------------------------------------------------
1 | ## Interpretation
2 | 
3 | This tutorial describes how to compute gradient saliency scores (sequence attributions) with respect to various statistics computed for a list of input genes specified in a .gtf file. This example relies on the Mini Borzoi model trained on sample K562 RNA-seq data from the [train_model tutorial](https://github.com/calico/borzoi/tree/main/tutorials/latest/train_model), which is clearly a much weaker model than the pre-trained, published Borzoi model.
4 | 
5 | To compute input gradients with respect to the log-sum of coverage across the exons of the example gene HBE1, run the script 'run_gradients_expr_HBE1.sh' (a minimal sketch of the statistic being differentiated follows the notes below).
6 | ```sh
7 | conda activate borzoi_py310
8 | cd ~/borzoi/tutorials/latest/interpret_sequence
9 | ./run_gradients_expr_HBE1.sh
10 | ```
11 | 
12 | *Notes*:
13 | - The track scale, squashing exponentiation, and clip-soft threshold are specified in the .py script arguments (flags: '--track_scale', '--track_transform', '--clip_soft'), and the values in the targets file are ignored. This means that the same data transformation parameters are applied to all tracks specified in the targets file. To calculate gradients for groups of tracks with different data transforms, separate these tracks into different targets files and execute the gradient script on each group separately.
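
As a concrete illustration of the statistic being differentiated, below is a minimal, self-contained TensorFlow sketch of "gradient of the log-sum of exon coverage with respect to the one-hot input". This is not the 'borzoi_satg_gene.py' implementation: the toy model, sequence length, and exon bin range are placeholders chosen only to make the sketch runnable.
```python
import tensorflow as tf

seq_len, n_bins, n_tracks = 8192, 256, 2  # toy sizes; Borzoi uses 393216 bp inputs

# stand-in for a trained Borzoi model: one-hot DNA in, binned coverage out
toy_model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(seq_len, 4)),
    tf.keras.layers.Conv1D(8, 11, padding="same", activation="gelu"),
    tf.keras.layers.MaxPooling1D(pool_size=seq_len // n_bins),
    tf.keras.layers.Dense(n_tracks, activation="softplus"),
])

# random one-hot sequence and a hypothetical set of bins overlapping the gene's exons
x = tf.one_hot(tf.random.uniform([1, seq_len], 0, 4, dtype=tf.int32), depth=4)
exon_bins = tf.range(96, 160)

with tf.GradientTape() as tape:
    tape.watch(x)                                       # x is a constant, so watch it explicitly
    pred = toy_model(x)                                 # (1, n_bins, n_tracks)
    exon_cov = tf.gather(pred, exon_bins, axis=1)       # coverage restricted to exon bins
    stat = tf.math.log(tf.reduce_sum(exon_cov) + 1e-6)  # log-sum of exon coverage

saliency = tape.gradient(stat, x)                       # (1, seq_len, 4) per-base scores
```
The actual script additionally handles strand averaging (e.g. the '--rc' flag) and writes the resulting scores to an .h5 file, which the accompanying notebook then loads.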
14 | -------------------------------------------------------------------------------- /tutorials/latest/interpret_sequence/explore_grads_k562_HBE1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "7030e9ad", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import sys\n", 11 | "import os\n", 12 | "import numpy as np\n", 13 | "import pandas as pd\n", 14 | "\n", 15 | "import h5py\n", 16 | "\n", 17 | "import matplotlib.pyplot as plt\n", 18 | "from scipy.ndimage import gaussian_filter1d\n", 19 | "\n", 20 | "from vis_helpers import *\n" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "id": "3bcaea3d", 27 | "metadata": {}, 28 | "outputs": [ 29 | { 30 | "name": "stdout", 31 | "output_type": "stream", 32 | "text": [ 33 | "scores_hyp.shape = (1, 1, 393216, 4)\n", 34 | "scores.shape = (1, 1, 393216, 4)\n" 35 | ] 36 | }, 37 | { 38 | "data": { 39 | "text/plain": [ 40 | "0" 41 | ] 42 | }, 43 | "execution_count": 2, 44 | "metadata": {}, 45 | "output_type": "execute_result" 46 | } 47 | ], 48 | "source": [ 49 | "#Load scores for the selected set of targets (grad)\n", 50 | "\n", 51 | "import gc\n", 52 | "\n", 53 | "seqs = None\n", 54 | "strands = None\n", 55 | "chrs = None\n", 56 | "starts = None\n", 57 | "ends = None\n", 58 | "genes = None\n", 59 | "\n", 60 | "all_scores_hyp = []\n", 61 | "all_scores = []\n", 62 | "\n", 63 | "gtex_tissues = ['liver']\n", 64 | "\n", 65 | "#Load score file\n", 66 | "score_file = h5py.File('k562_HBE1/scores_f0c0.h5', 'r')\n", 67 | "\n", 68 | "#Get scores and onehots\n", 69 | "scores = score_file['grads'][()][..., 0]\n", 70 | "seqs = score_file['seqs'][()]\n", 71 | "\n", 72 | "#Get auxiliary information\n", 73 | "strands = score_file['strand'][()]\n", 74 | "strands = np.array([strands[j].decode() for j in range(strands.shape[0])])\n", 75 | "\n", 76 | "chrs = score_file['chr'][()]\n", 77 | "chrs = np.array([chrs[j].decode() for j in range(chrs.shape[0])])\n", 78 | "\n", 79 | "starts = np.array(score_file['start'][()])\n", 80 | "ends = np.array(score_file['end'][()])\n", 81 | "\n", 82 | "genes = score_file['gene'][()]\n", 83 | "genes = np.array([genes[j].decode().split(\".\")[0] for j in range(genes.shape[0])])\n", 84 | "\n", 85 | "#Append hypothetical scores\n", 86 | "all_scores_hyp.append(scores[None, ...])\n", 87 | "\n", 88 | "#Append input-gated scores\n", 89 | "all_scores.append((scores * seqs)[None, ...])\n", 90 | "\n", 91 | "#Collect garbage\n", 92 | "gc.collect()\n", 93 | "\n", 94 | "#Collect final scores\n", 95 | "scores_hyp = np.concatenate(all_scores_hyp, axis=0)\n", 96 | "scores = np.concatenate(all_scores, axis=0)\n", 97 | "\n", 98 | "print(\"scores_hyp.shape = \" + str(scores_hyp.shape))\n", 99 | "print(\"scores.shape = \" + str(scores.shape))\n", 100 | "\n", 101 | "score_file = None\n", 102 | "\n", 103 | "#Collect garbage\n", 104 | "gc.collect()\n" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "id": "955bf762", 111 | "metadata": { 112 | "scrolled": false 113 | }, 114 | "outputs": [], 115 | "source": [ 116 | "#Enumerate and visualize attributions; k562 example HBE1\n", 117 | "\n", 118 | "save_index = []\n", 119 | "\n", 120 | "#Visualization parameters\n", 121 | "logo_width = 192\n", 122 | "\n", 123 | "top_n = 1\n", 124 | "\n", 125 | "use_gaussian = True\n", 126 | "min_padding = 65536\n", 127 | "gaussian_sigma = 8\n", 128 | "local_window = 1024\n", 129 | 
"\n", 130 | "main_tissue_ix = 0\n", 131 | "\n", 132 | "tissue_colors = ['darkblue']\n", 133 | "\n", 134 | "#Loop over examples\n", 135 | "for example_ix in range(top_n) :\n", 136 | " \n", 137 | " print(\"-- Example = \" + str(example_ix)+ \" --\")\n", 138 | " \n", 139 | " print(\" - \" + genes[example_ix] + \"(\" + str(strands[example_ix]) + \")\")\n", 140 | " print(\" - \" + chrs[example_ix] + \":\" + str(starts[example_ix]) + \"-\" + str(ends[example_ix]))\n", 141 | "\n", 142 | " #Grad analysis\n", 143 | " \n", 144 | " #Calculate min and max scores globally (for scales)\n", 145 | " min_val = np.min(scores[:, example_ix, ...])\n", 146 | " max_val = np.max(scores[:, example_ix, ...])\n", 147 | " \n", 148 | " print(\" -- min_val = \" + str(round(min_val, 4)))\n", 149 | " print(\" -- max_val = \" + str(round(max_val, 4)))\n", 150 | " \n", 151 | " max_abs_val = max(np.abs(min_val), np.abs(max_val))\n", 152 | "\n", 153 | " min_val -= 0.1 * max_abs_val\n", 154 | " max_val += 0.1 * max_abs_val\n", 155 | "\n", 156 | " print(\" - (Gradient score profiles per tissue) - \")\n", 157 | " \n", 158 | " #Gradient profiles across input sequence\n", 159 | " f, ax = plt.subplots(len(gtex_tissues), 1, figsize=(8, len(gtex_tissues) * 1.5))\n", 160 | " \n", 161 | " if len(gtex_tissues) == 1 :\n", 162 | " ax = [ax]\n", 163 | "\n", 164 | " #Loop over tissues\n", 165 | " for tissue_ix in range(len(gtex_tissues)) :\n", 166 | "\n", 167 | " #Get tissue scores\n", 168 | " score = scores[tissue_ix, example_ix, ...]\n", 169 | "\n", 170 | " l1 = ax[tissue_ix].plot(np.arange(seqs.shape[1]), np.sum(score, axis=-1), linewidth=1, linestyle='-', color=tissue_colors[tissue_ix], label=gtex_tissues[tissue_ix])\n", 171 | " \n", 172 | " plt.sca(ax[tissue_ix])\n", 173 | " \n", 174 | " plt.xlim(0, seqs.shape[1])\n", 175 | " plt.ylim(min_val, max_val)\n", 176 | " \n", 177 | " plt.legend(handles=[l1[0]], fontsize=8)\n", 178 | " \n", 179 | " plt.yticks([], [])\n", 180 | " plt.xticks([], [])\n", 181 | " \n", 182 | " plt.sca(ax[0])\n", 183 | " plt.title(\"Gradient Saliency for gene = '\" + genes[example_ix] + \"' (\" + str(strands[example_ix]) + \")\", fontsize=8)\n", 184 | " \n", 185 | " plt.sca(ax[len(gtex_tissues)-1])\n", 186 | " plt.xlabel(chrs[example_ix] + \":\" + str(starts[example_ix]) + \"-\" + str(ends[example_ix]), fontsize=8)\n", 187 | " \n", 188 | " plt.sca(plt.gca())\n", 189 | " plt.tight_layout()\n", 190 | " \n", 191 | " plt.show()\n", 192 | "\n", 193 | " #Apply gaussian filter\n", 194 | " smooth_score = np.sum(scores[main_tissue_ix, example_ix, ...], axis=-1)\n", 195 | " if use_gaussian :\n", 196 | " smooth_score = gaussian_filter1d(smooth_score.astype('float32'), sigma=gaussian_sigma, truncate=2).astype('float16')\n", 197 | " \n", 198 | " #Calculate min/max positions and (differential) values\n", 199 | " #max_pos = np.argmax(smooth_score[min_padding:-min_padding]) + min_padding\n", 200 | " \n", 201 | " max_pos = np.argmax(smooth_score[min_padding:-min_padding]) + min_padding\n", 202 | "\n", 203 | " print(\" - (Attribution at position of Max positive differential saliency) -\")\n", 204 | "\n", 205 | " print(\" - max_pos (rel) = \" + str(max_pos))\n", 206 | " print(\" - max_pos (abs) = \" + str(starts[example_ix] + max_pos))\n", 207 | " \n", 208 | " #Visualize contribution scores\n", 209 | " plot_start = max_pos - logo_width // 2\n", 210 | " plot_end = max_pos + logo_width // 2\n", 211 | " \n", 212 | " print(\" - \" + chrs[example_ix] + \":\" + str(starts[example_ix] + max_pos - logo_width // 2) + \"-\" + 
str(starts[example_ix] + max_pos + logo_width // 2))\n", 213 | "\n", 214 | " #Logo min/max value across tissues\n", 215 | " min_logo_val = np.min(scores[:, example_ix, plot_start:plot_end, :])\n", 216 | " max_logo_val = np.max(scores[:, example_ix, plot_start:plot_end, :])\n", 217 | "\n", 218 | " max_abs_logo_val = max(np.abs(min_logo_val), np.abs(max_logo_val))\n", 219 | "\n", 220 | " min_logo_val -= 0.02 * max_abs_logo_val\n", 221 | " max_logo_val += 0.02 * max_abs_logo_val\n", 222 | "\n", 223 | " print(\" - y_min = \" + str(round(min_logo_val, 8)))\n", 224 | " print(\" - y_max = \" + str(round(max_logo_val, 8)))\n", 225 | "\n", 226 | " #Loop over tissues\n", 227 | " for tissue_ix in range(len(gtex_tissues)) :\n", 228 | " print(gtex_tissues[tissue_ix])\n", 229 | "\n", 230 | " #Get tissue-specific scores\n", 231 | " score = scores[tissue_ix, example_ix, plot_start:plot_end, :]\n", 232 | "\n", 233 | " #Plot scores as sequence logo\n", 234 | " plot_seq_scores(\n", 235 | " score,\n", 236 | " y_min=min_logo_val,\n", 237 | " y_max=max_logo_val,\n", 238 | " figsize=(8, 1),\n", 239 | " plot_y_ticks=False,\n", 240 | " )\n", 241 | " \n", 242 | " print(\"--------------------\")\n", 243 | " print(\"\")\n" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "id": "67a3cf9d", 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [] 253 | } 254 | ], 255 | "metadata": { 256 | "kernelspec": { 257 | "display_name": "Python 3 (ipykernel)", 258 | "language": "python", 259 | "name": "python3" 260 | }, 261 | "language_info": { 262 | "codemirror_mode": { 263 | "name": "ipython", 264 | "version": 3 265 | }, 266 | "file_extension": ".py", 267 | "mimetype": "text/x-python", 268 | "name": "python", 269 | "nbconvert_exporter": "python", 270 | "pygments_lexer": "ipython3", 271 | "version": "3.8.15" 272 | } 273 | }, 274 | "nbformat": 4, 275 | "nbformat_minor": 5 276 | } 277 | -------------------------------------------------------------------------------- /tutorials/latest/interpret_sequence/run_gradients_expr_HBE1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | borzoi_satg_gene.py -o k562_HBE1 -f 0 -c 0 --rc --track_scale 0.3 --track_transform 0.5 --clip_soft 384.0 -t ../make_data/targets_human.txt ../train_model/params_mini.json ../train_model/mini_models HBE1_example.gtf 4 | -------------------------------------------------------------------------------- /tutorials/latest/interpret_sequence/vis_helpers.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import numpy as np 4 | 5 | import matplotlib.pyplot as plt 6 | 7 | import matplotlib.cm as cm 8 | import matplotlib.colors as colors 9 | 10 | import matplotlib as mpl 11 | from matplotlib.text import TextPath 12 | from matplotlib.patches import PathPatch, Rectangle 13 | from matplotlib.font_manager import FontProperties 14 | from matplotlib import gridspec 15 | from matplotlib.ticker import FormatStrFormatter 16 | 17 | #Helper function to draw a letter at a given position 18 | def dna_letter_at(letter, x, y, yscale=1, ax=None, color=None, alpha=1.0): 19 | 20 | fp = FontProperties(family="DejaVu Sans", weight="bold") 21 | globscale = 1.35 22 | LETTERS = { "T" : TextPath((-0.305, 0), "T", size=1, prop=fp), 23 | "G" : TextPath((-0.384, 0), "G", size=1, prop=fp), 24 | "A" : TextPath((-0.35, 0), "A", size=1, prop=fp), 25 | "C" : TextPath((-0.366, 0), "C", size=1, prop=fp), 26 | "UP" : TextPath((-0.488, 0), 
'$\\Uparrow$', size=1, prop=fp), 27 | "DN" : TextPath((-0.488, 0), '$\\Downarrow$', size=1, prop=fp), 28 | "(" : TextPath((-0.25, 0), "(", size=1, prop=fp), 29 | "." : TextPath((-0.125, 0), "-", size=1, prop=fp), 30 | ")" : TextPath((-0.1, 0), ")", size=1, prop=fp)} 31 | COLOR_SCHEME = {'G': 'orange',#'orange', 32 | 'A': 'green',#'red', 33 | 'C': 'blue',#'blue', 34 | 'T': 'red',#'darkgreen', 35 | 'UP': 'green', 36 | 'DN': 'red', 37 | '(': 'black', 38 | '.': 'black', 39 | ')': 'black'} 40 | 41 | 42 | text = LETTERS[letter] 43 | 44 | chosen_color = COLOR_SCHEME[letter] 45 | if color is not None : 46 | chosen_color = color 47 | 48 | t = mpl.transforms.Affine2D().scale(1*globscale, yscale*globscale) + \ 49 | mpl.transforms.Affine2D().translate(x,y) + ax.transData 50 | p = PathPatch(text, lw=0, fc=chosen_color, alpha=alpha, transform=t) 51 | if ax != None: 52 | ax.add_artist(p) 53 | return p 54 | 55 | #Function to plot sequence logo 56 | def plot_seq_scores(importance_scores, figsize=(16, 2), plot_y_ticks=True, y_min=None, y_max=None, save_figs=False, fig_name="default") : 57 | 58 | importance_scores = importance_scores.T 59 | 60 | fig = plt.figure(figsize=figsize) 61 | 62 | ref_seq = "" 63 | for j in range(importance_scores.shape[1]) : 64 | argmax_nt = np.argmax(np.abs(importance_scores[:, j])) 65 | 66 | if argmax_nt == 0 : 67 | ref_seq += "A" 68 | elif argmax_nt == 1 : 69 | ref_seq += "C" 70 | elif argmax_nt == 2 : 71 | ref_seq += "G" 72 | elif argmax_nt == 3 : 73 | ref_seq += "T" 74 | 75 | ax = plt.gca() 76 | 77 | for i in range(0, len(ref_seq)) : 78 | mutability_score = np.sum(importance_scores[:, i]) 79 | color = None 80 | dna_letter_at(ref_seq[i], i + 0.5, 0, mutability_score, ax, color=color) 81 | 82 | plt.sca(ax) 83 | plt.xticks([], []) 84 | plt.gca().yaxis.set_major_formatter(FormatStrFormatter('%.3f')) 85 | 86 | plt.xlim((0, len(ref_seq))) 87 | 88 | #plt.axis('off') 89 | 90 | if plot_y_ticks : 91 | plt.yticks(fontsize=12) 92 | else : 93 | plt.yticks([], []) 94 | 95 | if y_min is not None and y_max is not None : 96 | plt.ylim(y_min, y_max) 97 | elif y_min is not None : 98 | plt.ylim(y_min) 99 | else : 100 | plt.ylim( 101 | np.min(importance_scores) - 0.1 * np.max(np.abs(importance_scores)), 102 | np.max(importance_scores) + 0.1 * np.max(np.abs(importance_scores)) 103 | ) 104 | 105 | plt.axhline(y=0., color='black', linestyle='-', linewidth=1) 106 | 107 | #for axis in fig.axes : 108 | # axis.get_xaxis().set_visible(False) 109 | # axis.get_yaxis().set_visible(False) 110 | 111 | plt.tight_layout() 112 | 113 | if save_figs : 114 | plt.savefig(fig_name + ".png", transparent=True, dpi=300) 115 | plt.savefig(fig_name + ".eps") 116 | 117 | plt.show() 118 | 119 | #Function to visualize a pair of sequence logos 120 | def visualize_input_gradient_pair(att_grad_wt, att_grad_mut, plot_start=0, plot_end=100, save_figs=False, fig_name='') : 121 | 122 | scores_wt = att_grad_wt[plot_start:plot_end, :] 123 | scores_mut = att_grad_mut[plot_start:plot_end, :] 124 | 125 | y_min = min(np.min(scores_wt), np.min(scores_mut)) 126 | y_max = max(np.max(scores_wt), np.max(scores_mut)) 127 | 128 | y_max_abs = max(np.abs(y_min), np.abs(y_max)) 129 | 130 | y_min = y_min - 0.05 * y_max_abs 131 | y_max = y_max + 0.05 * y_max_abs 132 | 133 | if np.sum(scores_mut) != 0. 
:
134 |         print("--- WT ---")
135 | 
136 |         plot_seq_scores(
137 |             scores_wt, y_min=y_min, y_max=y_max,
138 |             figsize=(8, 1),
139 |             plot_y_ticks=False,
140 |             save_figs=save_figs,
141 |             fig_name=fig_name + '_wt',
142 |         )
143 | 
144 |     if np.sum(scores_mut) != 0. :
145 | 
146 |         print("--- Mut ---")
147 |         plot_seq_scores(
148 |             scores_mut, y_min=y_min, y_max=y_max,
149 |             figsize=(8, 1),
150 |             plot_y_ticks=False,
151 |             save_figs=save_figs,
152 |             fig_name=fig_name + '_mut',
153 |         )
154 | 
-------------------------------------------------------------------------------- /tutorials/latest/make_data/Makefile: --------------------------------------------------------------------------------
1 | FASTA_HUMAN=$$BORZOI_HG38/assembly/gnomad/hg38.ml.fa
2 | GAPS_HUMAN=$$BORZOI_HG38/assembly/ucsc/hg38_gaps.bed
3 | UMAP_HUMAN=$$BORZOI_HG38/mappability/umap_k36_t10_l32.bed
4 | BLACK_HUMAN=$$BORZOI_HG38/blacklist/blacklist_hg38_all.bed
5 | 
6 | FASTA_MOUSE=$$BORZOI_MM10/assembly/ucsc/mm10.ml.fa
7 | GAPS_MOUSE=$$BORZOI_MM10/assembly/ucsc/mm10_gaps.bed
8 | UMAP_MOUSE=$$BORZOI_MM10/mappability/umap_k36_t10_l32.bed
9 | BLACK_MOUSE=$$BORZOI_MM10/blacklist/blacklist_mm10_all.bed
10 | 
11 | ALIGN=$$BORZOI_HG38/align/hg38.mm10.syn.net.gz
12 | 
13 | OUT=data
14 | 
15 | # mini borzoi configuration
16 | LENGTH=393216
17 | TSTRIDE=131087 # 393216/3 + 15
18 | CROP=0
19 | WIDTH=32
20 | FOLDS=8
21 | 
22 | AOPTS=--break 2097152 -c $(CROP) --nf 524288 --no 393216 -l $(LENGTH) --stride $(TSTRIDE) -f $(FOLDS) --umap_t 0.5 -w $(WIDTH)
23 | DOPTS=-c $(CROP) -d 2 -f $(FOLDS) -l $(LENGTH) -p 64 -r 16 --umap_clip 0.5 -w $(WIDTH)
24 | 
25 | all: $(OUT)/hg38/tfrecords/train-0.tfr # $(OUT)/mm10/tfrecords/train-0.tfr
26 | 
27 | umap_human.bed:
28 | 	cat $(UMAP_HUMAN) $(BLACK_HUMAN) | awk 'BEGIN {OFS="\t"} {print $$1, $$2, $$3}' | bedtools sort -i - | bedtools merge -i - > umap_human.bed
29 | 
30 | umap_mouse.bed:
31 | 	cat $(UMAP_MOUSE) $(BLACK_MOUSE) | awk 'BEGIN {OFS="\t"} {print $$1, $$2, $$3}' | bedtools sort -i - | bedtools merge -i - > umap_mouse.bed
32 | 
33 | # targets file is already generated in this example
34 | #targets_human.txt targets_mouse.txt:
35 | #	./make_targets.py
36 | 
37 | $(OUT)/hg38/sequences.bed $(OUT)/mm10/sequences.bed: umap_human.bed umap_mouse.bed
38 | 	hound_data_align.py -a hg38,mm10 -g $(GAPS_HUMAN),$(GAPS_MOUSE) -u umap_human.bed,umap_mouse.bed $(AOPTS) -o $(OUT) $(ALIGN) $(FASTA_HUMAN),$(FASTA_MOUSE)
39 | 
40 | $(OUT)/hg38/tfrecords/train-0.tfr: $(OUT)/hg38/sequences.bed targets_human.txt
41 | 	hound_data.py --restart $(DOPTS) -b $(BLACK_HUMAN) -o $(OUT)/hg38 $(FASTA_HUMAN) -u umap_human.bed targets_human.txt
42 | 
43 | # no mouse data in this example
44 | #$(OUT)/mm10/tfrecords/train-0.tfr: $(OUT)/mm10/sequences.bed targets_mouse.txt
45 | #	hound_data.py --restart $(DOPTS) -b $(BLACK_MOUSE) -o $(OUT)/mm10 $(FASTA_MOUSE) -u umap_mouse.bed targets_mouse.txt
46 | 
-------------------------------------------------------------------------------- /tutorials/latest/make_data/README.md: --------------------------------------------------------------------------------
1 | ## Data Processing
2 | 
3 | This tutorial describes how to process a .bigwig sequencing experiment into compressed .w5 format, merge replicates, generate QC metrics, and finally create TFRecord files containing binned coverage values suitable for training Borzoi models. We will exemplify this for the ENCODE K562 RNA-seq experiment [ENCSR000AEL](https://www.encodeproject.org/experiments/ENCSR000AEL/).
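
For orientation before diving in: a .w5 file is simply an HDF5 container of per-base coverage, so once the steps below have produced one it can be inspected with a few lines of Python. The sketch below assumes the common layout of one 1-D dataset per chromosome; see 'bw_h5.py' for the authoritative schema.
```python
import h5py
import numpy as np

# inspect the merged .w5 coverage file generated later in this tutorial
# (assumed layout: one per-chromosome coverage dataset; check bw_h5.py)
with h5py.File("human/rna/encode/ENCSR000AEL/summary/coverage+.w5", "r") as w5:
    print(list(w5.keys())[:5])                    # e.g. ['chr1', 'chr10', ...]
    cov = w5["chr1"][1_000_000:1_001_000]         # bp-resolution coverage slice
    print(float(np.mean(cov.astype("float32"))))  # mean coverage in the window
```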
4 | 5 | First, activate the conda environment and run the script 'download_dependencies.sh' to download required auxiliary files. 6 | ```sh 7 | conda activate borzoi_py310 8 | cd ~/borzoi/tutorials/latest/make_data 9 | ./download_dependencies.sh 10 | ``` 11 | 12 | Next, run the script 'download_bw.sh' to download sample ENCODE .bigwig files and arrange them in a folder structure. 13 | ```sh 14 | ./download_bw.sh 15 | ``` 16 | 17 | Then run script 'process_w5.sh' to generate compressed .w5 files (hdf5) from the input .bigwig files, merge the two replicates, and calculate basic QC metrics. This .sh script internally calls 'bw_h5.py' to generate .w5 files, 'w5_merge.py' to merge replicates, and 'w5_qc.py' to calculate QC metrics. 18 | ```sh 19 | ./process_w5.sh 20 | ``` 21 | 22 | Finally, run the Makefile to create genome-wide binned coverage tracks, stored as compressed TFRecords. 23 | ```sh 24 | make 25 | ``` 26 | 27 | In this example, the Makefile creates 8 cross-validation folds of TFRecords with input sequences of length 393216 bp, generated with a genome-wide stride of 131087 bp (which is ~1/3 of the sequence length, but shifts the bin boundaries, too). The output coverage tracks corresponding to each input sequence are not cropped in the latest version of Borzoi models. This results in 12288 coverage bins per 393kb sequence. The specific .w5 tracks to include in the TFRecord generation, and the scales and pooling transforms applied to the bins of each experiment, are given in the targets file 'targets_human.txt'. Below is a description of the columns in this file. 28 | 29 | *targets_human.txt*: 30 | - (unnamed) => integer index of each track (must start from 0 when training a new model). 31 | - 'identifier' => unique identifier of each experiment (and strand). 32 | - 'file' => local file path to .w5 file. 33 | - 'clip' => hard clipping threshold to be applied to each bin, after soft-clipping. 34 | - 'clip_soft' => soft clipping (squashing) threshold. 35 | - 'scale' => scale value applied to each bp-level position before clipping. 36 | - 'sum_stat' => type of bin-level pooling operation ('sum_sqrt' = sum and square-root). 37 | - 'strand_pair' => integer index of the other stranded track of an experiment (same index as current row if unstranded). 38 | - 'description' => text description of experiment. 39 | 40 | *Notes*: 41 | - See [here](https://github.com/calico/borzoi-paper/tree/main/data/training) for a description of the scripts called by the Makefile to create TFRecords. 42 | - In the latest version of Borzoi models, a modified hg38 fasta genome is used in the Makefile where the allele with highest overall frequency (from gnomAD) is substituted at each position. 
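
To make the column semantics above concrete, here is a small NumPy sketch of how a single track's bin values could be derived from bp-level coverage with these parameters. The square-root form of the soft clip is an assumption made for illustration; the authoritative transforms live in the TFRecord-generation scripts invoked by the Makefile.
```python
import numpy as np

def bin_coverage(bp_cov, scale=0.3, width=32, clip_soft=384.0, clip=768.0):
    x = scale * bp_cov.astype("float64")                # 'scale': per-bp scaling
    x = x.reshape(-1, width).sum(axis=1)                # pool bp values into 32-bp bins
    x = np.sqrt(x)                                      # 'sum_sqrt': sum, then square-root
    over = x > clip_soft                                # 'clip_soft': squash large bins
    x[over] = clip_soft + np.sqrt(x[over] - clip_soft)  # (assumed functional form)
    return np.minimum(x, clip)                          # 'clip': hard threshold last

bins = bin_coverage(np.random.poisson(5.0, size=393216).astype(float))
print(bins.shape)  # (12288,) bins for one 393216 bp sequence
```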
43 | 
-------------------------------------------------------------------------------- /tutorials/latest/make_data/download_bw.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # download example data from ENCODE (ENCSR000AEL - K562 RNA-seq); 2 replicates
4 | 
5 | # define ENCODE ID
6 | ENC_ID='ENCSR000AEL'
7 | 
8 | # define remote urls
9 | URL_P_REP1='https://www.encodeproject.org/files/ENCFF980ZHM/@@download/ENCFF980ZHM.bigWig'
10 | URL_M_REP1='https://www.encodeproject.org/files/ENCFF533LJF/@@download/ENCFF533LJF.bigWig'
11 | 
12 | URL_P_REP2='https://www.encodeproject.org/files/ENCFF335LVS/@@download/ENCFF335LVS.bigWig'
13 | URL_M_REP2='https://www.encodeproject.org/files/ENCFF257NOL/@@download/ENCFF257NOL.bigWig'
14 | 
15 | # define ENCODE file IDs
16 | FILE_P_REP1='ENCFF980ZHM'
17 | FILE_M_REP1='ENCFF533LJF'
18 | 
19 | FILE_P_REP2='ENCFF335LVS'
20 | FILE_M_REP2='ENCFF257NOL'
21 | 
22 | # create folder for bigwig files
23 | mkdir -p "human/rna/encode/$ENC_ID/rep1"
24 | mkdir -p "human/rna/encode/$ENC_ID/rep2"
25 | 
26 | 
27 | # download bigwig files; rep1
28 | if [ -f "human/rna/encode/$ENC_ID/rep1/$FILE_P_REP1.bigWig" ]; then
29 |     echo "example RNA-seq data already downloaded (rep 1)."
30 | else
31 |     wget $URL_P_REP1 -O "human/rna/encode/$ENC_ID/rep1/$FILE_P_REP1.bigWig"
32 |     wget $URL_M_REP1 -O "human/rna/encode/$ENC_ID/rep1/$FILE_M_REP1.bigWig"
33 | fi
34 | 
35 | # download bigwig files; rep2
36 | if [ -f "human/rna/encode/$ENC_ID/rep2/$FILE_P_REP2.bigWig" ]; then
37 |     echo "example RNA-seq data already downloaded (rep 2)."
38 | else
39 |     wget $URL_P_REP2 -O "human/rna/encode/$ENC_ID/rep2/$FILE_P_REP2.bigWig"
40 |     wget $URL_M_REP2 -O "human/rna/encode/$ENC_ID/rep2/$FILE_M_REP2.bigWig"
41 | fi
42 | 
-------------------------------------------------------------------------------- /tutorials/latest/make_data/download_dependencies.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # create additional folders in borzoi data folders
4 | mkdir -p "$BORZOI_HG38/assembly/ucsc"
5 | mkdir -p "$BORZOI_HG38/assembly/gnomad"
6 | mkdir -p "$BORZOI_HG38/mappability"
7 | mkdir -p "$BORZOI_HG38/blacklist"
8 | mkdir -p "$BORZOI_HG38/align"
9 | 
10 | mkdir -p "$BORZOI_MM10/assembly/ucsc"
11 | mkdir -p "$BORZOI_MM10/mappability"
12 | mkdir -p "$BORZOI_MM10/blacklist"
13 | 
14 | 
15 | # download and uncompress auxiliary files required for Makefile (hg38)
16 | if [ -f "$BORZOI_HG38/assembly/ucsc/hg38_gaps.bed" ]; then
17 |     echo "hg38_gaps.bed already exists."
18 | else
19 |     wget -O - https://storage.googleapis.com/seqnn-share/helper/dependencies/hg38_gaps.bed.gz | gunzip -c > "$BORZOI_HG38/assembly/ucsc/hg38_gaps.bed"
20 | fi
21 | 
22 | if [ -f "$BORZOI_HG38/mappability/umap_k36_t10_l32.bed" ]; then
23 |     echo "umap_k36_t10_l32.bed (hg38) already exists."
24 | else
25 |     wget -O - https://storage.googleapis.com/seqnn-share/helper/dependencies/umap_k36_t10_l32_hg38.bed.gz | gunzip -c > "$BORZOI_HG38/mappability/umap_k36_t10_l32.bed"
26 | fi
27 | 
28 | if [ -f "$BORZOI_HG38/blacklist/blacklist_hg38_all.bed" ]; then
29 |     echo "blacklist_hg38_all.bed already exists."
30 | else
31 |     wget -O - https://storage.googleapis.com/seqnn-share/helper/dependencies/blacklist_hg38_all.bed.gz | gunzip -c > "$BORZOI_HG38/blacklist/blacklist_hg38_all.bed"
32 | fi
33 | 
34 | if [ -f "$BORZOI_HG38/align/hg38.mm10.syn.net.gz" ]; then
35 |     echo "hg38.mm10.syn.net.gz (hg38/mm10 alignment) already exists."
36 | else 37 | wget https://storage.googleapis.com/seqnn-share/helper/dependencies/hg38.mm10.syn.net.gz -O "$BORZOI_HG38/align/hg38.mm10.syn.net.gz" 38 | fi 39 | 40 | 41 | # download and uncompress auxiliary files required for Makefile (mm10) 42 | if [ -f "$BORZOI_MM10/assembly/ucsc/mm10_gaps.bed" ]; then 43 | echo "mm10_gaps.bed already exists." 44 | else 45 | wget -O - https://storage.googleapis.com/seqnn-share/helper/dependencies/mm10_gaps.bed.gz | gunzip -c > "$BORZOI_MM10/assembly/ucsc/mm10_gaps.bed" 46 | fi 47 | 48 | if [ -f "$BORZOI_MM10/mappability/umap_k36_t10_l32.bed" ]; then 49 | echo "umap_k36_t10_l32.bed (mm10) already exists." 50 | else 51 | wget -O - https://storage.googleapis.com/seqnn-share/helper/dependencies/umap_k36_t10_l32_mm10.bed.gz | gunzip -c > "$BORZOI_MM10/mappability/umap_k36_t10_l32.bed" 52 | fi 53 | 54 | if [ -f "$BORZOI_MM10/blacklist/blacklist_mm10_all.bed" ]; then 55 | echo "blacklist_mm10_all.bed already exists." 56 | else 57 | wget -O - https://storage.googleapis.com/seqnn-share/helper/dependencies/blacklist_mm10_all.bed.gz | gunzip -c > "$BORZOI_MM10/blacklist/blacklist_mm10_all.bed" 58 | fi 59 | 60 | 61 | # download and uncompress pre-compiled umap bed files 62 | if [ -f umap_human.bed ]; then 63 | echo "umap_human.bed already exists." 64 | else 65 | wget -O - https://storage.googleapis.com/seqnn-share/helper/dependencies/umap_human.bed.gz | gunzip -c > umap_human.bed 66 | fi 67 | 68 | if [ -f umap_mouse.bed ]; then 69 | echo "umap_mouse.bed already exists." 70 | else 71 | wget -O - https://storage.googleapis.com/seqnn-share/helper/dependencies/umap_mouse.bed.gz | gunzip -c > umap_mouse.bed 72 | fi 73 | 74 | 75 | # download and index hg38 ml genome 76 | if [ -f "$BORZOI_HG38/assembly/ucsc/hg38.ml.fa" ]; then 77 | echo "hg38.ml.fa already exists." 78 | else 79 | wget -O - https://storage.googleapis.com/seqnn-share/helper/dependencies/hg38.ml.fa.gz | gunzip -c > "$BORZOI_HG38/assembly/ucsc/hg38.ml.fa" 80 | idx_genome.py "$BORZOI_HG38/assembly/ucsc/hg38.ml.fa" 81 | fi 82 | 83 | # download and index hg38 ml genome (gnomad major alleles) 84 | if [ -f "$BORZOI_HG38/assembly/gnomad/hg38.ml.fa" ]; then 85 | echo "hg38.ml.fa (gnomad) already exists." 86 | else 87 | wget -O - https://storage.googleapis.com/seqnn-share/helper/dependencies/hg38_gnomad.ml.fa.gz | gunzip -c > "$BORZOI_HG38/assembly/gnomad/hg38.ml.fa" 88 | idx_genome.py "$BORZOI_HG38/assembly/gnomad/hg38.ml.fa" 89 | fi 90 | 91 | # download and index mm10 ml genome 92 | if [ -f "$BORZOI_MM10/assembly/ucsc/mm10.ml.fa" ]; then 93 | echo "mm10.ml.fa already exists." 
94 | else 95 | wget -O - https://storage.googleapis.com/seqnn-share/helper/dependencies/mm10.ml.fa.gz | gunzip -c > "$BORZOI_MM10/assembly/ucsc/mm10.ml.fa" 96 | idx_genome.py "$BORZOI_MM10/assembly/ucsc/mm10.ml.fa" 97 | fi 98 | -------------------------------------------------------------------------------- /tutorials/latest/make_data/process_w5.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # merge bigwig replicates, generate .w5 files and run qc 4 | 5 | # define ENCODE ID 6 | ENC_ID='ENCSR000AEL' 7 | 8 | # define ENCODE file IDs 9 | FILE_P_REP1='ENCFF980ZHM' 10 | FILE_M_REP1='ENCFF533LJF' 11 | 12 | FILE_P_REP2='ENCFF335LVS' 13 | FILE_M_REP2='ENCFF257NOL' 14 | 15 | # create folder for merged replicate files 16 | mkdir -p "human/rna/encode/$ENC_ID/summary" 17 | 18 | 19 | # step 1: generate per-replicate .w5 files 20 | 21 | # rep1 22 | if [ -f "human/rna/encode/$ENC_ID/rep1/$FILE_P_REP1+.w5" ]; then 23 | echo "example RNA-seq .w5 already exists (rep 1)." 24 | else 25 | bw_h5.py -z "human/rna/encode/$ENC_ID/rep1/$FILE_P_REP1.bigWig" "human/rna/encode/$ENC_ID/rep1/$FILE_P_REP1+.w5" 26 | bw_h5.py -z "human/rna/encode/$ENC_ID/rep1/$FILE_M_REP1.bigWig" "human/rna/encode/$ENC_ID/rep1/$FILE_M_REP1-.w5" 27 | fi 28 | 29 | # rep2 30 | if [ -f "human/rna/encode/$ENC_ID/rep2/$FILE_P_REP2+.w5" ]; then 31 | echo "example RNA-seq .w5 already exists (rep 2)." 32 | else 33 | bw_h5.py -z "human/rna/encode/$ENC_ID/rep2/$FILE_P_REP2.bigWig" "human/rna/encode/$ENC_ID/rep2/$FILE_P_REP2+.w5" 34 | bw_h5.py -z "human/rna/encode/$ENC_ID/rep2/$FILE_M_REP2.bigWig" "human/rna/encode/$ENC_ID/rep2/$FILE_M_REP2-.w5" 35 | fi 36 | 37 | 38 | # step 2: merge replicates 39 | 40 | if [ -f "human/rna/encode/$ENC_ID/summary/coverage+.w5" ]; then 41 | echo "example RNA-seq .w5 already exists (merged)." 42 | else 43 | w5_merge.py -w -s mean -z "human/rna/encode/$ENC_ID/summary/coverage+.w5" "human/rna/encode/$ENC_ID/rep1/$FILE_P_REP1+.w5" "human/rna/encode/$ENC_ID/rep2/$FILE_P_REP2+.w5" 44 | w5_merge.py -w -s mean -z "human/rna/encode/$ENC_ID/summary/coverage-.w5" "human/rna/encode/$ENC_ID/rep1/$FILE_M_REP1-.w5" "human/rna/encode/$ENC_ID/rep2/$FILE_M_REP2-.w5" 45 | fi 46 | 47 | 48 | # step 3: run qc on each replicate and the merged file 49 | 50 | if [ -f "human/rna/encode/$ENC_ID/summary/covqc/means.txt" ]; then 51 | echo "qc statistics already exist." 
52 | else
53 |     # rep1
54 |     w5_qc.py -b "$BORZOI_HG38/blacklist/blacklist_hg38_all.bed" -o "human/rna/encode/$ENC_ID/rep1/covqc" "human/rna/encode/$ENC_ID/rep1/$FILE_P_REP1+.w5"
55 |     w5_qc.py -b "$BORZOI_HG38/blacklist/blacklist_hg38_all.bed" -o "human/rna/encode/$ENC_ID/rep1/covqc_m" "human/rna/encode/$ENC_ID/rep1/$FILE_M_REP1-.w5"
56 | 
57 |     # rep2
58 |     w5_qc.py -b "$BORZOI_HG38/blacklist/blacklist_hg38_all.bed" -o "human/rna/encode/$ENC_ID/rep2/covqc" "human/rna/encode/$ENC_ID/rep2/$FILE_P_REP2+.w5"
59 |     w5_qc.py -b "$BORZOI_HG38/blacklist/blacklist_hg38_all.bed" -o "human/rna/encode/$ENC_ID/rep2/covqc_m" "human/rna/encode/$ENC_ID/rep2/$FILE_M_REP2-.w5"
60 | 
61 |     # summary
62 |     w5_qc.py -b "$BORZOI_HG38/blacklist/blacklist_hg38_all.bed" -o "human/rna/encode/$ENC_ID/summary/covqc" "human/rna/encode/$ENC_ID/summary/coverage+.w5"
63 |     w5_qc.py -b "$BORZOI_HG38/blacklist/blacklist_hg38_all.bed" -o "human/rna/encode/$ENC_ID/summary/covqc_m" "human/rna/encode/$ENC_ID/summary/coverage-.w5"
64 | fi
65 | 
66 | 
-------------------------------------------------------------------------------- /tutorials/latest/make_data/targets_human.txt: --------------------------------------------------------------------------------
1 | 	identifier	file	clip	clip_soft	scale	sum_stat	strand_pair	description
2 | 0	ENCFF980ZHM+	human/rna/encode/ENCSR000AEL/summary/coverage+.w5	768	384	0.3	sum_sqrt	1	RNA:K562
3 | 1	ENCFF980ZHM-	human/rna/encode/ENCSR000AEL/summary/coverage-.w5	768	384	0.3	sum_sqrt	0	RNA:K562
4 | 
-------------------------------------------------------------------------------- /tutorials/latest/score_variants/README.md: --------------------------------------------------------------------------------
1 | ## Variant Scoring
2 | 
3 | This tutorial describes how to predict variant effect scores for a small set of SNVs defined in a .vcf file. This example relies on the Mini Borzoi model trained on sample K562 RNA-seq data from the [train_model tutorial](https://github.com/calico/borzoi/tree/main/tutorials/latest/train_model), which is clearly a much weaker model than the pre-trained, published Borzoi model. For examples showcasing variant effect prediction at a larger scale with the pre-trained model (e.g. fine-mapped eQTL classification benchmarks), we refer the user to the [borzoi-paper repository](https://github.com/calico/borzoi-paper/tree/main). Additionally, we refer the user to the **legacy** version of [this tutorial](https://github.com/calico/borzoi/tree/main/tutorials/legacy/score_variants), which uses the pre-trained, published model.
4 | 
5 | First, to calculate **gene-specific expression** scores, run the script 'score_expr_sed.sh'. Two different statistics are computed: (1) logSED (gene expression log fold change), and (2) logD2 (bin-level L2 norm across the coverage profile intersecting the exons of the gene).
6 | ```sh
7 | conda activate borzoi_py310
8 | cd ~/borzoi/tutorials/latest/score_variants
9 | ./score_expr_sed.sh
10 | ```
11 | 
12 | To calculate **gene-agnostic expression** scores, run the script 'score_expr_sad.sh'. One statistic is computed: logD2 (bin-level L2 norm across the entire predicted coverage track).
13 | ```sh
14 | ./score_expr_sad.sh
15 | ```
16 | 
17 | To calculate **gene-specific polyadenylation** scores, run the script 'score_polya.sh'. One statistic is computed: COVR (3' coverage ratio across pA junctions of the target gene).
18 | ```sh
19 | ./score_polya.sh
20 | ```
21 | 
22 | To calculate **gene-specific splicing** scores, run the script 'score_splice.sh'.
One statistic is computed: nDi (normalized maximum absolute difference in coverage bins across the target gene span). 23 | ```sh 24 | ./score_splice.sh 25 | ``` 26 | 27 | Finally, the jupyter notebook 'run_variant_scripts.ipynb' is provided for convenience to execute all above scripts. The notebook also exemplifies how to navigate the variant prediction hdf5 files and print some example scores. 28 | -------------------------------------------------------------------------------- /tutorials/latest/score_variants/run_variant_scripts.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "f5d0f9fb", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import os\n", 11 | "import sys\n", 12 | "import h5py\n" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "id": "7a94cbf8", 19 | "metadata": { 20 | "scrolled": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "#Calculate gene-specific variant effect scores\n", 25 | "\n", 26 | "!./score_expr_sed.sh\n" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "id": "1047ff0f", 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "#Print an example variant effect prediction for a SNP-gene pair (gene-specific expression)\n", 37 | "\n", 38 | "sed_h5 = h5py.File('snp_sed/f0c0/sed.h5', 'r')\n", 39 | "\n", 40 | "row_ix = 63\n", 41 | "target_ix = 0\n", 42 | "\n", 43 | "print(\"score: 'logSED', snp: '\" + str(sed_h5['snp'][sed_h5['si'][row_ix]].decode()) + \"', gene: '\" + str(sed_h5['gene'][sed_h5['si'][row_ix]].decode()) + \"', track: '\" + str(sed_h5['target_labels'][target_ix].decode()) + \"' => \" + str(round(sed_h5['logSED'][row_ix, target_ix], 4)))\n" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "id": "f105ecd9", 50 | "metadata": { 51 | "scrolled": true 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "#Calculate gene-agnostic variant effect scores\n", 56 | "\n", 57 | "!./score_expr_sad.sh\n" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "id": "96e4f7cb", 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "#Print an example variant effect prediction for a SNP (gene-agnostic expression)\n", 68 | "\n", 69 | "sad_h5 = h5py.File('snp_sad/f0c0/sad.h5', 'r')\n", 70 | "\n", 71 | "snp_ix = 1\n", 72 | "target_ix = 0\n", 73 | "\n", 74 | "print(\"score: 'logD2', snp: '\" + str(sad_h5['snp'][snp_ix].decode()) + \"', track: '\" + str(sad_h5['target_labels'][target_ix].decode()) + \"' => \" + str(round(sad_h5['logD2'][snp_ix, target_ix], 4)))\n" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "id": "c56efaef", 81 | "metadata": { 82 | "scrolled": true 83 | }, 84 | "outputs": [], 85 | "source": [ 86 | "#Calculate splice variant effect scores\n", 87 | "\n", 88 | "!./score_splice.sh\n" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "id": "980993fc", 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "#Print an example variant effect prediction for a SNP-gene pair (splicing)\n", 99 | "\n", 100 | "sed_h5 = h5py.File('snp_splice/f0c0/sed.h5', 'r')\n", 101 | "\n", 102 | "row_ix = 116\n", 103 | "target_ix = 755\n", 104 | "\n", 105 | "print(\"score: 'nDi', snp: '\" + str(sed_h5['snp'][sed_h5['si'][row_ix]].decode()) + \"', gene: '\" + str(sed_h5['gene'][sed_h5['si'][row_ix]].decode()) + \"', track: '\" + 
str(sed_h5['target_labels'][target_ix].decode()) + \"' => \" + str(round(sed_h5['nDi'][row_ix, target_ix], 4)))\n"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": null,
111 | "id": "05cccfb6",
112 | "metadata": {
113 | "scrolled": true
114 | },
115 | "outputs": [],
116 | "source": [
117 | "#Calculate polyadenylation variant effect scores\n",
118 | "\n",
119 | "!./score_polya.sh\n"
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": null,
125 | "id": "43ac562f",
126 | "metadata": {},
127 | "outputs": [],
128 | "source": [
129 | "#Print an example variant effect prediction for a SNP-gene pair (polyadenylation)\n",
130 | "\n",
131 | "sed_h5 = h5py.File('snp_polya/f0c0/sed.h5', 'r')\n",
132 | "\n",
133 | "row_ix = 47\n",
134 | "target_ix = 100\n",
135 | "\n",
136 | "print(\"score: 'COVR', snp: '\" + str(sed_h5['snp'][sed_h5['si'][row_ix]].decode()) + \"', gene: '\" + str(sed_h5['gene'][sed_h5['si'][row_ix]].decode()) + \"', track: '\" + str(sed_h5['target_labels'][target_ix].decode()) + \"' => \" + str(round(sed_h5['COVR'][row_ix, target_ix], 4)))\n"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": null,
142 | "id": "0ba23572",
143 | "metadata": {},
144 | "outputs": [],
145 | "source": []
146 | }
147 | ],
148 | "metadata": {
149 | "kernelspec": {
150 | "display_name": "Python 3 (ipykernel)",
151 | "language": "python",
152 | "name": "python3"
153 | },
154 | "language_info": {
155 | "codemirror_mode": {
156 | "name": "ipython",
157 | "version": 3
158 | },
159 | "file_extension": ".py",
160 | "mimetype": "text/x-python",
161 | "name": "python",
162 | "nbconvert_exporter": "python",
163 | "pygments_lexer": "ipython3",
164 | "version": "3.8.15"
165 | }
166 | },
167 | "nbformat": 4,
168 | "nbformat_minor": 5
169 | }
170 | 
-------------------------------------------------------------------------------- /tutorials/latest/score_variants/score_expr_sad.sh: --------------------------------------------------------------------------------
1 | #!/bin/sh
2 | 
3 | mkdir -p snp_sad/f0c0
4 | 
5 | borzoi_sad.py -o snp_sad/f0c0 --rc --stats logD2 -t ../make_data/targets_human.txt ../train_model/params_mini.json ../train_model/mini_models/f0c0/train/model_best.h5 snps_expr.vcf
6 | 
-------------------------------------------------------------------------------- /tutorials/latest/score_variants/score_expr_sed.sh: --------------------------------------------------------------------------------
1 | #!/bin/sh
2 | 
3 | mkdir -p snp_sed/f0c0
4 | 
5 | borzoi_sed.py -o snp_sed/f0c0 --rc --stats logSED,logD2 -t ../make_data/targets_human.txt ../train_model/params_mini.json ../train_model/mini_models/f0c0/train/model_best.h5 snps_expr.vcf
6 | 
-------------------------------------------------------------------------------- /tutorials/latest/score_variants/score_polya.sh: --------------------------------------------------------------------------------
1 | #!/bin/sh
2 | 
3 | mkdir -p snp_polya/f0c0
4 | 
5 | borzoi_sed_paqtl_cov.py -o snp_polya/f0c0 --rc --stats COVR -t ../make_data/targets_human.txt ../train_model/params_mini.json ../train_model/mini_models/f0c0/train/model_best.h5 snps_polya.vcf
6 | 
-------------------------------------------------------------------------------- /tutorials/latest/score_variants/score_splice.sh: --------------------------------------------------------------------------------
1 | #!/bin/sh
2 | 
3 | mkdir -p snp_splice/f0c0
4 | 
5 | borzoi_sed.py -o snp_splice/f0c0 --span --no_untransform --rc --stats nDi -t
../make_data/targets_human.txt ../train_model/params_mini.json ../train_model/mini_models/f0c0/train/model_best.h5 snps_splice.vcf 6 | -------------------------------------------------------------------------------- /tutorials/latest/score_variants/snps_expr.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.2 2 | chr1 43110773 chr1_43110773_G_A_b38 G A . . 3 | chr1 43120331 chr1_43120331_C_T_b38 C T . . 4 | chr1 46309111 chr1_46309111_A_G_b38 A G . . 5 | chr1 52632886 chr1_52632886_A_C_b38 A C . . 6 | chr1 54053434 chr1_54053434_G_A_b38 G A . . 7 | -------------------------------------------------------------------------------- /tutorials/latest/score_variants/snps_polya.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.2 2 | ##INFO= 3 | ##INFO= 4 | ##INFO= 5 | #CHROM POS ID REF ALT QUAL FILTER INFO 6 | chr1 11790946 chr1_11790946_G_C G C . . MT=ENSG00000177000.grp_2.downstream.ENST00000641805;PD=924;PI=chr1_11790946_G_C 7 | chr1 150160094 chr1_150160094_C_G C G . . MT=ENSG00000023902.grp_1.downstream.ENST00000369126;PD=29;PI=chr1_150160094_C_G 8 | chr16 57665101 chr16_57665101_A_G A G . . MT=ENSG00000205336.grp_1.downstream.ENST00000568908;PD=73;PI=chr16_57665101_A_G 9 | chr16 80976052 chr16_80976052_T_G T G . . MT=ENSG00000103121.grp_2.downstream.ENST00000565925;PD=24;PI=chr16_80976052_T_G 10 | chr16 88857261 chr16_88857261_T_C T C . . MT=ENSG00000167515.grp_2.downstream.ENST00000564547;PD=3851;PI=chr16_88857261_T_C -------------------------------------------------------------------------------- /tutorials/latest/score_variants/snps_splice.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.2 2 | ##INFO= 3 | ##INFO= 4 | ##INFO= 5 | #CHROM POS ID REF ALT QUAL FILTER INFO 6 | chr1 1665061 chr1_1665061_C_T C T . . MT=ENSG00000189339.grp_2.contained.ENST00000611123;SD=959;PI=chr1_1665061_C_T 7 | chr1 1689221 chr1_1689221_G_A G A . . MT=ENSG00000189339.grp_1.contained.ENST00000614300;SD=1753;PI=chr1_1689221_G_A 8 | chr1 50655526 chr1_50655526_T_C T C . . MT=ENSG00000185104.grp_2.contained.ENST00000396153;SD=3;PI=chr1_50655526_T_C 9 | chr1 109489368 chr1_109489368_C_G C G . . MT=ENSG00000143537.grp_2.contained.ENST00000360674;SD=1;PI=chr1_155060832_G_A 10 | chr1 156236330 chr1_156236330_G_A G A . . MT=ENSG00000160783.grp_1.contained.ENST00000368279;SD=17;PI=chr1_156236330_G_A 11 | -------------------------------------------------------------------------------- /tutorials/latest/train_model/README.md: -------------------------------------------------------------------------------- 1 | ## Model Training 2 | 3 | This tutorial describes how to train smaller Borzoi models on the example RNA-seq experiment processed in the [make_data tutorial](https://github.com/calico/borzoi/tree/main/tutorials/latest/make_data). 4 | 5 | To train a 'Mini Borzoi' ensemble (~40M parameters, 2 cross-validation folds), run the script 'train_mini.sh'. The model parameters are specified in 'params_mini.json'. This model can be trained with a batch size of 2 on a 24GB NVIDIA Titan RTX or RTX4090 GPU. 6 | ```sh 7 | conda activate borzoi_py310 8 | cd ~/borzoi/tutorials/latest/train_model 9 | ./train_mini.sh 10 | ``` 11 | 12 | Alternatively, to train an even smaller 'Micro Borzoi' ensemble (~5M parameters), run the script 'train_micro.sh'. 
This model can fit into the above GPU cards with a batch size of 4, which means the learning rate can be doubled and each epoch finished in half the time. 13 | ```sh 14 | ./train_micro.sh 15 | ``` 16 | 17 | *Notes*: 18 | - See [here](https://github.com/calico/borzoi-paper/tree/main/model) for a description of the scripts called internally by the training .sh script. 19 | - Rather than cropping the output predictions before applying the training loss, in the latest version of Borzoi models a smooth position-specific loss weight is applied that penalizes prediction errors less at the left/right boundaries. 20 | -------------------------------------------------------------------------------- /tutorials/latest/train_model/params_micro.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "batch_size": 4, 4 | "shuffle_buffer": 256, 5 | "optimizer": "adam", 6 | "learning_rate": 0.0002, 7 | "loss": "poisson_mn", 8 | "total_weight": 0.2, 9 | "weight_range": 8, 10 | "weight_exp": 6, 11 | "warmup_steps": 10000, 12 | "global_clipnorm": 0.2, 13 | "adam_beta1": 0.9, 14 | "adam_beta2": 0.999, 15 | "patience": 30, 16 | "train_epochs_min": 130, 17 | "train_epochs_max": 180 18 | }, 19 | "model": { 20 | "seq_length": 393216, 21 | "augment_rc": true, 22 | "augment_shift": 3, 23 | "activation": "gelu", 24 | "norm_type": "batch", 25 | "bn_momentum": 0.9, 26 | "kernel_initializer": "lecun_normal", 27 | "l2_scale": 1.0e-6, 28 | "trunk": [ 29 | { 30 | "name": "conv_dna", 31 | "filters": 128, 32 | "kernel_size": 11, 33 | "norm_type": null, 34 | "activation": "linear", 35 | "pool_size": 2 36 | }, 37 | { 38 | "name": "res_tower", 39 | "filters_init": 160, 40 | "filters_end": 320, 41 | "divisible_by": 8, 42 | "kernel_size": 5, 43 | "num_convs": 1, 44 | "pool_size": 2, 45 | "repeat": 6 46 | }, 47 | { 48 | "name": "transformer_tower", 49 | "key_size": 32, 50 | "heads": 4, 51 | "num_position_features": 32, 52 | "dropout": 0.1, 53 | "attention_dropout": 0.01, 54 | "mha_l2_scale": 1.0e-8, 55 | "l2_scale": 1.0e-8, 56 | "kernel_initializer": "he_normal", 57 | "repeat": 4 58 | }, 59 | { 60 | "name": "unet_conv", 61 | "kernel_size": 3 62 | }, 63 | { 64 | "name": "unet_conv", 65 | "kernel_size": 3 66 | } 67 | ], 68 | "head_human": { 69 | "name": "final", 70 | "units": 2, 71 | "activation": "softplus" 72 | } 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /tutorials/latest/train_model/params_mini.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "batch_size": 2, 4 | "shuffle_buffer": 256, 5 | "optimizer": "adam", 6 | "learning_rate": 0.0001, 7 | "loss": "poisson_mn", 8 | "total_weight": 0.2, 9 | "weight_range": 8, 10 | "weight_exp": 6, 11 | "warmup_steps": 20000, 12 | "global_clipnorm": 0.1, 13 | "adam_beta1": 0.9, 14 | "adam_beta2": 0.999, 15 | "patience": 30, 16 | "train_epochs_min": 130, 17 | "train_epochs_max": 180 18 | }, 19 | "model": { 20 | "seq_length": 393216, 21 | "augment_rc": true, 22 | "augment_shift": 3, 23 | "activation": "gelu", 24 | "norm_type": "batch", 25 | "bn_momentum": 0.9, 26 | "kernel_initializer": "lecun_normal", 27 | "l2_scale": 5.0e-7, 28 | "trunk": [ 29 | { 30 | "name": "conv_dna", 31 | "filters": 320, 32 | "kernel_size": 11, 33 | "norm_type": null, 34 | "activation": "linear", 35 | "pool_size": 2 36 | }, 37 | { 38 | "name": "res_tower", 39 | "filters_init": 384, 40 | "filters_end": 768, 41 | "divisible_by": 16, 42 | "kernel_size": 
5,
43 |       "num_convs": 1,
44 |       "pool_size": 2,
45 |       "repeat": 6
46 |     },
47 |     {
48 |       "name": "transformer_tower",
49 |       "key_size": 64,
50 |       "heads": 4,
51 |       "num_position_features": 32,
52 |       "dropout": 0.2,
53 |       "mha_l2_scale": 1.0e-8,
54 |       "l2_scale": 1.0e-8,
55 |       "kernel_initializer": "he_normal",
56 |       "repeat": 8
57 |     },
58 |     {
59 |       "name": "unet_conv",
60 |       "kernel_size": 3
61 |     },
62 |     {
63 |       "name": "unet_conv",
64 |       "kernel_size": 3
65 |     }
66 |   ],
67 |   "head_human": {
68 |     "name": "final",
69 |     "units": 2,
70 |     "activation": "softplus"
71 |   }
72 | }
73 | }
74 | 
-------------------------------------------------------------------------------- /tutorials/latest/train_model/train_micro.sh: --------------------------------------------------------------------------------
1 | #!/bin/sh
2 | 
3 | westminster_train_folds.py -e borzoi_py310 -f 2 -c 1 -q rtx4090 -o micro_models params_micro.json ../make_data/data/hg38
4 | 
-------------------------------------------------------------------------------- /tutorials/latest/train_model/train_mini.sh: --------------------------------------------------------------------------------
1 | #!/bin/sh
2 | 
3 | westminster_train_folds.py -e borzoi_py310 -f 2 -c 1 -q rtx4090 -o mini_models params_mini.json ../make_data/data/hg38
4 | 
-------------------------------------------------------------------------------- /tutorials/legacy/interpret_sequence/README.md: --------------------------------------------------------------------------------
1 | ## Interpretation
2 | 
3 | This tutorial describes how to compute gradient saliency scores (sequence attributions) with respect to various statistics computed for a list of input genes specified in a .gtf file. This example uses the pre-trained, published Borzoi model to compute gradients. To download this model, run the script 'download_models.sh' in the 'borzoi' root folder.
4 | 
5 | First, to compute input gradients with respect to the log-sum of coverage across the exons of the target gene, run the script 'run_gradients_expr_CFHR2.sh'.
6 | ```sh
7 | conda activate borzoi_py310
8 | cd ~/borzoi/tutorials/legacy/interpret_sequence
9 | ./run_gradients_expr_CFHR2.sh
10 | ```
11 | 
12 | To compute input gradients with respect to the log-ratio of coverage immediately upstream and downstream of the distal polyA site of the target gene, run the script 'run_gradients_polya_CD99.sh'.
13 | ```sh
14 | ./run_gradients_polya_CD99.sh
15 | ```
16 | 
17 | To compute input gradients with respect to the log-ratio of coverage of an exon of the target gene relative to intronic coverage, run the script 'run_gradients_splice_GCFC2.sh'.
18 | ```sh
19 | ./run_gradients_splice_GCFC2.sh
20 | ```
21 | Currently, the splicing gradient script chooses one exon at random to compute gradients for. While this approach was favorable for the specific analysis of the manuscript, we acknowledge that this is not particularly useful to users wanting to investigate an exon of their choice. We plan on updating this script soon to allow users to specify which exon to calculate gradients for.
22 | 
23 | *Notes*:
24 | - The track scale, squashing exponentiation, and clip-soft threshold are specified in the .py script arguments (flags: '--track_scale', '--track_transform', '--clip_soft'), and the values in the targets file are ignored. This means that the same data transformation parameters are applied to all tracks specified in the targets file.
To calculate gradients for groups of tracks with different data transforms, separate these tracks into different targets files, and execute the gradient script on each group separately. 25 | - The legacy data transforms are activated in all above .sh scripts with the flag '--untransform_old'. 26 | -------------------------------------------------------------------------------- /tutorials/legacy/interpret_sequence/run_gradients_expr_CFHR2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | borzoi_satg_gene.py -o ../../../examples/saved_models/gtex_CFHR2 -f 3 -c 0 --rc --untransform_old --track_scale 0.01 --track_transform 0.75 --clip_soft 384.0 -t ../../../examples/targets_gtex_liver.txt ../../../examples/params_pred.json ../../../examples/saved_models ../../../examples/CFHR2_example.gtf 4 | -------------------------------------------------------------------------------- /tutorials/legacy/interpret_sequence/run_gradients_polya_CD99.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | borzoi_satg_polya.py -o ../../../examples/saved_models/gtex_CD99 -f 3 -c 0 --rc --untransform_old --track_scale 0.01 --track_transform 0.75 --clip_soft 384.0 -t ../../../examples/targets_gtex.txt ../../../examples/params_pred.json ../../../examples/saved_models ../../../examples/CD99_example.gtf 4 | -------------------------------------------------------------------------------- /tutorials/legacy/interpret_sequence/run_gradients_splice_GCFC2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | borzoi_satg_splice.py -o ../../../examples/saved_models/gtex_GCFC2 -f 3 -c 0 --rc --untransform_old --track_scale 0.01 --track_transform 0.75 --clip_soft 384.0 -t ../../../examples/targets_gtex.txt ../../../examples/params_pred.json ../../../examples/saved_models ../../../examples/GCFC2_example.gtf 4 | -------------------------------------------------------------------------------- /tutorials/legacy/interpret_sequence/vis_helpers.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import numpy as np 4 | 5 | import matplotlib.pyplot as plt 6 | 7 | import matplotlib.cm as cm 8 | import matplotlib.colors as colors 9 | 10 | import matplotlib as mpl 11 | from matplotlib.text import TextPath 12 | from matplotlib.patches import PathPatch, Rectangle 13 | from matplotlib.font_manager import FontProperties 14 | from matplotlib import gridspec 15 | from matplotlib.ticker import FormatStrFormatter 16 | 17 | #Helper function to draw a letter at a given position 18 | def dna_letter_at(letter, x, y, yscale=1, ax=None, color=None, alpha=1.0): 19 | 20 | fp = FontProperties(family="DejaVu Sans", weight="bold") 21 | globscale = 1.35 22 | LETTERS = { "T" : TextPath((-0.305, 0), "T", size=1, prop=fp), 23 | "G" : TextPath((-0.384, 0), "G", size=1, prop=fp), 24 | "A" : TextPath((-0.35, 0), "A", size=1, prop=fp), 25 | "C" : TextPath((-0.366, 0), "C", size=1, prop=fp), 26 | "UP" : TextPath((-0.488, 0), '$\\Uparrow$', size=1, prop=fp), 27 | "DN" : TextPath((-0.488, 0), '$\\Downarrow$', size=1, prop=fp), 28 | "(" : TextPath((-0.25, 0), "(", size=1, prop=fp), 29 | "." 
: TextPath((-0.125, 0), "-", size=1, prop=fp), 30 | ")" : TextPath((-0.1, 0), ")", size=1, prop=fp)} 31 | COLOR_SCHEME = {'G': 'orange',#'orange', 32 | 'A': 'green',#'red', 33 | 'C': 'blue',#'blue', 34 | 'T': 'red',#'darkgreen', 35 | 'UP': 'green', 36 | 'DN': 'red', 37 | '(': 'black', 38 | '.': 'black', 39 | ')': 'black'} 40 | 41 | 42 | text = LETTERS[letter] 43 | 44 | chosen_color = COLOR_SCHEME[letter] 45 | if color is not None : 46 | chosen_color = color 47 | 48 | t = mpl.transforms.Affine2D().scale(1*globscale, yscale*globscale) + \ 49 | mpl.transforms.Affine2D().translate(x,y) + ax.transData 50 | p = PathPatch(text, lw=0, fc=chosen_color, alpha=alpha, transform=t) 51 | if ax != None: 52 | ax.add_artist(p) 53 | return p 54 | 55 | #Function to plot sequence logo 56 | def plot_seq_scores(importance_scores, figsize=(16, 2), plot_y_ticks=True, y_min=None, y_max=None, save_figs=False, fig_name="default") : 57 | 58 | importance_scores = importance_scores.T 59 | 60 | fig = plt.figure(figsize=figsize) 61 | 62 | ref_seq = "" 63 | for j in range(importance_scores.shape[1]) : 64 | argmax_nt = np.argmax(np.abs(importance_scores[:, j])) 65 | 66 | if argmax_nt == 0 : 67 | ref_seq += "A" 68 | elif argmax_nt == 1 : 69 | ref_seq += "C" 70 | elif argmax_nt == 2 : 71 | ref_seq += "G" 72 | elif argmax_nt == 3 : 73 | ref_seq += "T" 74 | 75 | ax = plt.gca() 76 | 77 | for i in range(0, len(ref_seq)) : 78 | mutability_score = np.sum(importance_scores[:, i]) 79 | color = None 80 | dna_letter_at(ref_seq[i], i + 0.5, 0, mutability_score, ax, color=color) 81 | 82 | plt.sca(ax) 83 | plt.xticks([], []) 84 | plt.gca().yaxis.set_major_formatter(FormatStrFormatter('%.3f')) 85 | 86 | plt.xlim((0, len(ref_seq))) 87 | 88 | #plt.axis('off') 89 | 90 | if plot_y_ticks : 91 | plt.yticks(fontsize=12) 92 | else : 93 | plt.yticks([], []) 94 | 95 | if y_min is not None and y_max is not None : 96 | plt.ylim(y_min, y_max) 97 | elif y_min is not None : 98 | plt.ylim(y_min) 99 | else : 100 | plt.ylim( 101 | np.min(importance_scores) - 0.1 * np.max(np.abs(importance_scores)), 102 | np.max(importance_scores) + 0.1 * np.max(np.abs(importance_scores)) 103 | ) 104 | 105 | plt.axhline(y=0., color='black', linestyle='-', linewidth=1) 106 | 107 | #for axis in fig.axes : 108 | # axis.get_xaxis().set_visible(False) 109 | # axis.get_yaxis().set_visible(False) 110 | 111 | plt.tight_layout() 112 | 113 | if save_figs : 114 | plt.savefig(fig_name + ".png", transparent=True, dpi=300) 115 | plt.savefig(fig_name + ".eps") 116 | 117 | plt.show() 118 | 119 | #Function to visualize a pair of sequence logos 120 | def visualize_input_gradient_pair(att_grad_wt, att_grad_mut, plot_start=0, plot_end=100, save_figs=False, fig_name='') : 121 | 122 | scores_wt = att_grad_wt[plot_start:plot_end, :] 123 | scores_mut = att_grad_mut[plot_start:plot_end, :] 124 | 125 | y_min = min(np.min(scores_wt), np.min(scores_mut)) 126 | y_max = max(np.max(scores_wt), np.max(scores_mut)) 127 | 128 | y_max_abs = max(np.abs(y_min), np.abs(y_max)) 129 | 130 | y_min = y_min - 0.05 * y_max_abs 131 | y_max = y_max + 0.05 * y_max_abs 132 | 133 | if np.sum(scores_mut) != 0. : 134 | print("--- WT ---") 135 | 136 | plot_seq_scores( 137 | scores_wt, y_min=y_min, y_max=y_max, 138 | figsize=(8, 1), 139 | plot_y_ticks=False, 140 | save_figs=save_figs, 141 | fig_name=fig_name + '_wt', 142 | ) 143 | 144 | if np.sum(scores_mut) != 0. 
: 145 | 146 | print("--- Mut ---") 147 | plot_seq_scores( 148 | scores_mut, y_min=y_min, y_max=y_max, 149 | figsize=(8, 1), 150 | plot_y_ticks=False, 151 | save_figs=save_figs, 152 | fig_name=fig_name + '_mut', 153 | ) 154 | -------------------------------------------------------------------------------- /tutorials/legacy/make_data/Makefile: -------------------------------------------------------------------------------- 1 | FASTA_HUMAN=$$BORZOI_HG38/assembly/ucsc/hg38.ml.fa 2 | GAPS_HUMAN=$$BORZOI_HG38/assembly/ucsc/hg38_gaps.bed 3 | UMAP_HUMAN=$$BORZOI_HG38/mappability/umap_k36_t10_l32.bed 4 | BLACK_HUMAN=$$BORZOI_HG38/blacklist/blacklist_hg38_all.bed 5 | 6 | FASTA_MOUSE=$$BORZOI_MM10/assembly/ucsc/mm10.ml.fa 7 | GAPS_MOUSE=$$BORZOI_MM10/assembly/ucsc/mm10_gaps.bed 8 | UMAP_MOUSE=$$BORZOI_MM10/mappability/umap_k36_t10_l32.bed 9 | BLACK_MOUSE=$$BORZOI_MM10/blacklist/blacklist_mm10_all.bed 10 | 11 | ALIGN=$$BORZOI_HG38/align/hg38.mm10.syn.net.gz 12 | 13 | OUT=data 14 | 15 | # mini borzoi configuration 16 | LENGTH=393216 17 | TSTRIDE=65551 # (393216-2*98304)/3 + 15 18 | CROP=98304 19 | WIDTH=32 20 | FOLDS=8 21 | 22 | AOPTS=--break 2097152 -c $(CROP) --nf 524288 --no 393216 -l $(LENGTH) --stride $(TSTRIDE) -f $(FOLDS) --umap_t 0.5 -w $(WIDTH) 23 | DOPTS=-c $(CROP) -d 2 -f $(FOLDS) -l $(LENGTH) -p 64 -r 16 --umap_clip 0.5 -w $(WIDTH) --transform_old 24 | 25 | all: $(OUT)/hg38/tfrecords/train-0.tfr # $(OUT)/mm10/tfrecords/train-0.tfr 26 | 27 | umap_human.bed: 28 | 	cat $(UMAP_HUMAN) $(BLACK_HUMAN) | awk 'BEGIN {OFS="\t"} {print $$1, $$2, $$3}' | bedtools sort -i - | bedtools merge -i - > umap_human.bed 29 | 30 | umap_mouse.bed: 31 | 	cat $(UMAP_MOUSE) $(BLACK_MOUSE) | awk 'BEGIN {OFS="\t"} {print $$1, $$2, $$3}' | bedtools sort -i - | bedtools merge -i - > umap_mouse.bed 32 | 33 | # targets file is already generated in this example 34 | #targets_human.txt targets_mouse.txt: 35 | #	./make_targets.py 36 | 37 | $(OUT)/hg38/sequences.bed $(OUT)/mm10/sequences.bed: umap_human.bed umap_mouse.bed 38 | 	hound_data_align.py -a hg38,mm10 -g $(GAPS_HUMAN),$(GAPS_MOUSE) -u umap_human.bed,umap_mouse.bed $(AOPTS) -o $(OUT) $(ALIGN) $(FASTA_HUMAN),$(FASTA_MOUSE) 39 | 40 | $(OUT)/hg38/tfrecords/train-0.tfr: $(OUT)/hg38/sequences.bed targets_human.txt 41 | 	hound_data.py --restart $(DOPTS) -b $(BLACK_HUMAN) -o $(OUT)/hg38 $(FASTA_HUMAN) -u umap_human.bed targets_human.txt 42 | 43 | # no mouse data in this example 44 | #$(OUT)/mm10/tfrecords/train-0.tfr: $(OUT)/mm10/sequences.bed targets_mouse.txt 45 | #	hound_data.py --restart $(DOPTS) -b $(BLACK_MOUSE) -o $(OUT)/mm10 $(FASTA_MOUSE) -u umap_mouse.bed targets_mouse.txt 46 | -------------------------------------------------------------------------------- /tutorials/legacy/make_data/README.md: -------------------------------------------------------------------------------- 1 | ## Data Processing 2 | 3 | This tutorial describes how to process a .bigwig sequencing experiment into compressed .w5 format, merge replicates, generate QC metrics, and finally create TFRecord files containing binned coverage values suitable for training Borzoi models. We will exemplify this for the ENCODE K562 RNA-seq experiment [ENCSR000AEL](https://www.encodeproject.org/experiments/ENCSR000AEL/). 4 | 5 | First, activate the conda environment and run the script 'download_dependencies.sh' to download required auxiliary files.
6 | ```sh 7 | conda activate borzoi_py310 8 | cd ~/borzoi/tutorials/legacy/make_data 9 | ./download_dependencies.sh 10 | ``` 11 | 12 | Next, run the script 'download_bw.sh' to download sample ENCODE .bigwig files and arrange them in a folder structure. 13 | ```sh 14 | ./download_bw.sh 15 | ``` 16 | 17 | Then run the script 'process_w5.sh' to generate compressed .w5 files (hdf5) from the input .bigwig files, merge the two replicates, and calculate basic QC metrics. This .sh script internally calls 'bw_h5.py' to generate .w5 files, 'w5_merge.py' to merge replicates, and 'w5_qc.py' to calculate QC metrics. 18 | ```sh 19 | ./process_w5.sh 20 | ``` 21 | 22 | Finally, run the Makefile to create genome-wide binned coverage tracks, stored as compressed TFRecords. 23 | ```sh 24 | make 25 | ``` 26 | 27 | In this example, the Makefile creates 8 cross-validation folds of TFRecords with input sequences of length 393216 bp, generated with a genome-wide stride of 65551 bp (which is ~1/3 of the cropped sequence length, but shifts the bin boundaries, too). The output coverage tracks corresponding to each input sequence are cropped by 98304 bp on each side, before pooling the measurements into 32 bp bins. This results in 6144 coverage bins per 393kb sequence. The specific .w5 tracks to include in the TFRecord generation, and the scales and pooling transforms applied to the bins of each experiment, are given in the targets file 'targets_human.txt'. Below is a description of the columns in this file; a short code sketch of the resulting bin transform follows the notes. 28 | 29 | *targets_human.txt*: 30 | - (unnamed) => integer index of each track (must start from 0 when training a new model). 31 | - 'identifier' => unique identifier of each experiment (and strand). 32 | - 'file' => local file path to .w5 file. 33 | - 'clip' => hard clipping threshold to be applied to each bin, after soft-clipping. 34 | - 'clip_soft' => soft clipping (squashing) threshold. 35 | - 'scale' => scale value applied to each 32 bp bin after clipping. 36 | - 'sum_stat' => type of bin-level pooling operation ('sum_sqrt' = sum and exponentiate by 3/4). 37 | - 'strand_pair' => integer index of the other stranded track of an experiment (same index as current row if unstranded). 38 | - 'description' => text description of experiment. 39 | 40 | *Notes*: 41 | - See [here](https://github.com/calico/borzoi-paper/tree/main/data/training) for a description of the scripts called by the Makefile to create TFRecords. 42 | - Of note, the **legacy** settings are activated in these data processing scripts with the flag '--transform_old' in the Makefile. 43 | - The **legacy** approach crops the coverage tracks, a practice we have since abandoned in favor of a position-specific loss scale.
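To make the column semantics concrete, below is a minimal numpy sketch of the bin-level transform implied by the column descriptions above. This is illustrative only: the authoritative implementation lives in the data scripts called by the Makefile, and both the exact squashing form (square-root compression above 'clip_soft') and the order of operations are assumptions on our part.

```python
import numpy as np

# Illustrative sketch (not the pipeline code): transform one track's per-bp
# coverage using the example row of targets_human.txt (scale=0.3,
# sum_stat='sum_sqrt', clip_soft=384, clip=768, 32 bp bins).
def transform_track(cov_bp, width=32, scale=0.3, clip_soft=384, clip=768):
    # pool per-bp coverage into 32 bp bins ('sum'), then exponentiate by 3/4 ('sum_sqrt')
    n_bins = len(cov_bp) // width
    bins = cov_bp[: n_bins * width].reshape(n_bins, width).sum(axis=1) ** 0.75
    # soft-clip: assumed square-root compression of values above 'clip_soft'
    over = bins > clip_soft
    bins[over] = clip_soft + np.sqrt(bins[over] - clip_soft)
    # hard-clip at 'clip', then apply the track 'scale'
    return np.clip(bins, None, clip) * scale

# one cropped training sequence: (393216 - 2*98304) bp -> 6144 bins
cov = np.random.default_rng(0).poisson(2.0, size=393216 - 2 * 98304).astype(float)
print(transform_track(cov).shape)  # (6144,)
```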
44 | -------------------------------------------------------------------------------- /tutorials/legacy/make_data/download_bw.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # download example data from ENCODE (ENCSR000AEL - K562 RNA-seq); 2 replicates 4 | 5 | # define ENCODE ID 6 | ENC_ID='ENCSR000AEL' 7 | 8 | # define remote urls 9 | URL_P_REP1='https://www.encodeproject.org/files/ENCFF980ZHM/@@download/ENCFF980ZHM.bigWig' 10 | URL_M_REP1='https://www.encodeproject.org/files/ENCFF533LJF/@@download/ENCFF533LJF.bigWig' 11 | 12 | URL_P_REP2='https://www.encodeproject.org/files/ENCFF335LVS/@@download/ENCFF335LVS.bigWig' 13 | URL_M_REP2='https://www.encodeproject.org/files/ENCFF257NOL/@@download/ENCFF257NOL.bigWig' 14 | 15 | # define ENCODE file IDs 16 | FILE_P_REP1='ENCFF980ZHM' 17 | FILE_M_REP1='ENCFF533LJF' 18 | 19 | FILE_P_REP2='ENCFF335LVS' 20 | FILE_M_REP2='ENCFF257NOL' 21 | 22 | # create folder for bigwig files 23 | mkdir -p "human/rna/encode/$ENC_ID/rep1" 24 | mkdir -p "human/rna/encode/$ENC_ID/rep2" 25 | 26 | 27 | # download bigwig files; rep1 28 | if [ -f "human/rna/encode/$ENC_ID/rep1/$FILE_P_REP1.bigWig" ]; then 29 | echo "example RNA-seq data already downloaded (rep 1)." 30 | else 31 | wget $URL_P_REP1 -O "human/rna/encode/$ENC_ID/rep1/$FILE_P_REP1.bigWig" 32 | wget $URL_M_REP1 -O "human/rna/encode/$ENC_ID/rep1/$FILE_M_REP1.bigWig" 33 | fi 34 | 35 | # download bigwig files; rep2 36 | if [ -f "human/rna/encode/$ENC_ID/rep2/$FILE_P_REP2.bigWig" ]; then 37 | echo "example RNA-seq data already downloaded (rep 2)." 38 | else 39 | wget $URL_P_REP2 -O "human/rna/encode/$ENC_ID/rep2/$FILE_P_REP2.bigWig" 40 | wget $URL_M_REP2 -O "human/rna/encode/$ENC_ID/rep2/$FILE_M_REP2.bigWig" 41 | fi 42 | -------------------------------------------------------------------------------- /tutorials/legacy/make_data/download_dependencies.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # create additional folders in borzoi data folders 4 | mkdir -p "$BORZOI_HG38/assembly/ucsc" 5 | mkdir -p "$BORZOI_HG38/assembly/gnomad" 6 | mkdir -p "$BORZOI_HG38/mappability" 7 | mkdir -p "$BORZOI_HG38/blacklist" 8 | mkdir -p "$BORZOI_HG38/align" 9 | 10 | mkdir -p "$BORZOI_MM10/assembly/ucsc" 11 | mkdir -p "$BORZOI_MM10/mappability" 12 | mkdir -p "$BORZOI_MM10/blacklist" 13 | 14 | 15 | # download and uncompress auxiliary files required for Makefile (hg38) 16 | if [ -f "$BORZOI_HG38/assembly/ucsc/hg38_gaps.bed" ]; then 17 | echo "hg38_gaps.bed already exists." 18 | else 19 | wget -O - https://storage.googleapis.com/seqnn-share/helper/dependencies/hg38_gaps.bed.gz | gunzip -c > "$BORZOI_HG38/assembly/ucsc/hg38_gaps.bed" 20 | fi 21 | 22 | if [ -f "$BORZOI_HG38/mappability/umap_k36_t10_l32.bed" ]; then 23 | echo "umap_k36_t10_l32.bed (hg38) already exists." 24 | else 25 | wget -O - https://storage.googleapis.com/seqnn-share/helper/dependencies/umap_k36_t10_l32_hg38.bed.gz | gunzip -c > "$BORZOI_HG38/mappability/umap_k36_t10_l32.bed" 26 | fi 27 | 28 | if [ -f "$BORZOI_HG38/blacklist/blacklist_hg38_all.bed" ]; then 29 | echo "blacklist_hg38_all.bed already exists." 30 | else 31 | wget -O - https://storage.googleapis.com/seqnn-share/helper/dependencies/blacklist_hg38_all.bed.gz | gunzip -c > "$BORZOI_HG38/blacklist/blacklist_hg38_all.bed" 32 | fi 33 | 34 | if [ -f "$BORZOI_HG38/align/hg38.mm10.syn.net.gz" ]; then 35 | echo "hg38.mm10.syn.net.gz (hg38/mm10 alignment net) already exists."
36 | else 37 | wget https://storage.googleapis.com/seqnn-share/helper/dependencies/hg38.mm10.syn.net.gz -O "$BORZOI_HG38/align/hg38.mm10.syn.net.gz" 38 | fi 39 | 40 | 41 | # download and uncompress auxiliary files required for Makefile (mm10) 42 | if [ -f "$BORZOI_MM10/assembly/ucsc/mm10_gaps.bed" ]; then 43 | echo "mm10_gaps.bed already exists." 44 | else 45 | wget -O - https://storage.googleapis.com/seqnn-share/helper/dependencies/mm10_gaps.bed.gz | gunzip -c > "$BORZOI_MM10/assembly/ucsc/mm10_gaps.bed" 46 | fi 47 | 48 | if [ -f "$BORZOI_MM10/mappability/umap_k36_t10_l32.bed" ]; then 49 | echo "umap_k36_t10_l32.bed (mm10) already exists." 50 | else 51 | wget -O - https://storage.googleapis.com/seqnn-share/helper/dependencies/umap_k36_t10_l32_mm10.bed.gz | gunzip -c > "$BORZOI_MM10/mappability/umap_k36_t10_l32.bed" 52 | fi 53 | 54 | if [ -f "$BORZOI_MM10/blacklist/blacklist_mm10_all.bed" ]; then 55 | echo "blacklist_mm10_all.bed already exists." 56 | else 57 | wget -O - https://storage.googleapis.com/seqnn-share/helper/dependencies/blacklist_mm10_all.bed.gz | gunzip -c > "$BORZOI_MM10/blacklist/blacklist_mm10_all.bed" 58 | fi 59 | 60 | 61 | # download and uncompress pre-compiled umap bed files 62 | if [ -f umap_human.bed ]; then 63 | echo "umap_human.bed already exists." 64 | else 65 | wget -O - https://storage.googleapis.com/seqnn-share/helper/dependencies/umap_human.bed.gz | gunzip -c > umap_human.bed 66 | fi 67 | 68 | if [ -f umap_mouse.bed ]; then 69 | echo "umap_mouse.bed already exists." 70 | else 71 | wget -O - https://storage.googleapis.com/seqnn-share/helper/dependencies/umap_mouse.bed.gz | gunzip -c > umap_mouse.bed 72 | fi 73 | 74 | 75 | # download and index hg38 ml genome 76 | if [ -f "$BORZOI_HG38/assembly/ucsc/hg38.ml.fa" ]; then 77 | echo "hg38.ml.fa already exists." 78 | else 79 | wget -O - https://storage.googleapis.com/seqnn-share/helper/dependencies/hg38.ml.fa.gz | gunzip -c > "$BORZOI_HG38/assembly/ucsc/hg38.ml.fa" 80 | idx_genome.py "$BORZOI_HG38/assembly/ucsc/hg38.ml.fa" 81 | fi 82 | 83 | # download and index hg38 ml genome (gnomad major alleles) 84 | if [ -f "$BORZOI_HG38/assembly/gnomad/hg38.ml.fa" ]; then 85 | echo "hg38.ml.fa (gnomad) already exists." 86 | else 87 | wget -O - https://storage.googleapis.com/seqnn-share/helper/dependencies/hg38_gnomad.ml.fa.gz | gunzip -c > "$BORZOI_HG38/assembly/gnomad/hg38.ml.fa" 88 | idx_genome.py "$BORZOI_HG38/assembly/gnomad/hg38.ml.fa" 89 | fi 90 | 91 | # download and index mm10 ml genome 92 | if [ -f "$BORZOI_MM10/assembly/ucsc/mm10.ml.fa" ]; then 93 | echo "mm10.ml.fa already exists." 
94 | else 95 | wget -O - https://storage.googleapis.com/seqnn-share/helper/dependencies/mm10.ml.fa.gz | gunzip -c > "$BORZOI_MM10/assembly/ucsc/mm10.ml.fa" 96 | idx_genome.py "$BORZOI_MM10/assembly/ucsc/mm10.ml.fa" 97 | fi 98 | -------------------------------------------------------------------------------- /tutorials/legacy/make_data/process_w5.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # merge bigwig replicates, generate .w5 files and run qc 4 | 5 | # define ENCODE ID 6 | ENC_ID='ENCSR000AEL' 7 | 8 | # define ENCODE file IDs 9 | FILE_P_REP1='ENCFF980ZHM' 10 | FILE_M_REP1='ENCFF533LJF' 11 | 12 | FILE_P_REP2='ENCFF335LVS' 13 | FILE_M_REP2='ENCFF257NOL' 14 | 15 | # create folder for merged replicate files 16 | mkdir -p "human/rna/encode/$ENC_ID/summary" 17 | 18 | 19 | # step 1: generate per-replicate .w5 files 20 | 21 | # rep1 22 | if [ -f "human/rna/encode/$ENC_ID/rep1/$FILE_P_REP1+.w5" ]; then 23 | echo "example RNA-seq .w5 already exists (rep 1)." 24 | else 25 | bw_h5.py -z "human/rna/encode/$ENC_ID/rep1/$FILE_P_REP1.bigWig" "human/rna/encode/$ENC_ID/rep1/$FILE_P_REP1+.w5" 26 | bw_h5.py -z "human/rna/encode/$ENC_ID/rep1/$FILE_M_REP1.bigWig" "human/rna/encode/$ENC_ID/rep1/$FILE_M_REP1-.w5" 27 | fi 28 | 29 | # rep2 30 | if [ -f "human/rna/encode/$ENC_ID/rep2/$FILE_P_REP2+.w5" ]; then 31 | echo "example RNA-seq .w5 already exists (rep 2)." 32 | else 33 | bw_h5.py -z "human/rna/encode/$ENC_ID/rep2/$FILE_P_REP2.bigWig" "human/rna/encode/$ENC_ID/rep2/$FILE_P_REP2+.w5" 34 | bw_h5.py -z "human/rna/encode/$ENC_ID/rep2/$FILE_M_REP2.bigWig" "human/rna/encode/$ENC_ID/rep2/$FILE_M_REP2-.w5" 35 | fi 36 | 37 | 38 | # step 2: merge replicates 39 | 40 | if [ -f "human/rna/encode/$ENC_ID/summary/coverage+.w5" ]; then 41 | echo "example RNA-seq .w5 already exists (merged)." 42 | else 43 | w5_merge.py -w -s mean -z "human/rna/encode/$ENC_ID/summary/coverage+.w5" "human/rna/encode/$ENC_ID/rep1/$FILE_P_REP1+.w5" "human/rna/encode/$ENC_ID/rep2/$FILE_P_REP2+.w5" 44 | w5_merge.py -w -s mean -z "human/rna/encode/$ENC_ID/summary/coverage-.w5" "human/rna/encode/$ENC_ID/rep1/$FILE_M_REP1-.w5" "human/rna/encode/$ENC_ID/rep2/$FILE_M_REP2-.w5" 45 | fi 46 | 47 | 48 | # step 3: run qc on each replicate and the merged file 49 | 50 | if [ -f "human/rna/encode/$ENC_ID/summary/covqc/means.txt" ]; then 51 | echo "qc statistics already exist." 
52 | else 53 | # rep1 54 | w5_qc.py -b "$BORZOI_HG38/blacklist/blacklist_hg38_all.bed" -o "human/rna/encode/$ENC_ID/rep1/covqc" "human/rna/encode/$ENC_ID/rep1/$FILE_P_REP1+.w5" 55 | w5_qc.py -b "$BORZOI_HG38/blacklist/blacklist_hg38_all.bed" -o "human/rna/encode/$ENC_ID/rep1/covqc_m" "human/rna/encode/$ENC_ID/rep1/$FILE_M_REP1-.w5" 56 | 57 | # rep2 58 | w5_qc.py -b "$BORZOI_HG38/blacklist/blacklist_hg38_all.bed" -o "human/rna/encode/$ENC_ID/rep2/covqc" "human/rna/encode/$ENC_ID/rep2/$FILE_P_REP2+.w5" 59 | w5_qc.py -b "$BORZOI_HG38/blacklist/blacklist_hg38_all.bed" -o "human/rna/encode/$ENC_ID/rep2/covqc_m" "human/rna/encode/$ENC_ID/rep2/$FILE_M_REP2-.w5" 60 | 61 | # summary 62 | w5_qc.py -b "$BORZOI_HG38/blacklist/blacklist_hg38_all.bed" -o "human/rna/encode/$ENC_ID/summary/covqc" "human/rna/encode/$ENC_ID/summary/coverage+.w5" 63 | w5_qc.py -b "$BORZOI_HG38/blacklist/blacklist_hg38_all.bed" -o "human/rna/encode/$ENC_ID/summary/covqc_m" "human/rna/encode/$ENC_ID/summary/coverage-.w5" 64 | fi 65 | 66 | -------------------------------------------------------------------------------- /tutorials/legacy/make_data/targets_human.txt: -------------------------------------------------------------------------------- 1 | identifier file clip clip_soft scale sum_stat strand_pair description 2 | 0 ENCFF980ZHM+ human/rna/encode/ENCSR000AEL/summary/coverage+.w5 768 384 0.3 sum_sqrt 1 RNA:K562 3 | 1 ENCFF980ZHM- human/rna/encode/ENCSR000AEL/summary/coverage-.w5 768 384 0.3 sum_sqrt 0 RNA:K562 4 | -------------------------------------------------------------------------------- /tutorials/legacy/score_variants/README.md: -------------------------------------------------------------------------------- 1 | ## Variant Scoring 2 | 3 | This tutorial describes how to predict variant effect scores for a small set of SNVs defined in a .vcf file. For examples showcasing variant effect prediction at a larger scale (e.g. fine-mapped eQTL classification benchmarks), we refer the user to the [borzoi-paper repository](https://github.com/calico/borzoi-paper/tree/main). This example uses the pre-trained, published Borzoi model to predict variant effects. To download this model, run the script 'download_models.sh' in the 'borzoi' root folder. 4 | 5 | First, to calculate **gene-specific expression** scores, run the script 'score_expr_sed.sh'. Two different statistics are computed: (1) logSED (gene expression log fold change), and (2) logD2 (bin-level L2 norm across the coverage profile intersecting the exons of the gene). 6 | ```sh 7 | conda activate borzoi_py310 8 | cd ~/borzoi/tutorials/legacy/score_variants 9 | ./score_expr_sed.sh 10 | ``` 11 | 12 | To calculate **gene-agnostic expression** scores, run the script 'score_expr_sad.sh'. One statistic is computed: logD2 (bin-level L2 norm across the entire predicted coverage track). 13 | ```sh 14 | ./score_expr_sad.sh 15 | ``` 16 | 17 | To calculate **gene-specific polyadenylation** scores, run the script 'score_polya.sh'. One statistic is computed: COVR (3' coverage ratio across pA junctions of the target gene). 18 | ```sh 19 | ./score_polya.sh 20 | ``` 21 | 22 | To calculate **gene-specific splicing** scores, run the script 'score_splice.sh'. One statistic is computed: nDi (normalized maximum absolute difference in coverage bins across the target gene span). 23 | ```sh 24 | ./score_splice.sh 25 | ``` 26 | 27 | Finally, the jupyter notebook 'run_variant_scripts.ipynb' is provided to conveniently execute all of the above scripts.
The notebook also exemplifies how to navigate the variant prediction hdf5 files and print some example scores. 28 | 29 | *Notes*: 30 | - The legacy data transforms are activated in all above .sh scripts with the flag '-u'. 31 | 32 | -------------------------------------------------------------------------------- /tutorials/legacy/score_variants/run_variant_scripts.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "f5d0f9fb", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import os\n", 11 | "import sys\n", 12 | "import h5py\n" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "id": "7a94cbf8", 19 | "metadata": { 20 | "scrolled": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "#Calculate gene-specific variant effect scores\n", 25 | "\n", 26 | "!./score_expr_sed.sh\n" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "id": "1047ff0f", 33 | "metadata": {}, 34 | "outputs": [ 35 | { 36 | "name": "stdout", 37 | "output_type": "stream", 38 | "text": [ 39 | "score: 'logSED', snp: 'chr1_46309111_A_G_b38', gene: 'ENSG00000237090.1', track: 'RNA:adipose_tissue' => -0.2551\n" 40 | ] 41 | } 42 | ], 43 | "source": [ 44 | "#Print an example variant effect prediction for a SNP-gene pair (gene-specific expression)\n", 45 | "\n", 46 | "sed_h5 = h5py.File('snp_sed/f3c0/sed.h5', 'r')\n", 47 | "\n", 48 | "row_ix = 63\n", 49 | "target_ix = 0\n", 50 | "\n", 51 | "print(\"score: 'logSED', snp: '\" + str(sed_h5['snp'][sed_h5['si'][row_ix]].decode()) + \"', gene: '\" + str(sed_h5['gene'][sed_h5['si'][row_ix]].decode()) + \"', track: '\" + str(sed_h5['target_labels'][target_ix].decode()) + \"' => \" + str(round(sed_h5['logSED'][row_ix, target_ix], 4)))\n" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "id": "f105ecd9", 58 | "metadata": { 59 | "scrolled": true 60 | }, 61 | "outputs": [], 62 | "source": [ 63 | "#Calculate gene-agnostic variant effect scores\n", 64 | "\n", 65 | "!./score_expr_sad.sh\n" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 3, 71 | "id": "96e4f7cb", 72 | "metadata": {}, 73 | "outputs": [ 74 | { 75 | "name": "stdout", 76 | "output_type": "stream", 77 | "text": [ 78 | "score: 'logD2', snp: 'chr1_43120331_C_T_b38', track: 'RNA:adipose_tissue' => 0.1057\n" 79 | ] 80 | } 81 | ], 82 | "source": [ 83 | "#Print an example variant effect prediction for a SNP (gene-agnostic expression)\n", 84 | "\n", 85 | "sad_h5 = h5py.File('snp_sad/f3c0/sad.h5', 'r')\n", 86 | "\n", 87 | "snp_ix = 1\n", 88 | "target_ix = 0\n", 89 | "\n", 90 | "print(\"score: 'logD2', snp: '\" + str(sad_h5['snp'][snp_ix].decode()) + \"', track: '\" + str(sad_h5['target_labels'][target_ix].decode()) + \"' => \" + str(round(sad_h5['logD2'][snp_ix, target_ix], 4)))\n" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "id": "c56efaef", 97 | "metadata": { 98 | "scrolled": true 99 | }, 100 | "outputs": [], 101 | "source": [ 102 | "#Calculate splice variant effect scores\n", 103 | "\n", 104 | "!./score_splice.sh\n" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 4, 110 | "id": "980993fc", 111 | "metadata": {}, 112 | "outputs": [ 113 | { 114 | "name": "stdout", 115 | "output_type": "stream", 116 | "text": [ 117 | "score: 'nDi', snp: 'chr1_156236330_G_A', gene: 'ENSG00000225905.1', track: 'RNA:foreskin fibroblast male 
newborn' => 0.0022\n" 118 | ] 119 | } 120 | ], 121 | "source": [ 122 | "#Print an example variant effect prediction for a SNP-gene pair (splicing)\n", 123 | "\n", 124 | "sed_h5 = h5py.File('snp_splice/f3c0/sed.h5', 'r')\n", 125 | "\n", 126 | "row_ix = 116\n", 127 | "target_ix = 755\n", 128 | "\n", 129 | "print(\"score: 'nDi', snp: '\" + str(sed_h5['snp'][sed_h5['si'][row_ix]].decode()) + \"', gene: '\" + str(sed_h5['gene'][sed_h5['si'][row_ix]].decode()) + \"', track: '\" + str(sed_h5['target_labels'][target_ix].decode()) + \"' => \" + str(round(sed_h5['nDi'][row_ix, target_ix], 4)))\n" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "id": "05cccfb6", 136 | "metadata": { 137 | "scrolled": true 138 | }, 139 | "outputs": [], 140 | "source": [ 141 | "#Calculate polyadenylation variant effect scores\n", 142 | "\n", 143 | "!./score_polya.sh\n" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 5, 149 | "id": "43ac562f", 150 | "metadata": {}, 151 | "outputs": [ 152 | { 153 | "name": "stdout", 154 | "output_type": "stream", 155 | "text": [ 156 | "score: 'COVR', snp: 'chr16_80976052_T_G', gene: 'ENSG00000132879.14', track: 'RNA:HeLa-S3 nuclear fraction' => 0.0628\n" 157 | ] 158 | } 159 | ], 160 | "source": [ 161 | "#Print an example variant effect prediction for a SNP-gene pair (polyadenylation)\n", 162 | "\n", 163 | "sed_h5 = h5py.File('snp_polya/f3c0/sed.h5', 'r')\n", 164 | "\n", 165 | "row_ix = 47\n", 166 | "target_ix = 100\n", 167 | "\n", 168 | "print(\"score: 'COVR', snp: '\" + str(sed_h5['snp'][sed_h5['si'][row_ix]].decode()) + \"', gene: '\" + str(sed_h5['gene'][sed_h5['si'][row_ix]].decode()) + \"', track: '\" + str(sed_h5['target_labels'][target_ix].decode()) + \"' => \" + str(round(sed_h5['COVR'][row_ix, target_ix], 4)))\n" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "id": "0ba23572", 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [] 178 | } 179 | ], 180 | "metadata": { 181 | "kernelspec": { 182 | "display_name": "Python 3 (ipykernel)", 183 | "language": "python", 184 | "name": "python3" 185 | }, 186 | "language_info": { 187 | "codemirror_mode": { 188 | "name": "ipython", 189 | "version": 3 190 | }, 191 | "file_extension": ".py", 192 | "mimetype": "text/x-python", 193 | "name": "python", 194 | "nbconvert_exporter": "python", 195 | "pygments_lexer": "ipython3", 196 | "version": "3.8.15" 197 | } 198 | }, 199 | "nbformat": 4, 200 | "nbformat_minor": 5 201 | } 202 | -------------------------------------------------------------------------------- /tutorials/legacy/score_variants/score_expr_sad.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | mkdir -p snp_sad/f3c0 4 | 5 | borzoi_sad.py -o snp_sad/f3c0 --rc --stats logD2 -u -t ../../../examples/targets_human.txt ../../../examples/params_pred.json ../../../examples/saved_models/f3c0/train/model0_best.h5 snps_expr.vcf 6 | -------------------------------------------------------------------------------- /tutorials/legacy/score_variants/score_expr_sed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | mkdir -p snp_sed/f3c0 4 | 5 | borzoi_sed.py -o snp_sed/f3c0 --rc --stats logSED,logD2 -u -t ../../../examples/targets_gtex.txt ../../../examples/params_pred.json ../../../examples/saved_models/f3c0/train/model0_best.h5 snps_expr.vcf 6 | 
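In addition to the notebook, a small, hypothetical helper like the following can be used for a quick look at any of the .h5 files written by the scoring scripts; the dataset names ('target_labels' and the per-statistic score arrays of shape rows x tracks) follow those used in 'run_variant_scripts.ipynb', and the helper itself is not part of the repository.

```python
import h5py
import numpy as np

# Hypothetical quick-look helper for the variant score files produced above.
def summarize_scores(h5_path, score_key):
    with h5py.File(h5_path, "r") as h5:
        scores = h5[score_key][:]  # (snp or snp-gene rows) x (tracks)
        print(h5_path, "datasets:", sorted(h5.keys()))
        print(score_key, "shape:", scores.shape)
        # track with the largest mean absolute effect, as a quick sanity check
        best = int(np.argmax(np.abs(scores).mean(axis=0)))
        print("largest mean |%s| in track: %s" % (score_key, h5["target_labels"][best].decode()))

summarize_scores("snp_sed/f3c0/sed.h5", "logSED")
summarize_scores("snp_sad/f3c0/sad.h5", "logD2")
```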
-------------------------------------------------------------------------------- /tutorials/legacy/score_variants/score_polya.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | mkdir -p snp_polya/f3c0 4 | 5 | borzoi_sed_paqtl_cov.py -o snp_polya/f3c0 --rc --stats COVR -u -t ../../../examples/targets_rna.txt ../../../examples/params_pred.json ../../../examples/saved_models/f3c0/train/model0_best.h5 snps_polya.vcf 6 | -------------------------------------------------------------------------------- /tutorials/legacy/score_variants/score_splice.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | mkdir -p snp_splice/f3c0 4 | 5 | borzoi_sed.py -o snp_splice/f3c0 --span --no_untransform --rc --stats nDi -u -t ../../../examples/targets_rna.txt ../../../examples/params_pred.json ../../../examples/saved_models/f3c0/train/model0_best.h5 snps_splice.vcf 6 | -------------------------------------------------------------------------------- /tutorials/legacy/score_variants/snps_expr.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.2 2 | chr1 43110773 chr1_43110773_G_A_b38 G A . . 3 | chr1 43120331 chr1_43120331_C_T_b38 C T . . 4 | chr1 46309111 chr1_46309111_A_G_b38 A G . . 5 | chr1 52632886 chr1_52632886_A_C_b38 A C . . 6 | chr1 54053434 chr1_54053434_G_A_b38 G A . . 7 | -------------------------------------------------------------------------------- /tutorials/legacy/score_variants/snps_polya.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.2 2 | ##INFO= 3 | ##INFO= 4 | ##INFO= 5 | #CHROM POS ID REF ALT QUAL FILTER INFO 6 | chr1 11790946 chr1_11790946_G_C G C . . MT=ENSG00000177000.grp_2.downstream.ENST00000641805;PD=924;PI=chr1_11790946_G_C 7 | chr1 150160094 chr1_150160094_C_G C G . . MT=ENSG00000023902.grp_1.downstream.ENST00000369126;PD=29;PI=chr1_150160094_C_G 8 | chr16 57665101 chr16_57665101_A_G A G . . MT=ENSG00000205336.grp_1.downstream.ENST00000568908;PD=73;PI=chr16_57665101_A_G 9 | chr16 80976052 chr16_80976052_T_G T G . . MT=ENSG00000103121.grp_2.downstream.ENST00000565925;PD=24;PI=chr16_80976052_T_G 10 | chr16 88857261 chr16_88857261_T_C T C . . MT=ENSG00000167515.grp_2.downstream.ENST00000564547;PD=3851;PI=chr16_88857261_T_C -------------------------------------------------------------------------------- /tutorials/legacy/score_variants/snps_splice.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.2 2 | ##INFO= 3 | ##INFO= 4 | ##INFO= 5 | #CHROM POS ID REF ALT QUAL FILTER INFO 6 | chr1 1665061 chr1_1665061_C_T C T . . MT=ENSG00000189339.grp_2.contained.ENST00000611123;SD=959;PI=chr1_1665061_C_T 7 | chr1 1689221 chr1_1689221_G_A G A . . MT=ENSG00000189339.grp_1.contained.ENST00000614300;SD=1753;PI=chr1_1689221_G_A 8 | chr1 50655526 chr1_50655526_T_C T C . . MT=ENSG00000185104.grp_2.contained.ENST00000396153;SD=3;PI=chr1_50655526_T_C 9 | chr1 109489368 chr1_109489368_C_G C G . . MT=ENSG00000143537.grp_2.contained.ENST00000360674;SD=1;PI=chr1_155060832_G_A 10 | chr1 156236330 chr1_156236330_G_A G A . . 
MT=ENSG00000160783.grp_1.contained.ENST00000368279;SD=17;PI=chr1_156236330_G_A 11 | -------------------------------------------------------------------------------- /tutorials/legacy/train_model/README.md: -------------------------------------------------------------------------------- 1 | ## Model Training 2 | 3 | This tutorial describes how to train smaller Borzoi models on the example RNA-seq experiment processed in the [make_data tutorial](https://github.com/calico/borzoi/tree/main/tutorials/legacy/make_data). 4 | 5 | To train a 'Mini Borzoi' ensemble (~40M parameters, 2 cross-validation folds), run the script 'train_mini.sh'. The model parameters are specified in 'params_mini.json'. This model can be trained with a batch size of 2 on a 24GB NVIDIA Titan RTX or RTX4090 GPU. 6 | ```sh 7 | conda activate borzoi_py310 8 | cd ~/borzoi/tutorials/legacy/train_model 9 | ./train_mini.sh 10 | ``` 11 | 12 | Alternatively, to train an even smaller 'Micro Borzoi' ensemble (~5M parameters), run the script 'train_micro.sh'. This model fits on the above GPU cards with a batch size of 4, which means the learning rate can be doubled and each epoch finishes in half the time. 13 | ```sh 14 | ./train_micro.sh 15 | ``` 16 | 17 | *Notes*: 18 | - See [here](https://github.com/calico/borzoi-paper/tree/main/model) for a description of the scripts called internally by the training .sh script. 19 | - The **legacy** model crops the predicted tracks (see layer 'Cropping1D' in the parameters file). In this example, the input sequence has length 393216 bp, and the cropping layer removes 3072 bins of 32 bp from each side, resulting in 6144 bins (see the arithmetic sketch below). 20 | - In the **legacy** architecture, there is an extra (superfluous) linear convolution applied in each 'unet_conv' layer (see the bool 'upsample_conv' in the parameters file). This additional convolution has since been removed.
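As a quick sanity check on the cropping arithmetic in the note above, the following snippet (a hypothetical helper, not part of the repository) reproduces the 6144-bin figure from the values in the parameter files:

```python
# Verify that a 393,216 bp input at 32 bp per bin, cropped by 3,072 bins on
# each side, yields 6,144 predicted bins (matching the 98,304 bp crop used
# in the make_data tutorial: 98304 / 32 = 3072).
seq_length = 393216  # "seq_length" in params_micro.json / params_mini.json
bin_width = 32       # pooling width used when creating the TFRecords
crop_bins = 3072     # "cropping" value of the 'Cropping1D' layer

total_bins = seq_length // bin_width      # 12288 bins before cropping
output_bins = total_bins - 2 * crop_bins  # 6144 bins after cropping
assert output_bins == 6144
print(total_bins, output_bins)
```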
21 | -------------------------------------------------------------------------------- /tutorials/legacy/train_model/params_micro.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "batch_size": 4, 4 | "shuffle_buffer": 256, 5 | "optimizer": "adam", 6 | "learning_rate": 0.0002, 7 | "loss": "poisson_mn", 8 | "total_weight": 0.2, 9 | "warmup_steps": 10000, 10 | "global_clipnorm": 0.2, 11 | "adam_beta1": 0.9, 12 | "adam_beta2": 0.999, 13 | "patience": 30, 14 | "train_epochs_min": 130, 15 | "train_epochs_max": 180 16 | }, 17 | "model": { 18 | "seq_length": 393216, 19 | "augment_rc": true, 20 | "augment_shift": 3, 21 | "activation": "gelu", 22 | "norm_type": "batch", 23 | "bn_momentum": 0.9, 24 | "kernel_initializer": "lecun_normal", 25 | "l2_scale": 1.0e-6, 26 | "trunk": [ 27 | { 28 | "name": "conv_dna", 29 | "filters": 128, 30 | "kernel_size": 11, 31 | "norm_type": null, 32 | "activation": "linear", 33 | "pool_size": 2 34 | }, 35 | { 36 | "name": "res_tower", 37 | "filters_init": 160, 38 | "filters_end": 320, 39 | "divisible_by": 8, 40 | "kernel_size": 5, 41 | "num_convs": 1, 42 | "pool_size": 2, 43 | "repeat": 6 44 | }, 45 | { 46 | "name": "transformer_tower", 47 | "key_size": 32, 48 | "heads": 4, 49 | "num_position_features": 32, 50 | "dropout": 0.1, 51 | "attention_dropout": 0.01, 52 | "mha_l2_scale": 1.0e-8, 53 | "l2_scale": 1.0e-8, 54 | "kernel_initializer": "he_normal", 55 | "repeat": 4 56 | }, 57 | { 58 | "name": "unet_conv", 59 | "kernel_size": 3, 60 | "upsample_conv": true 61 | }, 62 | { 63 | "name": "unet_conv", 64 | "kernel_size": 3, 65 | "upsample_conv": true 66 | }, 67 | { 68 | "name": "Cropping1D", 69 | "cropping": 3072 70 | } 71 | ], 72 | "head_human": { 73 | "name": "final", 74 | "units": 2, 75 | "activation": "softplus" 76 | } 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /tutorials/legacy/train_model/params_mini.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "batch_size": 2, 4 | "shuffle_buffer": 256, 5 | "optimizer": "adam", 6 | "learning_rate": 0.0001, 7 | "loss": "poisson_mn", 8 | "total_weight": 0.2, 9 | "warmup_steps": 20000, 10 | "global_clipnorm": 0.1, 11 | "adam_beta1": 0.9, 12 | "adam_beta2": 0.999, 13 | "patience": 30, 14 | "train_epochs_min": 130, 15 | "train_epochs_max": 180 16 | }, 17 | "model": { 18 | "seq_length": 393216, 19 | "augment_rc": true, 20 | "augment_shift": 3, 21 | "activation": "gelu", 22 | "norm_type": "batch", 23 | "bn_momentum": 0.9, 24 | "kernel_initializer": "lecun_normal", 25 | "l2_scale": 1.0e-6, 26 | "trunk": [ 27 | { 28 | "name": "conv_dna", 29 | "filters": 320, 30 | "kernel_size": 11, 31 | "norm_type": null, 32 | "activation": "linear", 33 | "pool_size": 2 34 | }, 35 | { 36 | "name": "res_tower", 37 | "filters_init": 384, 38 | "filters_end": 768, 39 | "divisible_by": 16, 40 | "kernel_size": 5, 41 | "num_convs": 1, 42 | "pool_size": 2, 43 | "repeat": 6 44 | }, 45 | { 46 | "name": "transformer_tower", 47 | "key_size": 64, 48 | "heads": 4, 49 | "num_position_features": 32, 50 | "dropout": 0.2, 51 | "mha_l2_scale": 1.0e-8, 52 | "l2_scale": 1.0e-8, 53 | "kernel_initializer": "he_normal", 54 | "repeat": 8 55 | }, 56 | { 57 | "name": "unet_conv", 58 | "kernel_size": 3, 59 | "upsample_conv": true 60 | }, 61 | { 62 | "name": "unet_conv", 63 | "kernel_size": 3, 64 | "upsample_conv": true 65 | }, 66 | { 67 | "name": "Cropping1D", 68 | "cropping": 3072 69 | } 70 | ], 71 | 
"head_human": { 72 | "name": "final", 73 | "units": 2, 74 | "activation": "softplus" 75 | } 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /tutorials/legacy/train_model/train_micro.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | westminster_train_folds.py -e borzoi_py310 -f 2 -c 1 -q rtx4090 -o micro_models params_micro.json ../make_data/data/hg38 4 | -------------------------------------------------------------------------------- /tutorials/legacy/train_model/train_mini.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | westminster_train_folds.py -e borzoi_py310 -f 2 -c 1 -q rtx4090 -o mini_models params_mini.json ../make_data/data/hg38 4 | --------------------------------------------------------------------------------