├── .github ├── CODEOWNERS ├── pull_request_template.md └── workflows │ ├── python-black.yml │ └── python-pytest.yml ├── .gitignore ├── LICENSE ├── README.md ├── borzoi_logo.png ├── data ├── sequences_human.bed.gz ├── sequences_mouse.bed.gz ├── targets_human.txt.gz └── targets_mouse.txt.gz ├── download_models.sh ├── env_vars.sh ├── examples ├── .gitignore ├── CD99_example.gtf ├── CFHR2_example.gtf ├── GCFC2_example.gtf ├── borzoi_example_eqtl_chr10_116952944_T_C.ipynb ├── borzoi_example_eqtl_chr10_116952944_T_C_fancy.ipynb ├── borzoi_example_ipaqtl_chr10_116664061_G_A.ipynb ├── borzoi_example_paqtl_chr1_236763042_A_G.ipynb ├── borzoi_example_paqtl_chr1_236763042_A_G_fancy.ipynb ├── borzoi_example_sqtl_chr9_135548708_G_C.ipynb ├── borzoi_helpers.py ├── params.json ├── params_pred.json ├── targets_gtex.txt ├── targets_gtex_liver.txt ├── targets_human.txt ├── targets_mouse.txt └── targets_rna.txt ├── pyproject.toml ├── src ├── __init__.py ├── borzoi │ ├── __init__.py │ ├── helpers │ │ └── h5_grad_utils.py │ └── scripts │ │ └── borzoi_satg_gene_gpu.py ├── notebooks │ └── borzoi_snp.ipynb ├── scripts │ ├── _archive │ │ ├── borzoi_bench_crispr.py │ │ ├── borzoi_bench_crispr_folds.py │ │ ├── borzoi_bench_flowfish_folds.py │ │ ├── borzoi_bench_gasperini_folds.py │ │ ├── borzoi_satg_gene.py │ │ └── borzoi_satg_gene_multi.py │ ├── borzoi_bench_classify.py │ ├── borzoi_bench_gtex_folds_sad.py │ ├── borzoi_bench_gtex_folds_sed.py │ ├── borzoi_bench_ipaqtl_folds.py │ ├── borzoi_bench_paqtl_folds.py │ ├── borzoi_bench_sqtl_folds.py │ ├── borzoi_bench_trip_folds.py │ ├── borzoi_gtex_coef_sad.py │ ├── borzoi_gtex_coef_sed.py │ ├── borzoi_sad.py │ ├── borzoi_sad_folds.py │ ├── borzoi_satg_gene.py │ ├── borzoi_satg_gene_crispr_ism_shuffle.py │ ├── borzoi_satg_gene_focused_ism.py │ ├── borzoi_satg_polya.py │ ├── borzoi_satg_splice.py │ ├── borzoi_sed.py │ ├── borzoi_sed_folds.py │ ├── borzoi_sed_ipaqtl_cov.py │ ├── borzoi_sed_paqtl_cov.py │ ├── borzoi_test_apa.py │ ├── borzoi_test_apa_folds.py │ ├── borzoi_test_exons.py │ ├── borzoi_test_exons_folds.py │ ├── borzoi_test_genes.py │ ├── borzoi_test_genes_folds.py │ ├── borzoi_test_tss.py │ ├── borzoi_test_tss_folds.py │ ├── borzoi_tfmodisco.py │ ├── borzoi_tfmodisco_diff.py │ ├── borzoi_trip.py │ ├── bw_h5.py │ ├── idx_genome.py │ ├── pygene.py │ ├── slurm.py │ ├── util.py │ ├── w5_merge.py │ └── w5_qc.py └── tests │ ├── __init__.py │ └── test_dummy.py └── tutorials ├── latest ├── analyze_sv │ ├── README.md │ ├── analyze_indel.sh │ ├── analyze_vcf.py │ ├── data │ │ ├── STR.csv │ │ └── chr6_41897087_SV.vcf │ ├── download_dependencies_STR.sh │ ├── download_dependencies_SV.sh │ ├── save_STR_vcf.py │ ├── score_STRs.sh │ ├── score_tandem_repeats.py │ └── utils.py ├── interpret_sequence │ ├── HBE1_example.gtf │ ├── README.md │ ├── explore_grads_k562_HBE1.ipynb │ ├── run_gradients_expr_HBE1.sh │ └── vis_helpers.py ├── make_data │ ├── Makefile │ ├── README.md │ ├── download_bw.sh │ ├── download_dependencies.sh │ ├── process_w5.sh │ └── targets_human.txt ├── score_variants │ ├── README.md │ ├── run_variant_scripts.ipynb │ ├── score_expr_sad.sh │ ├── score_expr_sed.sh │ ├── score_polya.sh │ ├── score_splice.sh │ ├── snps_expr.vcf │ ├── snps_polya.vcf │ └── snps_splice.vcf └── train_model │ ├── README.md │ ├── params_micro.json │ ├── params_mini.json │ ├── train_micro.sh │ └── train_mini.sh └── legacy ├── interpret_sequence ├── README.md ├── explore_grads_liver_CFHR2.ipynb ├── explore_polya_grads_CD99.ipynb ├── explore_splice_grads_GCFC2.ipynb ├── 
run_gradients_expr_CFHR2.sh ├── run_gradients_polya_CD99.sh ├── run_gradients_splice_GCFC2.sh └── vis_helpers.py ├── make_data ├── Makefile ├── README.md ├── download_bw.sh ├── download_dependencies.sh ├── process_w5.sh └── targets_human.txt ├── score_variants ├── README.md ├── run_variant_scripts.ipynb ├── score_expr_sad.sh ├── score_expr_sed.sh ├── score_polya.sh ├── score_splice.sh ├── snps_expr.vcf ├── snps_polya.vcf └── snps_splice.vcf └── train_model ├── README.md ├── params_micro.json ├── params_mini.json ├── train_micro.sh └── train_mini.sh /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # By default, PRs for this repo are automatically reviewed by: 2 | * @calico/sweng-dev @calico/data-eng-dev 3 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | ### Description of your changes 2 | 3 | 4 | 5 | ### Issue ticket number and link 6 | 7 | 8 | 9 | ### Type of change 10 | 11 | - [ ] Bug fix (non-breaking change which fixes an issue) 12 | - [ ] New feature (non-breaking change which adds functionality) 13 | - [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected) 14 | - [ ] Documentation add / update 15 | 16 | ### (If applicable) How has this been tested? 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /.github/workflows/python-black.yml: -------------------------------------------------------------------------------- 1 | name: Validate Black Formatting 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | paths: 7 | - "**.py" 8 | 9 | jobs: 10 | format: 11 | runs-on: ubuntu-20.04 12 | 13 | steps: 14 | - name: Checkout base repo 15 | uses: actions/checkout@v3 16 | 17 | - name: Set up Python 3.11 18 | uses: actions/setup-python@v2 19 | with: 20 | python-version: "3.11" 21 | 22 | - name: Install dependencies 23 | run: python3 -m pip install black~=22.3.0 24 | 25 | - name: Check Black formatting 26 | run: black --check . 
27 | -------------------------------------------------------------------------------- /.github/workflows/python-pytest.yml: -------------------------------------------------------------------------------- 1 | name: Install Requirements and Run Pytest 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | paths: 7 | - "**.py" 8 | 9 | jobs: 10 | validate: 11 | runs-on: ubuntu-20.04 12 | strategy: 13 | matrix: 14 | python-version: ["3.9", "3.10"] 15 | 16 | steps: 17 | - name: Checkout base repo 18 | uses: actions/checkout@v3 19 | 20 | - name: Set up Python ${{ matrix.python-version }} 21 | uses: actions/setup-python@v4 22 | with: 23 | python-version: ${{ matrix.python-version }} 24 | 25 | - name: Install dependencies 26 | run: | 27 | python3 -m pip install --upgrade pip 28 | python3 -m pip install '.[dev]' 29 | 30 | - name: Run pytest 31 | run: python -m pytest 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # JetBrains IDE files 2 | **/.idea* 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | pip-wheel-metadata/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g.
github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | 3 | Version 2.0, January 2004 4 | 5 | http://www.apache.org/licenses/ 6 | 7 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 8 | 9 | 1. Definitions. 10 | 11 | "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. 16 | 17 | "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. 18 | 19 | "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. 20 | 21 | "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. 22 | 23 | "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). 24 | 25 | "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. 26 | 27 | "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. 
For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." 28 | 29 | "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 30 | 31 | 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 32 | 33 | 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 34 | 35 | 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: 36 | 37 | You must give any other recipients of the Work or Derivative Works a copy of this License; and 38 | You must cause any modified files to carry prominent notices stating that You changed the files; and 39 | You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and 40 | If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. 
You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. 41 | 42 | You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 43 | 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 44 | 45 | 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 46 | 47 | 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 48 | 49 | 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 50 | 51 | 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. 
52 | 53 | END OF TERMS AND CONDITIONS 54 | -------------------------------------------------------------------------------- /borzoi_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calico/borzoi/77670d9b0f940ff0a4b2b0edbdff445bbaefc6ed/borzoi_logo.png -------------------------------------------------------------------------------- /data/sequences_human.bed.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calico/borzoi/77670d9b0f940ff0a4b2b0edbdff445bbaefc6ed/data/sequences_human.bed.gz -------------------------------------------------------------------------------- /data/sequences_mouse.bed.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calico/borzoi/77670d9b0f940ff0a4b2b0edbdff445bbaefc6ed/data/sequences_mouse.bed.gz -------------------------------------------------------------------------------- /data/targets_human.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calico/borzoi/77670d9b0f940ff0a4b2b0edbdff445bbaefc6ed/data/targets_human.txt.gz -------------------------------------------------------------------------------- /data/targets_mouse.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calico/borzoi/77670d9b0f940ff0a4b2b0edbdff445bbaefc6ed/data/targets_mouse.txt.gz -------------------------------------------------------------------------------- /download_models.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # download model weights (data fold 3, 4 replicates) 4 | for rep in f3c0,f0 f3c1,f1 f3c2,f2 f3c3,f3; do IFS=","; set -- $rep; 5 | mkdir -p "examples/saved_models/$1/train" 6 | local_model="examples/saved_models/$1/train/model0_best.h5" 7 | if [ -f "$local_model" ]; then 8 | echo "$1 model already exists." 9 | else 10 | wget --progress=bar:force "https://storage.googleapis.com/seqnn-share/borzoi/$2/model0_best.h5" -O "$local_model" 11 | fi 12 | done 13 | 14 | # download and uncompress annotation files 15 | mkdir -p examples/hg38/genes/gencode41 16 | mkdir -p examples/hg38/genes/polyadb 17 | 18 | if [ -f examples/hg38/genes/gencode41/gencode41_basic_nort.gtf ]; then 19 | echo "Gene annotation already exists." 20 | else 21 | wget -O - https://storage.googleapis.com/seqnn-share/helper/gencode41_basic_nort.gtf.gz | gunzip -c > examples/hg38/genes/gencode41/gencode41_basic_nort.gtf 22 | fi 23 | 24 | if [ -f examples/hg38/genes/gencode41/gencode41_basic_nort_protein.gtf ]; then 25 | echo "Gene annotation (no read-through, protein-coding) already exists." 26 | else 27 | wget -O - https://storage.googleapis.com/seqnn-share/helper/gencode41_basic_nort_protein.gtf.gz | gunzip -c > examples/hg38/genes/gencode41/gencode41_basic_nort_protein.gtf 28 | fi 29 | 30 | if [ -f examples/hg38/genes/gencode41/gencode41_basic_protein.gtf ]; then 31 | echo "Gene annotation (protein-coding) already exists." 32 | else 33 | wget -O - https://storage.googleapis.com/seqnn-share/helper/gencode41_basic_protein.gtf.gz | gunzip -c > examples/hg38/genes/gencode41/gencode41_basic_protein.gtf 34 | fi 35 | 36 | if [ -f examples/hg38/genes/gencode41/gencode41_basic_tss2.bed ]; then 37 | echo "TSS annotation already exists."
38 | else 39 | wget -O - https://storage.googleapis.com/seqnn-share/helper/gencode41_basic_tss2.bed.gz | gunzip -c > examples/hg38/genes/gencode41/gencode41_basic_tss2.bed 40 | fi 41 | 42 | if [ -f examples/hg38/genes/gencode41/gencode41_basic_protein_splice.csv.gz ]; then 43 | echo "Splice site annotation already exists." 44 | else 45 | wget https://storage.googleapis.com/seqnn-share/helper/gencode41_basic_protein_splice.csv.gz -O examples/hg38/genes/gencode41/gencode41_basic_protein_splice.csv.gz 46 | fi 47 | 48 | if [ -f examples/hg38/genes/gencode41/gencode41_basic_protein_splice.gff ]; then 49 | echo "Splice site annotation already exists." 50 | else 51 | wget -O - https://storage.googleapis.com/seqnn-share/helper/gencode41_basic_protein_splice.gff.gz | gunzip -c > examples/hg38/genes/gencode41/gencode41_basic_protein_splice.gff 52 | fi 53 | 54 | if [ -f examples/hg38/genes/polyadb/polyadb_human_v3.csv.gz ]; then 55 | echo "PolyA site annotation already exists." 56 | else 57 | wget https://storage.googleapis.com/seqnn-share/helper/polyadb_human_v3.csv.gz -O examples/hg38/genes/polyadb/polyadb_human_v3.csv.gz 58 | fi 59 | 60 | # download and index hg38 genome 61 | mkdir -p examples/hg38/assembly/ucsc 62 | 63 | if [ -f examples/hg38/assembly/ucsc/hg38.fa ]; then 64 | echo "Human genome FASTA already exists." 65 | else 66 | wget -O - http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz | gunzip -c > examples/hg38/assembly/ucsc/hg38.fa 67 | python src/scripts/idx_genome.py examples/hg38/assembly/ucsc/hg38.fa 68 | fi 69 | -------------------------------------------------------------------------------- /env_vars.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # set these variables before running the script 4 | LOCAL_BORZOI_PATH="/home/jlinder/borzoi" 5 | LOCAL_CONDA_PATH="/home/jlinder/anaconda3/etc/profile.d/conda.sh" 6 | 7 | # create env_vars sh scripts in local conda env 8 | mkdir -p "$CONDA_PREFIX/etc/conda/activate.d" 9 | mkdir -p "$CONDA_PREFIX/etc/conda/deactivate.d" 10 | 11 | file_vars_act="$CONDA_PREFIX/etc/conda/activate.d/env_vars.sh" 12 | if ! [ -e $file_vars_act ]; then 13 | echo '#!/bin/sh' > $file_vars_act 14 | fi 15 | 16 | file_vars_deact="$CONDA_PREFIX/etc/conda/deactivate.d/env_vars.sh" 17 | if !
[ -e $file_vars_deact ]; then 18 | echo '#!/bin/sh' > $file_vars_deact 19 | fi 20 | 21 | # append env variable exports to /activate.d/env_vars.sh 22 | echo "export BORZOI_DIR=$LOCAL_BORZOI_PATH" >> $file_vars_act 23 | echo 'export PATH=$BORZOI_DIR/src/scripts:$PATH' >> $file_vars_act 24 | echo 'export PYTHONPATH=$BORZOI_DIR/src/scripts:$PYTHONPATH' >> $file_vars_act 25 | 26 | echo 'export BORZOI_HG38=$BORZOI_DIR/examples/hg38' >> $file_vars_act 27 | echo 'export BORZOI_MM10=$BORZOI_DIR/examples/mm10' >> $file_vars_act 28 | 29 | echo "export BORZOI_CONDA=$LOCAL_CONDA_PATH" >> $file_vars_act 30 | 31 | # append env variable unsets to /deactivate.d/env_vars.sh 32 | echo 'unset BORZOI_DIR' >> $file_vars_deact 33 | echo 'unset BORZOI_HG38' >> $file_vars_deact 34 | echo 'unset BORZOI_MM10' >> $file_vars_deact 35 | echo 'unset BORZOI_CONDA' >> $file_vars_deact 36 | 37 | # finally activate env variables 38 | source $file_vars_act 39 | -------------------------------------------------------------------------------- /examples/.gitignore: -------------------------------------------------------------------------------- 1 | gencode41_basic* 2 | hg38.fa* 3 | polyadb_human_v3.csv.gz 4 | saved_models/ 5 | .virtual_documents/ 6 | *.eps 7 | *.png 8 | -------------------------------------------------------------------------------- /examples/CFHR2_example.gtf: -------------------------------------------------------------------------------- 1 | chr1 HAVANA transcript 196943738 196959622 . + . gene_id "ENSG00000080910.14"; transcript_id "ENST00000367415.8"; gene_type "protein_coding"; gene_name "CFHR2"; transcript_type "protein_coding"; transcript_name "CFHR2-201"; level 2; protein_id "ENSP00000356385.4"; transcript_support_level "1"; hgnc_id "HGNC:4890"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; tag "appris_principal_2"; tag "CCDS"; ccdsid "CCDS30959.1"; havana_gene "OTTHUMG00000036518.4"; havana_transcript "OTTHUMT00000088815.4"; 2 | chr1 HAVANA exon 196943738 196943938 . + . gene_id "ENSG00000080910.14"; transcript_id "ENST00000367415.8"; gene_type "protein_coding"; gene_name "CFHR2"; transcript_type "protein_coding"; transcript_name "CFHR2-201"; exon_number 1; exon_id "ENSE00001920108.3"; level 2; protein_id "ENSP00000356385.4"; transcript_support_level "1"; hgnc_id "HGNC:4890"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; tag "appris_principal_2"; tag "CCDS"; ccdsid "CCDS30959.1"; havana_gene "OTTHUMG00000036518.4"; havana_transcript "OTTHUMT00000088815.4"; 3 | chr1 HAVANA CDS 196943881 196943938 . + 0 gene_id "ENSG00000080910.14"; transcript_id "ENST00000367415.8"; gene_type "protein_coding"; gene_name "CFHR2"; transcript_type "protein_coding"; transcript_name "CFHR2-201"; exon_number 1; exon_id "ENSE00001920108.3"; level 2; protein_id "ENSP00000356385.4"; transcript_support_level "1"; hgnc_id "HGNC:4890"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; tag "appris_principal_2"; tag "CCDS"; ccdsid "CCDS30959.1"; havana_gene "OTTHUMG00000036518.4"; havana_transcript "OTTHUMT00000088815.4"; 4 | chr1 HAVANA start_codon 196943881 196943883 . 
+ 0 gene_id "ENSG00000080910.14"; transcript_id "ENST00000367415.8"; gene_type "protein_coding"; gene_name "CFHR2"; transcript_type "protein_coding"; transcript_name "CFHR2-201"; exon_number 1; exon_id "ENSE00001920108.3"; level 2; protein_id "ENSP00000356385.4"; transcript_support_level "1"; hgnc_id "HGNC:4890"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; tag "appris_principal_2"; tag "CCDS"; ccdsid "CCDS30959.1"; havana_gene "OTTHUMG00000036518.4"; havana_transcript "OTTHUMT00000088815.4"; 5 | chr1 HAVANA exon 196949455 196949649 . + . gene_id "ENSG00000080910.14"; transcript_id "ENST00000367415.8"; gene_type "protein_coding"; gene_name "CFHR2"; transcript_type "protein_coding"; transcript_name "CFHR2-201"; exon_number 2; exon_id "ENSE00003745979.1"; level 2; protein_id "ENSP00000356385.4"; transcript_support_level "1"; hgnc_id "HGNC:4890"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; tag "appris_principal_2"; tag "CCDS"; ccdsid "CCDS30959.1"; havana_gene "OTTHUMG00000036518.4"; havana_transcript "OTTHUMT00000088815.4"; 6 | chr1 HAVANA CDS 196949455 196949649 . + 2 gene_id "ENSG00000080910.14"; transcript_id "ENST00000367415.8"; gene_type "protein_coding"; gene_name "CFHR2"; transcript_type "protein_coding"; transcript_name "CFHR2-201"; exon_number 2; exon_id "ENSE00003745979.1"; level 2; protein_id "ENSP00000356385.4"; transcript_support_level "1"; hgnc_id "HGNC:4890"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; tag "appris_principal_2"; tag "CCDS"; ccdsid "CCDS30959.1"; havana_gene "OTTHUMG00000036518.4"; havana_transcript "OTTHUMT00000088815.4"; 7 | chr1 HAVANA exon 196950852 196951028 . + . gene_id "ENSG00000080910.14"; transcript_id "ENST00000367415.8"; gene_type "protein_coding"; gene_name "CFHR2"; transcript_type "protein_coding"; transcript_name "CFHR2-201"; exon_number 3; exon_id "ENSE00003831930.1"; level 2; protein_id "ENSP00000356385.4"; transcript_support_level "1"; hgnc_id "HGNC:4890"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; tag "appris_principal_2"; tag "CCDS"; ccdsid "CCDS30959.1"; havana_gene "OTTHUMG00000036518.4"; havana_transcript "OTTHUMT00000088815.4"; 8 | chr1 HAVANA CDS 196950852 196951028 . + 2 gene_id "ENSG00000080910.14"; transcript_id "ENST00000367415.8"; gene_type "protein_coding"; gene_name "CFHR2"; transcript_type "protein_coding"; transcript_name "CFHR2-201"; exon_number 3; exon_id "ENSE00003831930.1"; level 2; protein_id "ENSP00000356385.4"; transcript_support_level "1"; hgnc_id "HGNC:4890"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; tag "appris_principal_2"; tag "CCDS"; ccdsid "CCDS30959.1"; havana_gene "OTTHUMG00000036518.4"; havana_transcript "OTTHUMT00000088815.4"; 9 | chr1 HAVANA exon 196957891 196958073 . + . gene_id "ENSG00000080910.14"; transcript_id "ENST00000367415.8"; gene_type "protein_coding"; gene_name "CFHR2"; transcript_type "protein_coding"; transcript_name "CFHR2-201"; exon_number 4; exon_id "ENSE00003836915.1"; level 2; protein_id "ENSP00000356385.4"; transcript_support_level "1"; hgnc_id "HGNC:4890"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; tag "appris_principal_2"; tag "CCDS"; ccdsid "CCDS30959.1"; havana_gene "OTTHUMG00000036518.4"; havana_transcript "OTTHUMT00000088815.4"; 10 | chr1 HAVANA CDS 196957891 196958073 . 
+ 2 gene_id "ENSG00000080910.14"; transcript_id "ENST00000367415.8"; gene_type "protein_coding"; gene_name "CFHR2"; transcript_type "protein_coding"; transcript_name "CFHR2-201"; exon_number 4; exon_id "ENSE00003836915.1"; level 2; protein_id "ENSP00000356385.4"; transcript_support_level "1"; hgnc_id "HGNC:4890"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; tag "appris_principal_2"; tag "CCDS"; ccdsid "CCDS30959.1"; havana_gene "OTTHUMG00000036518.4"; havana_transcript "OTTHUMT00000088815.4"; 11 | chr1 HAVANA exon 196958881 196959622 . + . gene_id "ENSG00000080910.14"; transcript_id "ENST00000367415.8"; gene_type "protein_coding"; gene_name "CFHR2"; transcript_type "protein_coding"; transcript_name "CFHR2-201"; exon_number 5; exon_id "ENSE00003843688.1"; level 2; protein_id "ENSP00000356385.4"; transcript_support_level "1"; hgnc_id "HGNC:4890"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; tag "appris_principal_2"; tag "CCDS"; ccdsid "CCDS30959.1"; havana_gene "OTTHUMG00000036518.4"; havana_transcript "OTTHUMT00000088815.4"; 12 | chr1 HAVANA CDS 196958881 196959077 . + 2 gene_id "ENSG00000080910.14"; transcript_id "ENST00000367415.8"; gene_type "protein_coding"; gene_name "CFHR2"; transcript_type "protein_coding"; transcript_name "CFHR2-201"; exon_number 5; exon_id "ENSE00003843688.1"; level 2; protein_id "ENSP00000356385.4"; transcript_support_level "1"; hgnc_id "HGNC:4890"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; tag "appris_principal_2"; tag "CCDS"; ccdsid "CCDS30959.1"; havana_gene "OTTHUMG00000036518.4"; havana_transcript "OTTHUMT00000088815.4"; 13 | chr1 HAVANA stop_codon 196959078 196959080 . + 0 gene_id "ENSG00000080910.14"; transcript_id "ENST00000367415.8"; gene_type "protein_coding"; gene_name "CFHR2"; transcript_type "protein_coding"; transcript_name "CFHR2-201"; exon_number 5; exon_id "ENSE00003843688.1"; level 2; protein_id "ENSP00000356385.4"; transcript_support_level "1"; hgnc_id "HGNC:4890"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; tag "appris_principal_2"; tag "CCDS"; ccdsid "CCDS30959.1"; havana_gene "OTTHUMG00000036518.4"; havana_transcript "OTTHUMT00000088815.4"; 14 | chr1 HAVANA UTR 196943738 196943880 . + . gene_id "ENSG00000080910.14"; transcript_id "ENST00000367415.8"; gene_type "protein_coding"; gene_name "CFHR2"; transcript_type "protein_coding"; transcript_name "CFHR2-201"; exon_number 1; exon_id "ENSE00001920108.3"; level 2; protein_id "ENSP00000356385.4"; transcript_support_level "1"; hgnc_id "HGNC:4890"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; tag "appris_principal_2"; tag "CCDS"; ccdsid "CCDS30959.1"; havana_gene "OTTHUMG00000036518.4"; havana_transcript "OTTHUMT00000088815.4"; 15 | chr1 HAVANA UTR 196959078 196959622 . + . 
gene_id "ENSG00000080910.14"; transcript_id "ENST00000367415.8"; gene_type "protein_coding"; gene_name "CFHR2"; transcript_type "protein_coding"; transcript_name "CFHR2-201"; exon_number 5; exon_id "ENSE00003843688.1"; level 2; protein_id "ENSP00000356385.4"; transcript_support_level "1"; hgnc_id "HGNC:4890"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; tag "appris_principal_2"; tag "CCDS"; ccdsid "CCDS30959.1"; havana_gene "OTTHUMG00000036518.4"; havana_transcript "OTTHUMT00000088815.4"; 16 | -------------------------------------------------------------------------------- /examples/params.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "batch_size": 2, 4 | "shuffle_buffer": 256, 5 | "optimizer": "adam", 6 | "learning_rate": 0.00006, 7 | "loss": "poisson_mn", 8 | "total_weight": 0.2, 9 | "warmup_steps": 20000, 10 | "global_clipnorm": 0.15, 11 | "adam_beta1": 0.9, 12 | "adam_beta2": 0.999, 13 | "patience": 30, 14 | "train_epochs_min": 130, 15 | "train_epochs_max": 180 16 | }, 17 | "model": { 18 | "seq_length": 524288, 19 | "augment_rc": true, 20 | "augment_shift": 3, 21 | "activation": "gelu", 22 | "norm_type": "batch-sync", 23 | "bn_momentum": 0.9, 24 | "kernel_initializer": "lecun_normal", 25 | "l2_scale": 2.0e-8, 26 | "trunk": [ 27 | { 28 | "name": "conv_dna", 29 | "filters": 512, 30 | "kernel_size": 15, 31 | "norm_type": null, 32 | "activation": "linear", 33 | "pool_size": 2 34 | }, 35 | { 36 | "name": "res_tower", 37 | "filters_init": 608, 38 | "filters_end": 1536, 39 | "divisible_by": 32, 40 | "kernel_size": 5, 41 | "num_convs": 1, 42 | "pool_size": 2, 43 | "repeat": 6 44 | }, 45 | { 46 | "name": "transformer_tower", 47 | "key_size": 64, 48 | "heads": 8, 49 | "num_position_features": 32, 50 | "dropout": 0.2, 51 | "mha_l2_scale": 1.0e-8, 52 | "l2_scale": 1.0e-8, 53 | "kernel_initializer": "he_normal", 54 | "repeat": 8 55 | }, 56 | { 57 | "name": "unet_conv", 58 | "kernel_size": 3, 59 | "upsample_conv": true 60 | }, 61 | { 62 | "name": "unet_conv", 63 | "kernel_size": 3, 64 | "upsample_conv": true 65 | }, 66 | { 67 | "name": "Cropping1D", 68 | "cropping": 5120 69 | }, 70 | { 71 | "name": "conv_nac", 72 | "filters": 1920, 73 | "dropout": 0.1 74 | } 75 | ], 76 | "head_human": { 77 | "name": "final", 78 | "units": 7611, 79 | "activation": "softplus" 80 | }, 81 | "head_mouse": { 82 | "name": "final", 83 | "units": 2608, 84 | "activation": "softplus" 85 | } 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /examples/params_pred.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "batch_size": 2, 4 | "shuffle_buffer": 256, 5 | "optimizer": "adam", 6 | "learning_rate": 0.00006, 7 | "loss": "poisson_mn", 8 | "total_weight": 0.2, 9 | "warmup_steps": 20000, 10 | "global_clipnorm": 0.15, 11 | "adam_beta1": 0.9, 12 | "adam_beta2": 0.999, 13 | "patience": 30, 14 | "train_epochs_min": 130, 15 | "train_epochs_max": 180 16 | }, 17 | "model": { 18 | "verbose": false, 19 | "seq_length": 524288, 20 | "augment_rc": true, 21 | "augment_shift": 3, 22 | "activation": "gelu", 23 | "norm_type": "batch-sync", 24 | "bn_momentum": 0.9, 25 | "kernel_initializer": "lecun_normal", 26 | "l2_scale": 2.0e-8, 27 | "trunk": [ 28 | { 29 | "name": "conv_dna", 30 | "filters": 512, 31 | "kernel_size": 15, 32 | "norm_type": null, 33 | "activation": "linear", 34 | "pool_size": 2 35 | }, 36 | { 37 | "name": "res_tower", 38 | "filters_init": 608, 
39 | "filters_end": 1536, 40 | "divisible_by": 32, 41 | "kernel_size": 5, 42 | "num_convs": 1, 43 | "pool_size": 2, 44 | "repeat": 6 45 | }, 46 | { 47 | "name": "transformer_tower", 48 | "key_size": 64, 49 | "heads": 8, 50 | "num_position_features": 32, 51 | "dropout": 0.2, 52 | "mha_l2_scale": 1.0e-8, 53 | "l2_scale": 1.0e-8, 54 | "kernel_initializer": "he_normal", 55 | "repeat": 8 56 | }, 57 | { 58 | "name": "unet_conv", 59 | "kernel_size": 3, 60 | "upsample_conv": true 61 | }, 62 | { 63 | "name": "unet_conv", 64 | "kernel_size": 3, 65 | "upsample_conv": true 66 | }, 67 | { 68 | "name": "Cropping1D", 69 | "cropping": 16 70 | }, 71 | { 72 | "name": "conv_nac", 73 | "filters": 1920, 74 | "dropout": 0.1 75 | } 76 | ], 77 | "head_human": { 78 | "name": "final", 79 | "units": 7611, 80 | "activation": "softplus" 81 | }, 82 | "head_mouse": { 83 | "name": "final", 84 | "units": 2608, 85 | "activation": "softplus" 86 | } 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /examples/targets_gtex_liver.txt: -------------------------------------------------------------------------------- 1 | identifier file clip clip_soft scale sum_stat strand_pair description 2 | 7563 GTEX-11EQ9-0526-SM-5A5JZ.1 /home/drk/tillage/datasets/human/rna/recount3/liver/GTEX-11EQ9-0526-SM-5A5JZ.1/coverage.w5 768 384 0.01 sum_sqrt 7563 RNA:liver 3 | 7564 GTEX-1QP66-0226-SM-DPRXS.1 /home/drk/tillage/datasets/human/rna/recount3/liver/GTEX-1QP66-0226-SM-DPRXS.1/coverage.w5 768 384 0.01 sum_sqrt 7564 RNA:liver 4 | 7565 GTEX-ZYT6-0626-SM-5E45V.1 /home/drk/tillage/datasets/human/rna/recount3/liver/GTEX-ZYT6-0626-SM-5E45V.1/coverage.w5 768 384 0.01 sum_sqrt 7565 RNA:liver 5 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=69.0.3", "setuptools_scm>=8.0.4"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "borzoi" 7 | description = "borzoi" 8 | authors = [ 9 | {name = "David Kelley", email = "drk@calicolabs.com"}, 10 | {name = "Johannes Linder", email = "jlinder@calicolabs.com"} 11 | ] 12 | readme = "README.md" 13 | classifiers = ["License :: OSI Approved :: Apache Software License"] 14 | dynamic = ["version"] 15 | 16 | requires-python = ">=3.9" 17 | dependencies = [ 18 | "h5py~=3.10.0", 19 | "intervaltree~=3.1.0", 20 | "joblib~=1.1.1", 21 | "matplotlib~=3.7.1", 22 | "google-cloud-storage~=2.0.0", 23 | "natsort~=7.1.1", 24 | "networkx~=2.8.4", 25 | "numpy~=1.24.3", 26 | "pandas~=1.5.3", 27 | "pybigwig~=0.3.18", 28 | "pybedtools~=0.10.0", 29 | "pysam~=0.22.0", 30 | "qnorm~=0.8.1", 31 | "seaborn~=0.12.2", 32 | "scikit-learn~=1.2.2", 33 | "scipy~=1.9.1", 34 | "tensorflow~=2.15.0", 35 | "tqdm~=4.65.0", 36 | "pyfaidx~=0.7.1", 37 | "pyranges~=0.0.129", 38 | ] 39 | 40 | [project.optional-dependencies] 41 | dev = [ 42 | "black~=23.12.1", 43 | "pytest~=7.4.4", 44 | "ruff~=0.1.11", 45 | ] 46 | 47 | [project.urls] 48 | Homepage = "https://github.com/calico/borzoi" 49 | 50 | [tool.setuptools_scm] -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calico/borzoi/77670d9b0f940ff0a4b2b0edbdff445bbaefc6ed/src/__init__.py -------------------------------------------------------------------------------- /src/borzoi/__init__.py:
-------------------------------------------------------------------------------- 1 | from importlib.metadata import version, PackageNotFoundError 2 | 3 | __version__ = "0.0.0" 4 | 5 | try: 6 | __version__ = version("calicolabs-$PYTHON_PACKAGE_NAME") 7 | except PackageNotFoundError: 8 | pass 9 | -------------------------------------------------------------------------------- /src/borzoi/helpers/h5_grad_utils.py: -------------------------------------------------------------------------------- 1 | import h5py 2 | import numpy as np 3 | import argparse 4 | import subprocess 5 | import tempfile 6 | import os 7 | from baskerville.helpers.gcs_utils import download_from_gcs, upload_file_gcs 8 | 9 | 10 | def collect_h5_borzoi(out_dir, num_procs, sad_stat) -> None: 11 | h5_file = "scores_f0c0.h5" 12 | 13 | # count sequences 14 | num_seqs = 0 15 | for pi in range(num_procs): 16 | # open job 17 | job_h5_file = "%s/job%d/%s" % (out_dir, pi, h5_file) 18 | job_h5_open = h5py.File(job_h5_file, "r") 19 | num_seqs += job_h5_open[sad_stat].shape[0] 20 | seq_len = job_h5_open[sad_stat].shape[1] 21 | num_targets = job_h5_open[sad_stat].shape[-1] 22 | job_h5_open.close() 23 | 24 | # initialize final h5 25 | final_h5_file = "%s/%s" % (out_dir, h5_file) 26 | final_h5_open = h5py.File(final_h5_file, "w") 27 | 28 | # keep dict for string values 29 | final_strings = {} 30 | 31 | job0_h5_file = "%s/job0/%s" % (out_dir, h5_file) 32 | job0_h5_open = h5py.File(job0_h5_file, "r") 33 | for key in job0_h5_open.keys(): 34 | key_shape = list(job0_h5_open[key].shape) 35 | key_shape[0] = num_seqs 36 | key_shape = tuple(key_shape) 37 | if job0_h5_open[key].dtype.char == "S": 38 | final_strings[key] = [] 39 | else: 40 | final_h5_open.create_dataset( 41 | key, shape=key_shape, dtype=job0_h5_open[key].dtype 42 | ) 43 | 44 | # set values 45 | si = 0 46 | for pi in range(num_procs): 47 | # open job 48 | job_h5_file = "%s/job%d/%s" % (out_dir, pi, h5_file) 49 | job_h5_open = h5py.File(job_h5_file, "r") 50 | 51 | # append to final 52 | for key in job_h5_open.keys(): 53 | job_seqs = job_h5_open[key].shape[0] 54 | if job_h5_open[key].dtype.char == "S": 55 | final_strings[key] += list(job_h5_open[key]) 56 | else: 57 | final_h5_open[key][si : si + job_seqs] = job_h5_open[key] 58 | 59 | job_h5_open.close() 60 | si += job_seqs 61 | 62 | # create final string datasets 63 | for key in final_strings: 64 | final_h5_open.create_dataset(key, data=np.array(final_strings[key], dtype="S")) 65 | 66 | final_h5_open.close() 67 | 68 | 69 | def download_h5_gcs(output_gcs_dir, num_processes) -> str: 70 | temp_dir = tempfile.mkdtemp() # create a temp dir for output 71 | print(f"temp_dir is {temp_dir}") 72 | out_dir = temp_dir + "/" + output_gcs_dir.split("/")[-1] 73 | if not os.path.isdir(out_dir): 74 | os.mkdir(out_dir) 75 | # download each job's output from GCS into the temp dir 76 | for pi in range(num_processes): 77 | if not os.path.isdir(f"{out_dir}/job{pi}"): 78 | os.mkdir(f"{out_dir}/job{pi}") 79 | download_from_gcs( 80 | f"{output_gcs_dir}/job{pi}/scores_f0c0.h5", 81 | f"{out_dir}/job{pi}/scores_f0c0.h5", 82 | ) 83 | print(f"Done downloading {pi} partition") 84 | # download all of the files in the folder 85 | # Use gsutil to copy the contents recursively 86 | # subprocess.check_call(["gsutil", "-m", "cp", "-r", output_gcs_dir, temp_dir]) 87 | print(f"outdir is {out_dir}") 88 | print(f"gcs_out_dir is {output_gcs_dir}") 89 | print(f"Done downloading") 90 | return out_dir 91 | 92 | 93 | def main(): 94 | parser = argparse.ArgumentParser(description="Process and collect h5
files.") 95 | 96 | parser.add_argument( 97 | "out_dir", type=str, help="Output directory for processed data." 98 | ) 99 | parser.add_argument("num_procs", type=int, help="Number of processes to use.") 100 | parser.add_argument("sad_stat", type=str, help="Stats to concatenate, e.g. grads") 101 | parser.add_argument( 102 | "--gcs", 103 | action="store_true", 104 | help="Flag indicating if the file is on Google Cloud Storage.", 105 | ) 106 | 107 | args = parser.parse_args() 108 | if args.gcs: 109 | # download files to tempdir 110 | local_out_dir = download_h5_gcs(args.out_dir, args.num_procs) 111 | collect_h5_borzoi(local_out_dir, args.num_procs, args.sad_stat) 112 | # upload to gcs 113 | print(f"is there such a file? {local_out_dir}/scores_f0c0.h5") 114 | print(os.path.isfile(f"{local_out_dir}/scores_f0c0.h5")) 115 | upload_file_gcs(f"{local_out_dir}/scores_f0c0.h5", args.out_dir) 116 | 117 | 118 | if __name__ == "__main__": 119 | main() 120 | -------------------------------------------------------------------------------- /src/scripts/_archive/borzoi_satg_gene_multi.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2022 Calico LLC 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | 8 | # https://www.apache.org/licenses/LICENSE-2.0 9 | 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ========================================================================= 16 | from optparse import OptionParser 17 | 18 | import os 19 | import pickle 20 | import sys 21 | 22 | import h5py 23 | import numpy as np 24 | 25 | import slurm 26 | 27 | """ 28 | borzoi_satg_gene_multi.py 29 | 30 | Perform a gradient saliency analysis for genes specified in a GTF file, 31 | using multiple processes.
32 | """ 33 | 34 | ################################################################################ 35 | # main 36 | ################################################################################ 37 | def main(): 38 | usage = "usage: %prog [options] " 39 | parser = OptionParser(usage) 40 | 41 | # borzoi_satg_gene.py options 42 | parser.add_option( 43 | "-f", 44 | dest="genome_fasta", 45 | default="%s/assembly/ucsc/hg38.fa" % os.environ["HG38"], 46 | help="Genome FASTA for sequences [Default: %default]", 47 | ) 48 | parser.add_option( 49 | "-o", 50 | dest="out_dir", 51 | default="satg_out", 52 | help="Output directory [Default: %default]", 53 | ) 54 | parser.add_option( 55 | "--rc", 56 | dest="rc", 57 | default=False, 58 | action="store_true", 59 | help="Ensemble forward and reverse complement predictions [Default: %default]", 60 | ) 61 | parser.add_option( 62 | "--shifts", 63 | dest="shifts", 64 | default="0", 65 | type="str", 66 | help="Ensemble prediction shifts [Default: %default]", 67 | ) 68 | parser.add_option( 69 | "--span", 70 | dest="span", 71 | default=False, 72 | action="store_true", 73 | help="Aggregate entire gene span [Default: %default]", 74 | ) 75 | parser.add_option( 76 | "--sum", 77 | dest="sum_targets", 78 | default=False, 79 | action="store_true", 80 | help="Sum targets for single output [Default: %default]", 81 | ) 82 | parser.add_option( 83 | "-t", 84 | dest="targets_file", 85 | default=None, 86 | type="str", 87 | help="File specifying target indexes and labels in table format", 88 | ) 89 | 90 | # _multi.py options 91 | parser.add_option( 92 | "-e", 93 | dest="conda_env", 94 | default="tf28", 95 | help="Anaconda environment [Default: %default]", 96 | ) 97 | parser.add_option( 98 | "--max_proc", 99 | dest="max_proc", 100 | default=None, 101 | type="int", 102 | help="Maximum concurrent processes [Default: %default]", 103 | ) 104 | parser.add_option( 105 | "-n", 106 | dest="name", 107 | default="satg", 108 | help="SLURM job name prefix [Default: %default]", 109 | ) 110 | parser.add_option( 111 | "-p", 112 | dest="processes", 113 | default=None, 114 | type="int", 115 | help="Number of processes, passed by multi script", 116 | ) 117 | parser.add_option( 118 | "-q", 119 | dest="queue", 120 | default="standard", 121 | help="SLURM queue on which to run the jobs [Default: %default]", 122 | ) 123 | parser.add_option( 124 | "-r", 125 | "--restart", 126 | dest="restart", 127 | default=False, 128 | action="store_true", 129 | help="Restart a partially completed job [Default: %default]", 130 | ) 131 | (options, args) = parser.parse_args() 132 | 133 | ####################################################### 134 | # prep work 135 | 136 | # output directory 137 | if not options.restart: 138 | if os.path.isdir(options.out_dir): 139 | print("Please remove %s" % options.out_dir, file=sys.stderr) 140 | exit(1) 141 | os.mkdir(options.out_dir) 142 | 143 | # pickle options 144 | options_pkl_file = "%s/options.pkl" % options.out_dir 145 | options_pkl = open(options_pkl_file, "wb") 146 | pickle.dump(options, options_pkl) 147 | options_pkl.close() 148 | 149 | if options.queue == "standard": 150 | num_gpu = 0 151 | num_cpu = 8 152 | else: 153 | num_gpu = 1 154 | num_cpu = 2 155 | 156 | ####################################################### 157 | # launch worker threads 158 | jobs = [] 159 | for pi in range(options.processes): 160 | if not options.restart or not job_completed(options, pi): 161 | cmd = ". 
/home/drk/anaconda3/etc/profile.d/conda.sh;" 162 | cmd += " conda activate %s;" % options.conda_env 163 | 164 | cmd += " borzoi_satg_gene.py %s %s %d" % ( 165 | options_pkl_file, 166 | " ".join(args), 167 | pi, 168 | ) 169 | name = "%s_p%d" % (options.name, pi) 170 | outf = "%s/job%d.out" % (options.out_dir, pi) 171 | errf = "%s/job%d.err" % (options.out_dir, pi) 172 | j = slurm.Job( 173 | cmd, 174 | name, 175 | outf, 176 | errf, 177 | queue=options.queue, 178 | cpu=num_cpu, 179 | gpu=num_gpu, 180 | mem=120000, 181 | time="14-0:0:0", 182 | ) 183 | jobs.append(j) 184 | 185 | slurm.multi_run( 186 | jobs, max_proc=options.max_proc, verbose=True, launch_sleep=10, update_sleep=60 187 | ) 188 | 189 | ####################################################### 190 | # collect output 191 | 192 | collect_h5(options.out_dir, options.processes, "grads") 193 | 194 | # for pi in range(options.processes): 195 | # shutil.rmtree('%s/job%d' % (options.out_dir,pi)) 196 | 197 | 198 | def collect_h5(out_dir, num_procs, sad_stat): 199 | h5_file = "scores.h5" 200 | 201 | # count sequences 202 | num_seqs = 0 203 | for pi in range(num_procs): 204 | # open job 205 | job_h5_file = "%s/job%d/%s" % (out_dir, pi, h5_file) 206 | job_h5_open = h5py.File(job_h5_file, "r") 207 | num_seqs += job_h5_open[sad_stat].shape[0] 208 | seq_len = job_h5_open[sad_stat].shape[1] 209 | num_targets = job_h5_open[sad_stat].shape[-1] 210 | job_h5_open.close() 211 | 212 | # initialize final h5 213 | final_h5_file = "%s/%s" % (out_dir, h5_file) 214 | final_h5_open = h5py.File(final_h5_file, "w") 215 | 216 | # keep dict for string values 217 | final_strings = {} 218 | 219 | job0_h5_file = "%s/job0/%s" % (out_dir, h5_file) 220 | job0_h5_open = h5py.File(job0_h5_file, "r") 221 | for key in job0_h5_open.keys(): 222 | key_shape = list(job0_h5_open[key].shape) 223 | key_shape[0] = num_seqs 224 | key_shape = tuple(key_shape) 225 | if job0_h5_open[key].dtype.char == "S": 226 | final_strings[key] = [] 227 | else: 228 | final_h5_open.create_dataset( 229 | key, shape=key_shape, dtype=job0_h5_open[key].dtype 230 | ) 231 | 232 | # set values 233 | si = 0 234 | for pi in range(num_procs): 235 | # open job 236 | job_h5_file = "%s/job%d/%s" % (out_dir, pi, h5_file) 237 | job_h5_open = h5py.File(job_h5_file, "r") 238 | 239 | # append to final 240 | for key in job_h5_open.keys(): 241 | job_seqs = job_h5_open[key].shape[0] 242 | if job_h5_open[key].dtype.char == "S": 243 | final_strings[key] += list(job_h5_open[key]) 244 | else: 245 | final_h5_open[key][si : si + job_seqs] = job_h5_open[key] 246 | 247 | job_h5_open.close() 248 | si += job_seqs 249 | 250 | # create final string datasets 251 | for key in final_strings: 252 | final_h5_open.create_dataset(key, data=np.array(final_strings[key], dtype="S")) 253 | 254 | final_h5_open.close() 255 | 256 | 257 | def job_completed(options, pi): 258 | """Check whether a specific job has generated its 259 | output file.""" 260 | out_file = "%s/job%d/scores.h5" % (options.out_dir, pi) 261 | valid_file = True 262 | if not os.path.isfile(out_file): 263 | valid_file = False 264 | else: 265 | try: 266 | out_open = h5py.File(out_file, "r") 267 | except OSError: 268 | valid_file = False 269 | return valid_file 270 | 271 | 272 | ################################################################################ 273 | # __main__ 274 | ################################################################################ 275 | if __name__ == "__main__": 276 | main() 277 | 
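The _multi driver above pickles its parsed options to options.pkl in the output directory and launches one borzoi_satg_gene.py worker per process, appending that worker's process index as the final command-line argument. A minimal sketch, assuming this positional layout (inferred from the "borzoi_satg_gene.py %s %s %d" command string built in main() above; borzoi_satg_gene.py itself is not shown in this dump), of how a worker could recover its state:

import pickle
import sys

def load_worker_state(argv):
    # Positional layout is an assumption read off the driver's command string:
    # options.pkl first, the original positional arguments in the middle,
    # and the worker's process index (pi) last.
    options_pkl_file = argv[0]
    worker_args = argv[1:-1]
    worker_index = int(argv[-1])
    with open(options_pkl_file, "rb") as options_pkl:
        options = pickle.load(options_pkl)
    return options, worker_args, worker_index

if __name__ == "__main__":
    options, worker_args, worker_index = load_worker_state(sys.argv[1:])
    print("worker %d options: %s" % (worker_index, options))

Each worker then writes its own job{pi}/scores.h5, which collect_h5 above concatenates along the sequence axis into a single scores.h5.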
-------------------------------------------------------------------------------- /src/scripts/borzoi_bench_trip_folds.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2019 Calico LLC 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | 8 | # https://www.apache.org/licenses/LICENSE-2.0 9 | 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ========================================================================= 16 | from optparse import OptionParser, OptionGroup 17 | import os 18 | 19 | import slurm 20 | 21 | """ 22 | borzoi_bench_trip_folds.py 23 | 24 | Benchmark Borzoi model replicates on the TRIP prediction task. 25 | """ 26 | 27 | ################################################################################ 28 | # main 29 | ################################################################################ 30 | def main(): 31 | usage = "usage: %prog [options] " 32 | parser = OptionParser(usage) 33 | 34 | # trip 35 | trip_options = OptionGroup(parser, "borzoi_trip.py options") 36 | trip_options.add_option( 37 | "-f", 38 | dest="genome_fasta", 39 | default="%s/assembly/ucsc/hg38.fa" % os.environ.get('BORZOI_HG38', 'hg38'), 40 | help="Genome FASTA for sequences [Default: %default]", 41 | ) 42 | trip_options.add_option( 43 | "-o", 44 | dest="out_dir", 45 | default="trip", 46 | help="Output directory for tables and plots [Default: %default]", 47 | ) 48 | trip_options.add_option( 49 | "--site", 50 | dest="site", 51 | default=False, 52 | action="store_true", 53 | help="Return the insertion site without the promoter [Default: %default]", 54 | ) 55 | trip_options.add_option( 56 | "--reporter", 57 | dest="reporter", 58 | default=False, 59 | action="store_true", 60 | help="Insert the flanking piggyback reporter with the promoter [Default: %default]", 61 | ) 62 | trip_options.add_option( 63 | "--reporter_bare", 64 | dest="reporter_bare", 65 | default=False, 66 | action="store_true", 67 | help="Insert the flanking piggyback reporter with the promoter (no terminal repeats) [Default: %default]", 68 | ) 69 | trip_options.add_option( 70 | "--rc", 71 | dest="rc", 72 | default=False, 73 | action="store_true", 74 | help="Average forward and reverse complement predictions [Default: %default]", 75 | ) 76 | trip_options.add_option( 77 | "--shifts", 78 | dest="shifts", 79 | default="0", 80 | type="str", 81 | help="Ensemble prediction shifts [Default: %default]", 82 | ) 83 | trip_options.add_option( 84 | "-t", 85 | dest="targets_file", 86 | default=None, 87 | type="str", 88 | help="File specifying target indexes and labels in table format", 89 | ) 90 | parser.add_option_group(trip_options) 91 | 92 | # cross-fold 93 | fold_options = OptionGroup(parser, "cross-fold options") 94 | fold_options.add_option( 95 | "-c", 96 | dest="crosses", 97 | default=1, 98 | type="int", 99 | help="Number of cross-fold rounds [Default: %default]", 100 | ) 101 | fold_options.add_option( 102 | "--folds", 103 | dest="fold_subset", 104 | default=1, 105 | type="int", 106 | help="Run a subset of folds [Default: %default]", 107 | ) 108
| fold_options.add_option( 109 | "--f_list", 110 | dest="fold_subset_list", 111 | default=None, 112 | help="Run a subset of folds (encoded as comma-separated string) [Default:%default]", 113 | ) 114 | fold_options.add_option( 115 | "-d", 116 | dest="data_head", 117 | default=None, 118 | type="int", 119 | help="Index for dataset/head [Default: %default]", 120 | ) 121 | fold_options.add_option( 122 | "-e", 123 | dest="conda_env", 124 | default="tf210", 125 | help="Anaconda environment [Default: %default]", 126 | ) 127 | fold_options.add_option( 128 | "--name", 129 | dest="name", 130 | default="trip", 131 | help="SLURM name prefix [Default: %default]", 132 | ) 133 | fold_options.add_option( 134 | "--max_proc", 135 | dest="max_proc", 136 | default=None, 137 | type="int", 138 | help="Maximum concurrent processes [Default: %default]", 139 | ) 140 | fold_options.add_option( 141 | "-q", 142 | dest="queue", 143 | default="geforce", 144 | help="SLURM queue on which to run the jobs [Default: %default]", 145 | ) 146 | fold_options.add_option( 147 | "-r", 148 | dest="restart", 149 | default=False, 150 | action="store_true", 151 | help="Restart a partially completed job [Default: %default]", 152 | ) 153 | parser.add_option_group(fold_options) 154 | 155 | (options, args) = parser.parse_args() 156 | 157 | if len(args) != 4: 158 | print(len(args)) 159 | print(args) 160 | parser.error( 161 | "Must provide parameters file, cross-fold directory, TRIP promoter sequences, and TRIP insertion sites" 162 | ) 163 | else: 164 | params_file = args[0] 165 | exp_dir = args[1] 166 | promoters_file = args[2] 167 | insertions_file = args[3] 168 | 169 | ####################################################### 170 | # prep work 171 | 172 | # set folds 173 | num_folds = 1 174 | if options.fold_subset is not None: 175 | num_folds = options.fold_subset 176 | 177 | fold_index = [fold_i for fold_i in range(num_folds)] 178 | 179 | # subset folds (list) 180 | if options.fold_subset_list is not None: 181 | fold_index = [int(fold_str) for fold_str in options.fold_subset_list.split(",")] 182 | 183 | ################################################################ 184 | # TRIP prediction jobs 185 | 186 | # command base 187 | cmd_base = ('. 
%s; ' % os.environ['BORZOI_CONDA']) if 'BORZOI_CONDA' in os.environ else '' 188 | cmd_base += "conda activate %s;" % options.conda_env 189 | cmd_base += " echo $HOSTNAME;" 190 | 191 | jobs = [] 192 | 193 | for ci in range(options.crosses): 194 | for fi in fold_index: 195 | it_dir = "%s/f%dc%d" % (exp_dir, fi, ci) 196 | name = "%s-f%dc%d" % (options.name, fi, ci) 197 | 198 | # update output directory 199 | it_out_dir = "%s/%s" % (it_dir, options.out_dir) 200 | os.makedirs(it_out_dir, exist_ok=True) 201 | 202 | model_file = "%s/train/model_best.h5" % it_dir 203 | if options.data_head is not None: 204 | model_file = "%s/train/model%d_best.h5" % (it_dir, options.data_head) 205 | 206 | cmd_fold = "%s time borzoi_trip.py %s %s %s %s" % ( 207 | cmd_base, 208 | params_file, 209 | model_file, 210 | promoters_file, 211 | insertions_file, 212 | ) 213 | 214 | # TRIP job 215 | job_out_dir = it_out_dir 216 | if not options.restart or not os.path.isfile("%s/preds.h5" % job_out_dir): 217 | cmd_job = cmd_fold 218 | cmd_job += " %s" % options_string(options, trip_options, job_out_dir) 219 | j = slurm.Job( 220 | cmd_job, 221 | name, 222 | "%s.out" % job_out_dir, 223 | "%s.err" % job_out_dir, 224 | queue=options.queue, 225 | gpu=1, 226 | mem=60000, 227 | time="7-0:0:0", 228 | ) 229 | jobs.append(j) 230 | 231 | slurm.multi_run( 232 | jobs, max_proc=options.max_proc, verbose=True, launch_sleep=10, update_sleep=60 233 | ) 234 | 235 | 236 | def options_string(options, group_options, rep_dir): 237 | options_str = "" 238 | 239 | for opt in group_options.option_list: 240 | opt_str = opt.get_opt_string() 241 | opt_value = options.__dict__[opt.dest] 242 | 243 | # wrap asterisks in "" 244 | if type(opt_value) == str and opt_value.find("*") != -1: 245 | opt_value = '"%s"' % opt_value 246 | 247 | # no value for bools 248 | elif type(opt_value) == bool: 249 | if not opt_value: 250 | opt_str = "" 251 | opt_value = "" 252 | 253 | # skip Nones 254 | elif opt_value is None: 255 | opt_str = "" 256 | opt_value = "" 257 | 258 | # modify 259 | elif opt.dest == "out_dir": 260 | opt_value = rep_dir 261 | 262 | options_str += " %s %s" % (opt_str, opt_value) 263 | 264 | return options_str 265 | 266 | 267 | ################################################################################ 268 | # __main__ 269 | ################################################################################ 270 | if __name__ == "__main__": 271 | main() 272 | -------------------------------------------------------------------------------- /src/scripts/borzoi_sad_folds.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2019 Calico LLC 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | 8 | # https://www.apache.org/licenses/LICENSE-2.0 9 | 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # ========================================================================= 16 | from optparse import OptionParser, OptionGroup 17 | import glob 18 | import h5py 19 | import json 20 | import pdb 21 | import os 22 | import sys 23 | 24 | import numpy as np 25 | import pandas as pd 26 | 27 | import slurm 28 | 29 | """ 30 | borzoi_sad_folds.py 31 | 32 | Compute SAD scores across model folds. 33 | """ 34 | 35 | ################################################################################ 36 | # main 37 | ################################################################################ 38 | def main(): 39 | usage = 'usage: %prog [options] <params_file> <exp_dir>' 40 | parser = OptionParser(usage) 41 | 42 | # sad 43 | sad_options = OptionGroup(parser, 'borzoi_sad.py options') 44 | sad_options.add_option( 45 | '-f', 46 | dest='genome_fasta', 47 | default='%s/assembly/ucsc/hg38.fa' % os.environ.get('BORZOI_HG38', 'hg38'), 48 | help='Genome FASTA for sequences [Default: %default]', 49 | ) 50 | sad_options.add_option( 51 | '-o', 52 | dest='out_dir', 53 | default='sad', 54 | help='Output directory for tables and plots [Default: %default]' 55 | ) 56 | sad_options.add_option( 57 | '-p', 58 | dest='processes', 59 | default=None, 60 | type='int', 61 | help='Number of processes, passed by multi script' 62 | ) 63 | sad_options.add_option( 64 | '--rc', 65 | dest='rc', 66 | default=False, 67 | action='store_true', 68 | help='Average forward and reverse complement predictions [Default: %default]' 69 | ) 70 | sad_options.add_option( 71 | '--shifts', dest='shifts', 72 | default='0', 73 | type='str', 74 | help='Ensemble prediction shifts [Default: %default]' 75 | ) 76 | sad_options.add_option( 77 | '--stats', 78 | dest='sad_stats', 79 | default='SAD', 80 | help='Comma-separated list of stats to save. 
[Default: %default]' 81 | ) 82 | sad_options.add_option( 83 | '-t', 84 | dest='targets_file', 85 | default=None, 86 | type='str', 87 | help='File specifying target indexes and labels in table format' 88 | ) 89 | sad_options.add_option( 90 | '-u', 91 | dest='untransform_old', 92 | default=False, 93 | action='store_true', 94 | ) 95 | sad_options.add_option( 96 | '--no_untransform', 97 | dest='no_untransform', 98 | default=False, 99 | action='store_true', 100 | ) 101 | parser.add_option_group(sad_options) 102 | 103 | # cross-fold 104 | fold_options = OptionGroup(parser, 'cross-fold options') 105 | fold_options.add_option( 106 | '-c', 107 | dest='crosses', 108 | default=1, 109 | type='int', 110 | help='Number of cross-fold rounds [Default:%default]', 111 | ) 112 | fold_options.add_option( 113 | '--folds', 114 | dest='fold_subset', 115 | default=1, 116 | type='int', 117 | help='Run a subset of folds [Default:%default]', 118 | ) 119 | fold_options.add_option( 120 | '--f_list', 121 | dest='fold_subset_list', 122 | default=None, 123 | help='Run a subset of folds (encoded as comma-separated string) [Default:%default]', 124 | ) 125 | fold_options.add_option( 126 | '-d', 127 | dest='data_head', 128 | default=None, 129 | type='int', 130 | help='Index for dataset/head [Default: %default]' 131 | ) 132 | fold_options.add_option( 133 | '-e', 134 | dest='conda_env', 135 | default='tf210', 136 | help='Anaconda environment [Default: %default]' 137 | ) 138 | fold_options.add_option( 139 | '--name', 140 | dest='name', 141 | default='sad', 142 | help='SLURM name prefix [Default: %default]' 143 | ) 144 | fold_options.add_option( 145 | '--max_proc', 146 | dest='max_proc', 147 | default=None, 148 | type='int', 149 | help='Maximum concurrent processes [Default: %default]' 150 | ) 151 | fold_options.add_option( 152 | '-q', 153 | dest='queue', 154 | default='geforce', 155 | help='SLURM queue on which to run the jobs [Default: %default]' 156 | ) 157 | fold_options.add_option( 158 | '-r', 159 | dest='restart', 160 | default=False, 161 | action='store_true', 162 | help='Restart a partially completed job [Default: %default]' 163 | ) 164 | fold_options.add_option( 165 | '--vcf', 166 | dest='vcf_file', 167 | default='/home/jlinder/seqnn/data/satmutmpra/satmutmpra_v1.vcf' 168 | ) 169 | parser.add_option_group(fold_options) 170 | 171 | (options, args) = parser.parse_args() 172 | 173 | if len(args) != 2: 174 | parser.error('Must provide parameters file and cross-fold directory') 175 | else: 176 | params_file = args[0] 177 | exp_dir = args[1] 178 | 179 | ####################################################### 180 | # prep work 181 | 182 | # set folds 183 | num_folds = 1 184 | if options.fold_subset is not None: 185 | num_folds = options.fold_subset 186 | 187 | fold_index = [fold_i for fold_i in range(num_folds)] 188 | 189 | # subset folds (list) 190 | if options.fold_subset_list is not None: 191 | fold_index = [int(fold_str) for fold_str in options.fold_subset_list.split(",")] 192 | 193 | ################################################################ 194 | # SNP scores 195 | 196 | # command base 197 | cmd_base = ('. 
%s; ' % os.environ['BORZOI_CONDA']) if 'BORZOI_CONDA' in os.environ else '' 198 | cmd_base += 'conda activate %s;' % options.conda_env 199 | cmd_base += ' echo $HOSTNAME;' 200 | 201 | jobs = [] 202 | 203 | for ci in range(options.crosses): 204 | for fi in fold_index: 205 | it_dir = '%s/f%dc%d' % (exp_dir, fi, ci) 206 | name = '%s-f%dc%d' % (options.name, fi, ci) 207 | 208 | # update output directory 209 | it_out_dir = '%s/%s' % (it_dir, options.out_dir) 210 | os.makedirs(it_out_dir, exist_ok=True) 211 | 212 | model_file = '%s/train/model_best.h5' % it_dir 213 | if options.data_head is not None: 214 | model_file = '%s/train/model%d_best.h5' % (it_dir, options.data_head) 215 | 216 | cmd_fold = '%s time borzoi_sad.py %s %s' % (cmd_base, params_file, model_file) 217 | 218 | # variant scoring job 219 | job_out_dir = it_out_dir 220 | if not options.restart or not os.path.isfile('%s/sad.h5'%job_out_dir): 221 | cmd_job = '%s %s' % (cmd_fold, options.vcf_file) 222 | cmd_job += ' %s' % options_string(options, sad_options, job_out_dir) 223 | j = slurm.Job(cmd_job, '%s' % name, 224 | '%s.out'%job_out_dir, '%s.err'%job_out_dir, '%s.sb'%job_out_dir, 225 | queue=options.queue, gpu=1, 226 | mem=45000, time='30-0:0:0') 227 | jobs.append(j) 228 | 229 | slurm.multi_run(jobs, max_proc=options.max_proc, verbose=True, 230 | launch_sleep=10, update_sleep=60) 231 | 232 | def options_string(options, group_options, rep_dir): 233 | options_str = '' 234 | 235 | for opt in group_options.option_list: 236 | opt_str = opt.get_opt_string() 237 | opt_value = options.__dict__[opt.dest] 238 | 239 | # wrap asterisks in "" 240 | if type(opt_value) == str and opt_value.find('*') != -1: 241 | opt_value = '"%s"' % opt_value 242 | 243 | # no value for bools 244 | elif type(opt_value) == bool: 245 | if not opt_value: 246 | opt_str = '' 247 | opt_value = '' 248 | 249 | # skip Nones 250 | elif opt_value is None: 251 | opt_str = '' 252 | opt_value = '' 253 | 254 | # modify 255 | elif opt.dest == 'out_dir': 256 | opt_value = rep_dir 257 | 258 | options_str += ' %s %s' % (opt_str, opt_value) 259 | 260 | return options_str 261 | 262 | ################################################################################ 263 | # __main__ 264 | ################################################################################ 265 | if __name__ == '__main__': 266 | main() 267 | -------------------------------------------------------------------------------- /src/scripts/borzoi_sed_folds.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2019 Calico LLC 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | 8 | # https://www.apache.org/licenses/LICENSE-2.0 9 | 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # ========================================================================= 16 | from optparse import OptionParser, OptionGroup 17 | import glob 18 | import h5py 19 | import json 20 | import pdb 21 | import os 22 | import sys 23 | 24 | import numpy as np 25 | import pandas as pd 26 | 27 | import slurm 28 | 29 | """ 30 | borzoi_sed_folds.py 31 | 32 | Compute SED scores across model folds. 33 | """ 34 | 35 | ################################################################################ 36 | # main 37 | ################################################################################ 38 | def main(): 39 | usage = 'usage: %prog [options] <params_file> <exp_dir>' 40 | parser = OptionParser(usage) 41 | 42 | # sed 43 | sed_options = OptionGroup(parser, 'borzoi_sed.py options') 44 | sed_options.add_option( 45 | '-f', 46 | dest='genome_fasta', 47 | default='%s/assembly/ucsc/hg38.fa' % os.environ.get('BORZOI_HG38', 'hg38'), 48 | help='Genome FASTA for sequences [Default: %default]', 49 | ) 50 | sed_options.add_option( 51 | '-g', 52 | dest='genes_gtf', 53 | default='%s/genes/gencode41/gencode41_basic_nort.gtf' % os.environ.get('BORZOI_HG38', 'hg38'), 54 | help='GTF for gene definition [Default: %default]', 55 | ) 56 | sed_options.add_option( 57 | '-o', 58 | dest='out_dir', 59 | default='sed', 60 | help='Output directory for tables and plots [Default: %default]', 61 | ) 62 | sed_options.add_option( 63 | '-p', 64 | dest='processes', 65 | default=None, 66 | type='int', 67 | help='Number of processes, passed by multi script', 68 | ) 69 | sed_options.add_option( 70 | '--rc', 71 | dest='rc', 72 | default=False, 73 | action='store_true', 74 | help='Average forward and reverse complement predictions [Default: %default]', 75 | ) 76 | sed_options.add_option( 77 | '--shifts', 78 | dest='shifts', 79 | default='0', 80 | type='str', 81 | help='Ensemble prediction shifts [Default: %default]', 82 | ) 83 | sed_options.add_option( 84 | '--span', 85 | dest='span', 86 | default=False, 87 | action='store_true', 88 | help='Aggregate entire gene span [Default: %default]', 89 | ) 90 | sed_options.add_option( 91 | '--stats', 92 | dest='sed_stats', 93 | default='SED', 94 | help='Comma-separated list of stats to save. 
[Default: %default]', 95 | ) 96 | sed_options.add_option( 97 | '-t', 98 | dest='targets_file', 99 | default=None, 100 | type='str', 101 | help='File specifying target indexes and labels in table format', 102 | ) 103 | sed_options.add_option( 104 | '-u', 105 | dest='untransform_old', 106 | default=False, 107 | action='store_true', 108 | ) 109 | sed_options.add_option( 110 | '--no_untransform', 111 | dest='no_untransform', 112 | default=False, 113 | action='store_true', 114 | ) 115 | parser.add_option_group(sed_options) 116 | 117 | # cross-fold 118 | fold_options = OptionGroup(parser, 'cross-fold options') 119 | fold_options.add_option( 120 | '-c', 121 | dest='crosses', 122 | default=1, 123 | type='int', 124 | help='Number of cross-fold rounds [Default:%default]', 125 | ) 126 | fold_options.add_option( 127 | '--folds', 128 | dest='fold_subset', 129 | default=1, 130 | type='int', 131 | help='Run a subset of folds [Default:%default]', 132 | ) 133 | fold_options.add_option( 134 | '--f_list', 135 | dest='fold_subset_list', 136 | default=None, 137 | help='Run a subset of folds (encoded as comma-separated string) [Default:%default]', 138 | ) 139 | fold_options.add_option( 140 | '-d', 141 | dest='data_head', 142 | default=None, 143 | type='int', 144 | help='Index for dataset/head [Default: %default]', 145 | ) 146 | fold_options.add_option( 147 | '-e', 148 | dest='conda_env', 149 | default='tf210', 150 | help='Anaconda environment [Default: %default]', 151 | ) 152 | fold_options.add_option( 153 | '--name', 154 | dest='name', 155 | default='sed', 156 | help='SLURM name prefix [Default: %default]', 157 | ) 158 | fold_options.add_option( 159 | '--max_proc', 160 | dest='max_proc', 161 | default=None, 162 | type='int', 163 | help='Maximum concurrent processes [Default: %default]', 164 | ) 165 | fold_options.add_option( 166 | '-q', 167 | dest='queue', 168 | default='geforce', 169 | help='SLURM queue on which to run the jobs [Default: %default]', 170 | ) 171 | fold_options.add_option( 172 | '-r', 173 | dest='restart', 174 | default=False, 175 | action='store_true', 176 | help='Restart a partially completed job [Default: %default]', 177 | ) 178 | fold_options.add_option( 179 | '--vcf', 180 | dest='vcf_file', 181 | default='/home/drk/seqnn/data/gtex_fine/susie_pip90/pos_merge.vcf', 182 | ) 183 | parser.add_option_group(fold_options) 184 | 185 | (options, args) = parser.parse_args() 186 | 187 | if len(args) != 2: 188 | parser.error('Must provide parameters file and cross-fold directory') 189 | else: 190 | params_file = args[0] 191 | exp_dir = args[1] 192 | 193 | ####################################################### 194 | # prep work 195 | 196 | # set folds 197 | num_folds = 1 198 | if options.fold_subset is not None: 199 | num_folds = options.fold_subset 200 | 201 | fold_index = [fold_i for fold_i in range(num_folds)] 202 | 203 | # subset folds (list) 204 | if options.fold_subset_list is not None: 205 | fold_index = [int(fold_str) for fold_str in options.fold_subset_list.split(",")] 206 | 207 | ################################################################ 208 | # SNP scores 209 | 210 | # command base 211 | cmd_base = ('. 
%s; ' % os.environ['BORZOI_CONDA']) if 'BORZOI_CONDA' in os.environ else '' 212 | cmd_base += 'conda activate %s;' % options.conda_env 213 | cmd_base += ' echo $HOSTNAME;' 214 | 215 | jobs = [] 216 | 217 | for ci in range(options.crosses): 218 | for fi in fold_index: 219 | it_dir = '%s/f%dc%d' % (exp_dir, fi, ci) 220 | name = '%s-f%dc%d' % (options.name, fi, ci) 221 | 222 | # update output directory 223 | it_out_dir = '%s/%s' % (it_dir, options.out_dir) 224 | os.makedirs(it_out_dir, exist_ok=True) 225 | 226 | model_file = '%s/train/model_best.h5' % it_dir 227 | if options.data_head is not None: 228 | model_file = '%s/train/model%d_best.h5' % (it_dir, options.data_head) 229 | 230 | cmd_fold = '%s time borzoi_sed.py %s %s' % (cmd_base, params_file, model_file) 231 | 232 | # variant scoring job 233 | job_out_dir = it_out_dir 234 | if not options.restart or not os.path.isfile('%s/sed.h5'%job_out_dir): 235 | cmd_job = '%s %s' % (cmd_fold, options.vcf_file) 236 | cmd_job += ' %s' % options_string(options, sed_options, job_out_dir) 237 | j = slurm.Job(cmd_job, '%s' % name, 238 | '%s.out'%job_out_dir, '%s.err'%job_out_dir, '%s.sb'%job_out_dir, 239 | queue=options.queue, gpu=1, 240 | mem=60000, time='30-0:0:0') 241 | jobs.append(j) 242 | 243 | slurm.multi_run(jobs, max_proc=options.max_proc, verbose=True, 244 | launch_sleep=10, update_sleep=60) 245 | 246 | def options_string(options, group_options, rep_dir): 247 | options_str = '' 248 | 249 | for opt in group_options.option_list: 250 | opt_str = opt.get_opt_string() 251 | opt_value = options.__dict__[opt.dest] 252 | 253 | # wrap asterisks in "" 254 | if type(opt_value) == str and opt_value.find('*') != -1: 255 | opt_value = '"%s"' % opt_value 256 | 257 | # no value for bools 258 | elif type(opt_value) == bool: 259 | if not opt_value: 260 | opt_str = '' 261 | opt_value = '' 262 | 263 | # skip Nones 264 | elif opt_value is None: 265 | opt_str = '' 266 | opt_value = '' 267 | 268 | # modify 269 | elif opt.dest == 'out_dir': 270 | opt_value = rep_dir 271 | 272 | options_str += ' %s %s' % (opt_str, opt_value) 273 | 274 | return options_str 275 | 276 | ################################################################################ 277 | # __main__ 278 | ################################################################################ 279 | if __name__ == '__main__': 280 | main() 281 | -------------------------------------------------------------------------------- /src/scripts/borzoi_test_apa_folds.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2019 Calico LLC 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | 8 | # https://www.apache.org/licenses/LICENSE-2.0 9 | 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ========================================================================= 16 | from optparse import OptionParser 17 | import json 18 | import os 19 | 20 | import slurm 21 | 22 | """ 23 | borzoi_test_apa_folds.py 24 | 25 | Measure accuracy at the polyadenylation-site level for multiple model replicates. 
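Example invocation (illustrative paths): borzoi_test_apa_folds.py -f 4 -o models/exp params.json data/apa 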
26 | """ 27 | 28 | ################################################################################ 29 | # main 30 | ################################################################################ 31 | def main(): 32 | usage = "usage: %prog [options] ..." 33 | parser = OptionParser(usage) 34 | parser.add_option( 35 | "-c", 36 | dest="crosses", 37 | default=1, 38 | type="int", 39 | help="Number of cross-fold rounds [Default:%default]", 40 | ) 41 | parser.add_option( 42 | "-d", 43 | dest="dataset_i", 44 | default=None, 45 | type="int", 46 | help="Dataset index [Default:%default]", 47 | ) 48 | parser.add_option( 49 | "-e", 50 | dest="conda_env", 51 | default="tf210", 52 | help="Anaconda environment [Default: %default]", 53 | ) 54 | parser.add_option( 55 | "-f", 56 | dest="fold_subset", 57 | default=None, 58 | type="int", 59 | help="Run a subset of folds [Default:%default]", 60 | ) 61 | parser.add_option( 62 | "--f_list", 63 | dest="fold_subset_list", 64 | default=None, 65 | help="Run a subset of folds (encoded as comma-separated string) [Default:%default]", 66 | ) 67 | parser.add_option( 68 | "-g", 69 | dest="apa_file", 70 | default="%s/genes/polyadb/polyadb_human_v3.csv.gz" % os.environ.get('BORZOI_HG38', 'hg38'), 71 | help="Csv for polya site definition [Default %default]", 72 | ) 73 | parser.add_option( 74 | "--name", 75 | dest="name", 76 | default="teste", 77 | help="SLURM name prefix [Default: %default]", 78 | ) 79 | parser.add_option( 80 | "-o", 81 | dest="exp_dir", 82 | default=None, 83 | help="Output experiment directory [Default: %default]", 84 | ) 85 | parser.add_option( 86 | "-q", 87 | dest="queue", 88 | default="geforce" 89 | ) 90 | parser.add_option( 91 | "--rc", 92 | dest="rc", 93 | default=False, 94 | action="store_true", 95 | help="Average forward and reverse complement predictions [Default: %default]", 96 | ) 97 | parser.add_option( 98 | "--shifts", 99 | dest="shifts", 100 | default="0", 101 | type="str", 102 | help="Ensemble prediction shifts [Default: %default]", 103 | ) 104 | parser.add_option( 105 | "-t", 106 | dest="targets_file", 107 | default=None, 108 | type="str", 109 | help="File specifying target indexes and labels in table format", 110 | ) 111 | parser.add_option( 112 | "-u", 113 | dest="untransform_old", 114 | default=False, 115 | action="store_true", 116 | help="Untransform old models [Default: %default]", 117 | ) 118 | (options, args) = parser.parse_args() 119 | 120 | if len(args) < 2: 121 | parser.error("Must provide parameters file and data directory") 122 | else: 123 | params_file = args[0] 124 | data_dirs = [os.path.abspath(arg) for arg in args[1:]] 125 | 126 | # using -o for required argument for compatibility with the training script 127 | assert options.exp_dir is not None 128 | 129 | # read data parameters 130 | data_stats_file = "%s/statistics.json" % data_dirs[0] 131 | with open(data_stats_file) as data_stats_open: 132 | data_stats = json.load(data_stats_open) 133 | 134 | if options.dataset_i is None: 135 | head_i = 0 136 | else: 137 | head_i = options.dataset_i 138 | 139 | # count folds 140 | num_folds = len([dkey for dkey in data_stats if dkey.startswith("fold")]) 141 | 142 | # subset folds 143 | if options.fold_subset is not None: 144 | num_folds = min(options.fold_subset, num_folds) 145 | 146 | fold_index = [fold_i for fold_i in range(num_folds)] 147 | 148 | # subset folds (list) 149 | if options.fold_subset_list is not None: 150 | fold_index = [int(fold_str) for fold_str in options.fold_subset_list.split(",")] 151 | 152 | if options.queue == 
"standard": 153 | num_cpu = 4 154 | num_gpu = 0 155 | else: 156 | num_cpu = 2 157 | num_gpu = 1 158 | 159 | ################################################################ 160 | # test best 161 | ################################################################ 162 | jobs = [] 163 | 164 | for ci in range(options.crosses): 165 | for fi in fold_index: 166 | it_dir = "%s/f%dc%d" % (options.exp_dir, fi, ci) 167 | 168 | if options.dataset_i is None: 169 | out_dir = "%s/teste" % it_dir 170 | model_file = "%s/train/model_best.h5" % it_dir 171 | else: 172 | out_dir = "%s/teste%d" % (it_dir, options.dataset_i) 173 | model_file = "%s/train/model%d_best.h5" % (it_dir, options.dataset_i) 174 | 175 | # check if done 176 | acc_file = "%s/apa_preds.tsv.gz" % out_dir 177 | if os.path.isfile(acc_file): 178 | # print('%s already generated.' % acc_file) 179 | pass 180 | else: 181 | # evaluate 182 | cmd = ('. %s; ' % os.environ['BORZOI_CONDA']) if 'BORZOI_CONDA' in os.environ else '' 183 | cmd += "conda activate %s;" % options.conda_env 184 | cmd += " time borzoi_test_apa.py" 185 | cmd += " --head %d" % head_i 186 | cmd += " -o %s" % out_dir 187 | if options.rc: 188 | cmd += " --rc" 189 | if options.shifts: 190 | cmd += " --shifts %s" % options.shifts 191 | if options.targets_file is not None: 192 | cmd += " -t %s" % options.targets_file 193 | if options.untransform_old: 194 | cmd += " -u" 195 | cmd += " %s" % params_file 196 | cmd += " %s" % model_file 197 | cmd += " %s/data%d" % (it_dir, head_i) 198 | cmd += " %s" % options.apa_file 199 | 200 | name = "%s-f%dc%d" % (options.name, fi, ci) 201 | j = slurm.Job( 202 | cmd, 203 | name=name, 204 | out_file="%s.out" % out_dir, 205 | err_file="%s.err" % out_dir, 206 | queue=options.queue, 207 | cpu=num_cpu, 208 | gpu=num_gpu, 209 | mem=45000, 210 | time="2-00:00:00", 211 | ) 212 | jobs.append(j) 213 | 214 | slurm.multi_run(jobs, verbose=True) 215 | 216 | 217 | ################################################################################ 218 | # __main__ 219 | ################################################################################ 220 | if __name__ == "__main__": 221 | main() 222 | -------------------------------------------------------------------------------- /src/scripts/borzoi_test_tss_folds.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2019 Calico LLC 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | 8 | # https://www.apache.org/licenses/LICENSE-2.0 9 | 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ========================================================================= 16 | from optparse import OptionParser 17 | import json 18 | import os 19 | 20 | import slurm 21 | 22 | """ 23 | borzoi_test_tss_folds.py 24 | 25 | Measure accuracy at TSS-level for multiple model replicates. 26 | """ 27 | 28 | ################################################################################ 29 | # main 30 | ################################################################################ 31 | def main(): 32 | usage = 'usage: %prog [options] ...' 
33 | parser = OptionParser(usage) 34 | parser.add_option( 35 | '-c', 36 | dest='crosses', 37 | default=1, 38 | type='int', 39 | help='Number of cross-fold rounds [Default:%default]', 40 | ) 41 | parser.add_option( 42 | '-d', 43 | dest='dataset_i', 44 | default=None, 45 | type='int', 46 | help='Dataset index [Default:%default]', 47 | ) 48 | parser.add_option( 49 | '-e', 50 | dest='conda_env', 51 | default='tf210', 52 | help='Anaconda environment [Default: %default]', 53 | ) 54 | parser.add_option( 55 | '-f', 56 | dest='fold_subset', 57 | default=None, 58 | type='int', 59 | help='Run a subset of folds [Default:%default]', 60 | ) 61 | parser.add_option( 62 | '--f_list', 63 | dest='fold_subset_list', 64 | default=None, 65 | help='Run a subset of folds (encoded as comma-separated string) [Default:%default]', 66 | ) 67 | parser.add_option( 68 | '-g', 69 | dest='tss_file', 70 | default='%s/genes/gencode41/gencode41_basic_tss2.bed' % os.environ.get('BORZOI_HG38', 'hg38'), 71 | help='BED file for TSS definition [Default: %default]', 72 | ) 73 | parser.add_option( 74 | '--name', 75 | dest='name', 76 | default='teste', 77 | help='SLURM name prefix [Default: %default]', 78 | ) 79 | parser.add_option( 80 | '-o', 81 | dest='exp_dir', 82 | default=None, 83 | help='Output experiment directory [Default: %default]', 84 | ) 85 | parser.add_option( 86 | '-q', 87 | dest='queue', 88 | default='geforce', 89 | ) 90 | parser.add_option( 91 | '--rc', 92 | dest='rc', 93 | default=False, 94 | action='store_true', 95 | help='Average forward and reverse complement predictions [Default: %default]', 96 | ) 97 | parser.add_option( 98 | '--shifts', 99 | dest='shifts', 100 | default='0', 101 | type='str', 102 | help='Ensemble prediction shifts [Default: %default]', 103 | ) 104 | parser.add_option( 105 | '--windowcov', 106 | dest='windowcov', 107 | default=4, 108 | type='int', 109 | help='Coverage bin window size [Default: %default]', 110 | ) 111 | parser.add_option( 112 | '--maxcov', 113 | dest='maxcov', 114 | default=False, 115 | action='store_true', 116 | help='Store max instead of avg bin value in local window [Default: %default]', 117 | ) 118 | parser.add_option( 119 | '-t', 120 | dest='targets_file', 121 | default=None, 122 | type='str', 123 | help='File specifying target indexes and labels in table format', 124 | ) 125 | parser.add_option( 126 | '-u', 127 | dest='untransform_old', 128 | default=False, 129 | action='store_true', 130 | help='Untransform old models [Default: %default]', 131 | ) 132 | (options, args) = parser.parse_args() 133 | 134 | if len(args) < 2: 135 | parser.error('Must provide parameters file and data directory') 136 | else: 137 | params_file = args[0] 138 | data_dirs = [os.path.abspath(arg) for arg in args[1:]] 139 | 140 | # using -o as a required argument for compatibility with the training script 141 | assert(options.exp_dir is not None) 142 | 143 | # read data parameters 144 | data_stats_file = '%s/statistics.json' % data_dirs[0] 145 | with open(data_stats_file) as data_stats_open: 146 | data_stats = json.load(data_stats_open) 147 | 148 | if options.dataset_i is None: 149 | head_i = 0 150 | else: 151 | head_i = options.dataset_i 152 | 153 | # count folds 154 | num_folds = len([dkey for dkey in data_stats if dkey.startswith("fold")]) 155 | 156 | # subset folds 157 | if options.fold_subset is not None: 158 | num_folds = min(options.fold_subset, num_folds) 159 | 160 | fold_index = [fold_i for fold_i in range(num_folds)] 161 | 162 | # subset folds (list) 163 | if options.fold_subset_list is not None: 164 
| fold_index = [int(fold_str) for fold_str in options.fold_subset_list.split(",")] 165 | 166 | if options.queue == 'standard': 167 | num_cpu = 4 168 | num_gpu = 0 169 | else: 170 | num_cpu = 2 171 | num_gpu = 1 172 | 173 | ################################################################ 174 | # test best 175 | ################################################################ 176 | jobs = [] 177 | 178 | for ci in range(options.crosses): 179 | for fi in fold_index: 180 | it_dir = '%s/f%dc%d' % (options.exp_dir, fi, ci) 181 | 182 | max_str = '' 183 | if options.maxcov: 184 | max_str = 'max' 185 | 186 | windowcov_str = '' 187 | if options.windowcov != 4: 188 | windowcov_str = 'w' + str(options.windowcov) 189 | 190 | if options.dataset_i is None: 191 | out_dir = '%s/testetss%s%s' % (it_dir, max_str, windowcov_str) 192 | model_file = '%s/train/model_best.h5' % it_dir 193 | else: 194 | out_dir = '%s/testetss%s%s%d' % (it_dir, max_str, windowcov_str, options.dataset_i) 195 | model_file = '%s/train/model%d_best.h5' % (it_dir, options.dataset_i) 196 | 197 | # check if done 198 | acc_file = '%s/tss_preds.tsv.gz' % out_dir 199 | if os.path.isfile(acc_file): 200 | # print('%s already generated.' % acc_file) 201 | pass 202 | else: 203 | # evaluate 204 | cmd = ('. %s; ' % os.environ['BORZOI_CONDA']) if 'BORZOI_CONDA' in os.environ else '' 205 | cmd += 'conda activate %s;' % options.conda_env 206 | cmd += ' time borzoi_test_tss.py' 207 | cmd += ' --head %d' % head_i 208 | cmd += ' -o %s' % out_dir 209 | if options.rc: 210 | cmd += ' --rc' 211 | if options.shifts: 212 | cmd += ' --shifts %s' % options.shifts 213 | if options.windowcov != 4: 214 | cmd += ' --windowcov %d' % options.windowcov 215 | if options.maxcov: 216 | cmd += ' --maxcov' 217 | if options.targets_file is not None: 218 | cmd += ' -t %s' % options.targets_file 219 | if options.untransform_old: 220 | cmd += ' -u' 221 | cmd += ' %s' % params_file 222 | cmd += ' %s' % model_file 223 | cmd += ' %s/data%d' % (it_dir, head_i) 224 | cmd += ' %s' % options.tss_file 225 | 226 | name = '%s-f%dc%d' % (options.name, fi, ci) 227 | j = slurm.Job(cmd, 228 | name=name, 229 | out_file='%s.out'%out_dir, 230 | err_file='%s.err'%out_dir, 231 | queue=options.queue, 232 | cpu=num_cpu, gpu=num_gpu, 233 | mem=45000, 234 | time='2-00:00:00') 235 | jobs.append(j) 236 | 237 | slurm.multi_run(jobs, verbose=True) 238 | 239 | ################################################################################ 240 | # __main__ 241 | ################################################################################ 242 | if __name__ == '__main__': 243 | main() 244 | -------------------------------------------------------------------------------- /src/scripts/bw_h5.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import sys 4 | 5 | import h5py 6 | import numpy as np 7 | import pyBigWig 8 | import scipy.interpolate 9 | 10 | ''' 11 | bw_h5.py 12 | 13 | Convert a BigWig to HDF5. 
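Example invocation (illustrative paths): bw_h5.py -i -z sample.bw sample.w5 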
14 | ''' 15 | 16 | ################################################################################ 17 | # main 18 | ################################################################################ 19 | def main(): 20 | usage = 'usage: %prog [options] ' 21 | parser = OptionParser(usage) 22 | parser.add_option('-c', '--chr_strip', dest='chr_strip', 23 | default=False, action='store_true') 24 | parser.add_option('-i', dest='interp_nan', 25 | default=False, action='store_true', 26 | help='Interpolate NaNs [Default: %default]') 27 | parser.add_option('-m', dest='min_norm', 28 | default=False, action='store_true', 29 | help='Normalize the minimum nonzero value to 1 [Default: %default]') 30 | # parser.add_option('--mode_max', dest='mode_norm_max', 31 | # default=10, type='float', 32 | # help='Maximum norm scale value determined by mode [Default: %default]') 33 | parser.add_option('-s', dest='scale', 34 | default=1.0, type='float', 35 | help='Scale all values (e.g. to undo normalization) [Default: %default]') 36 | parser.add_option('-v', dest='verbose', 37 | default=False, action='store_true') 38 | parser.add_option('-z', dest='clip_zero', 39 | default=False, action='store_true', 40 | help='Clip negative values at zero [Default: %default]') 41 | (options,args) = parser.parse_args() 42 | 43 | if len(args) != 2: 44 | parser.error('Must provide input BigWig and output HDF5.') 45 | else: 46 | bw_file = args[0] 47 | hdf5_file = args[1] 48 | 49 | # open files 50 | bw_in = pyBigWig.open(bw_file) 51 | h5_out = h5py.File(hdf5_file, 'w') 52 | 53 | # process chromosomes in length order 54 | chrom_lengths = bw_in.chroms() 55 | chroms = sorted(chrom_lengths.keys()) 56 | length_chroms = [(chrom_lengths[chrm],chrm) for chrm in chroms] 57 | length_chroms = sorted(length_chroms)[::-1] 58 | min_factor = None 59 | 60 | # for each chromosome 61 | for clength, chrom in length_chroms: 62 | if options.verbose: 63 | print(chrom) 64 | 65 | # read values 66 | x = bw_in.values(chrom, 0, chrom_lengths[chrom], numpy=True) 67 | 68 | # scale 69 | if options.scale != 1: 70 | x = x*options.scale 71 | 72 | if options.min_norm: 73 | if min_factor is None: 74 | min_factor = x[x>0].min() 75 | # vals, counts = np.unique(x[x>0], return_counts=True) 76 | # mode_factor = vals[0] 77 | # mode_factor = np.clip(vals[0], 1/options.mode_norm_max, options.mode_norm_max) 78 | print('Min normalization factor: %f' % min_factor, file=sys.stderr) 79 | x /= min_factor 80 | 81 | # interpolate NaN 82 | if options.interp_nan: 83 | x = interp_nan(x) 84 | else: 85 | x = np.nan_to_num(x) 86 | 87 | # clip negative values 88 | if options.clip_zero: 89 | x = np.clip(x, 0, np.inf) 90 | 91 | # clip float16 min/max 92 | x = np.clip(x, np.finfo(np.float16).min, np.finfo(np.float16).max) 93 | 94 | # strip "chr" 95 | if options.chr_strip: 96 | chrom = chrom.replace('chr','') 97 | 98 | # write gzipped into HDF5 99 | x = x.astype('float16') 100 | h5_out.create_dataset(chrom, data=x, dtype='float16', compression='gzip', shuffle=True) 101 | 102 | # close files 103 | h5_out.close() 104 | bw_in.close() 105 | 106 | 107 | def interp_nan(x, kind='linear'): 108 | '''Linearly interpolate to fill NaN.''' 109 | 110 | # pad zeroes 111 | xp = np.zeros(len(x)+2) 112 | xp[1:-1] = x 113 | 114 | # find NaN 115 | x_nan = np.isnan(xp) 116 | 117 | if np.sum(x_nan) == 0: 118 | # unnecessary 119 | return x 120 | 121 | else: 122 | # interpolate 123 | inds = np.arange(len(xp)) 124 | interpolator = scipy.interpolate.interp1d( 125 | inds[~x_nan], 126 | xp[~x_nan], 127 | kind=kind, 128 | 
bounds_error=False) 129 | 130 | loc = np.where(x_nan) 131 | xp[loc] = interpolator(loc) 132 | 133 | # slice off pad 134 | return xp[1:-1] 135 | 136 | ################################################################################ 137 | # __main__ 138 | ################################################################################ 139 | if __name__ == '__main__': 140 | main() 141 | -------------------------------------------------------------------------------- /src/scripts/idx_genome.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import os 4 | import sys 5 | import pyfaidx 6 | 7 | ''' 8 | idx_genome.py 9 | 10 | Create .fai index file for input .fa. 11 | ''' 12 | 13 | ################################################################################ 14 | # main 15 | ################################################################################ 16 | def main(): 17 | usage = 'usage: %prog [options] <genome_fa>' 18 | parser = OptionParser(usage) 19 | (options, args) = parser.parse_args() 20 | 21 | if len(args) != 1: 22 | parser.error('Must provide input fasta file') 23 | else: 24 | genome_fa = args[0] 25 | 26 | pyfaidx.Faidx(genome_fa) 27 | 28 | ################################################################################ 29 | # __main__ 30 | ################################################################################ 31 | if __name__ == '__main__': 32 | main() 33 | -------------------------------------------------------------------------------- /src/scripts/util.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | #import pdb 4 | import operator, os, sys, subprocess, time 5 | 6 | ############################################################ 7 | # util 8 | # 9 | # Helpful methods that are difficult to categorize. 10 | ############################################################ 11 | 12 | ############################################################ 13 | # condorify 14 | ############################################################ 15 | def condorify(cmds): 16 | return ['runCmd -c "%s"' % c for c in cmds] 17 | 18 | ############################################################ 19 | # slurmify 20 | ############################################################ 21 | def slurmify(cmds, mem_mb=None): 22 | if mem_mb != None: 23 | mem_str = '--mem %d' % mem_mb 24 | else: 25 | mem_str = '' 26 | 27 | return ['srun -p general -n 1 %s "%s"' % (mem_str,c) for c in cmds] 28 | 29 | ############################################################ 30 | # exec_par 31 | # 32 | # Execute the commands in the list 'cmds' in parallel, but 33 | # only running 'max_proc' at a time. 
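# Example (illustrative): exec_par(['echo a', 'echo b', 'echo c'], max_proc=2, verbose=True) 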
34 | ############################################################ 35 | def exec_par(cmds, max_proc=None, verbose=False): 36 | total = len(cmds) 37 | finished = 0 38 | running = 0 39 | p = [] 40 | 41 | if max_proc == None: 42 | max_proc = len(cmds) 43 | 44 | if max_proc == 1: 45 | while finished < total: 46 | if verbose: 47 | print(cmds[finished], file=sys.stderr) 48 | op = subprocess.Popen(cmds[finished], shell=True) 49 | os.waitpid(op.pid, 0) 50 | finished += 1 51 | 52 | else: 53 | while finished + running < total: 54 | # launch jobs up to max 55 | while running < max_proc and finished+running < total: 56 | if verbose: 57 | print(cmds[finished+running], file=sys.stderr) 58 | p.append(subprocess.Popen(cmds[finished+running], shell=True)) 59 | # print('Running %d' % p[running].pid) 60 | running += 1 61 | 62 | # are any jobs finished 63 | new_p = [] 64 | for i in range(len(p)): 65 | # print('POLLING', i, p[i].poll()) 66 | if p[i].poll() != None: 67 | running -= 1 68 | finished += 1 69 | else: 70 | new_p.append(p[i]) 71 | 72 | # if none finished, sleep 73 | if len(new_p) == len(p): 74 | time.sleep(1) 75 | p = new_p 76 | 77 | # wait for all to finish 78 | for i in range(len(p)): 79 | p[i].wait() 80 | 81 | ############################################################ 82 | # slurm_par 83 | # 84 | # Execute the commands in the list 'cmds' in parallel on 85 | # SLURM, but only running 'max_proc' at a time. 86 | # 87 | # Doesn't work. Jobs are allocated resources, but won't run. 88 | # Also, I'd have to screen into login nodes, which 89 | # isn't great because I can't get back to them. 90 | ############################################################ 91 | def slurm_par(cmds, max_proc, queue='general', cpu=1, mem=None, out_files=None, err_files=None): 92 | # preprocess cmds 93 | if mem != None: 94 | mem_str = '--mem %d' % mem 95 | else: 96 | mem_str = '' 97 | 98 | if out_files != None: 99 | out_strs = ['-o %s' % of for of in out_files] 100 | else: 101 | out_strs = ['']*len(cmds) 102 | 103 | if err_files != None: 104 | err_strs = ['-e %s' % ef for ef in err_files] 105 | else: 106 | err_strs = ['']*len(cmds) 107 | 108 | slurm_cmds = ['srun -p %s -n %d %s %s %s "%s"' % (queue, cpu, mem_str, out_strs[i], err_strs[i], cmds[i]) for i in range(len(cmds))] 109 | 110 | exec_par(slurm_cmds, max_proc, verbose=True) 111 | 112 | 113 | ############################################################ 114 | # sort_dict 115 | # 116 | # Sort a dict by the values, returning a list of tuples 117 | ############################################################ 118 | def sort_dict(hash, reverse=False): 119 | return sorted(hash.items(), key=operator.itemgetter(1), reverse=reverse) 120 | 121 | -------------------------------------------------------------------------------- /src/scripts/w5_merge.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import os 4 | import sys 5 | 6 | import h5py 7 | import numpy as np 8 | 9 | ''' 10 | w5_merge.py 11 | 12 | Merge wig5 files using a specified summary statistic. 13 | ''' 14 | 15 | ################################################################################ 16 | # main 17 | ################################################################################ 18 | def main(): 19 | usage = 'usage: %prog [options] <out_w5> <in1_w5> <in2_w5> ...' 
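    # Example (illustrative): w5_merge.py -s mean -z merged.w5 rep1.w5 rep2.w5 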
20 | parser = OptionParser(usage) 21 | parser.add_option('-s', dest='sum_stat', 22 | default='sum', help='Summary statistic [Default: %default]') 23 | parser.add_option('-v', dest='verbose', 24 | default=False, action='store_true') 25 | parser.add_option('-w', dest='overwrite', 26 | default=False, action='store_true') 27 | parser.add_option('-z', dest='gzip', 28 | default=False, action='store_true') 29 | (options,args) = parser.parse_args() 30 | 31 | if len(args) < 3: 32 | parser.error('Must provide output and two or more input wig5.') 33 | else: 34 | out_w5_file = args[0] 35 | in_w5_files = args[1:] 36 | 37 | compression_args = {} 38 | if options.gzip: 39 | compression_args['compression'] = 'gzip' 40 | compression_args['shuffle'] = True 41 | 42 | # open input wig5 43 | in_w5_opens = [h5py.File(iwf, 'r') for iwf in in_w5_files] 44 | in_num = len(in_w5_opens) 45 | 46 | # take keys union 47 | in_keys = set() 48 | for in_w5_open in in_w5_opens: 49 | in_keys |= in_w5_open.keys() 50 | 51 | # open output file 52 | if os.path.isfile(out_w5_file) and not options.overwrite: 53 | parser.error('%s exists. Please remove.' % out_w5_file) 54 | out_w5_open = h5py.File(out_w5_file, 'w') 55 | 56 | for out_key in in_keys: 57 | if options.verbose: 58 | print(out_key) 59 | 60 | # initialize array 61 | for i in range(in_num): 62 | if out_key in in_w5_opens[i]: 63 | in_key_len = len(in_w5_opens[i][out_key]) 64 | break 65 | in_key_data = np.zeros((in_num,in_key_len), dtype='float32') 66 | 67 | # read data 68 | for i in range(in_num): 69 | if out_key in in_w5_opens[i]: 70 | in_key_data[i] = np.array(in_w5_opens[i][out_key]) 71 | else: 72 | print('%s missing %s' % (in_w5_files[i], out_key), file=sys.stderr) 73 | 74 | # summarize 75 | if options.sum_stat == 'sum': 76 | out_key_data = in_key_data.sum(axis=0) 77 | 78 | elif options.sum_stat == 'mean': 79 | out_key_data = in_key_data.mean(axis=0) 80 | 81 | elif options.sum_stat == 'geo-mean': 82 | in_key_data_log = np.log(in_key_data) 83 | in_key_data_log_mean = in_key_data_log.mean(axis=0) 84 | out_key_data = np.exp(in_key_data_log_mean) 85 | 86 | elif options.sum_stat == 'sqrt-mean': 87 | in_key_data_sqrt = in_key_data**0.5 88 | in_key_data_sqrt_mean = in_key_data_sqrt.mean(axis=0) 89 | out_key_data = in_key_data_sqrt_mean**2 90 | 91 | else: 92 | parser.error('Cannot identify summary statistic %s' % options.sum_stat) 93 | 94 | # carefully decrease resolution 95 | out_key_data = np.clip(out_key_data, np.finfo(np.float16).min, np.finfo(np.float16).max) 96 | out_key_data = out_key_data.astype('float16') 97 | 98 | # write 99 | out_w5_open.create_dataset(out_key, data=out_key_data, 100 | dtype='float16', **compression_args) 101 | 102 | out_w5_open.close() 103 | 104 | 105 | 106 | ################################################################################ 107 | # __main__ 108 | ################################################################################ 109 | if __name__ == '__main__': 110 | main() 111 | -------------------------------------------------------------------------------- /src/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calico/borzoi/77670d9b0f940ff0a4b2b0edbdff445bbaefc6ed/src/tests/__init__.py -------------------------------------------------------------------------------- /src/tests/test_dummy.py: -------------------------------------------------------------------------------- 1 | def test_dummy(): 2 | pass 3 | 
-------------------------------------------------------------------------------- /tutorials/latest/analyze_sv/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Shift augmentation for improved indel scoring in DNA sequence-based ML models 3 | This repository contains example analyses related to indels, structural variants, and tandem repeats. The manuscript is available here:
4 | 5 | "Shift augmentation for improved indel scoring in DNA sequence-based ML models" - bioRxiv link. 6 | 7 | Contact *drk (at) calicolabs.com* or *anya (at) calicolabs.com* for questions. 8 | 9 | ## Indel / structural variant effect visualization 10 | 11 | Please follow the installation steps on the main page. This code depends on the [baskerville](https://github.com/calico/baskerville.git) library and on plotly. 12 | Install plotly into the working environment: 13 | 14 | ```sh 15 | pip install plotly 16 | ``` 17 | 18 | After you've installed baskerville, download the dependencies for the SV visualization example and run the example script: 19 | 20 | ```sh 21 | bash download_dependencies_SV.sh 22 | bash analyze_indel.sh 23 | ``` 24 | 25 | This will plot one indel/SV provided in .vcf format. The script currently handles only one variant per run, so make sure your .vcf contains exactly one variant. 26 | Interactive plots for each available GTEx tissue and across all GTEx tissues will be put in the specified output directory. 27 | 28 | ## Tandem repeat scoring 29 | 30 | This script will analyze the effect of tandem repeats by reducing and extending the specified short tandem repeat in the reference genome, then performing linear 31 | regression over the log2FC of the gene expression of interest (see the illustrative regression sketch after the STR table below). A tiny STR table (subset of the result obtained in [this paper](https://www.nature.com/articles/s41588-019-0521-9)) 32 | is provided in the data folder. 33 | 34 | ```sh 35 | bash download_dependencies_STR.sh 36 | bash score_STRs.sh 37 | ``` -------------------------------------------------------------------------------- /tutorials/latest/analyze_sv/analyze_indel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python analyze_vcf.py --vcf data/chr6_41897087_SV.vcf \ 4 | --fasta data/hg38.fa \ 5 | --model data/model \ 6 | --params data/params.json \ 7 | --targets data/targets.txt \ 8 | --gencode data/gencode41_basic_exons.bed \ 9 | --output_dir temp \ 10 | --fig_width 1000 11 | -------------------------------------------------------------------------------- /tutorials/latest/analyze_sv/data/STR.csv: -------------------------------------------------------------------------------- 1 | chrom,str.start,str.end,gene,gene.name,num.e,beta,tissue_info,pval,score,str.motif.forward,str.motif.reverse,tissue_list,score_concord,max_tissue,num_tissues,num_motifs,motif_coords_0,start_partial,end_partial,tissues,vcf,repeats 2 | chr1,28250549,28250559,ENSG00000130768,SMPDL3B,4,0.702792765,Adipose-Subcutaneous_0.29_0.01;Esophagus-Mucosa_0.53_0.51;Esophagus-Muscularis_0.24_0.02;Lung_0.70_1.00,1.0100000000000001e-35,1.0,A,T,"['Adipose-Subcutaneous', 'Esophagus-Mucosa', 'Esophagus-Muscularis', 'Lung']",True,Lung,4,11,"[(28250548, 28250549), (28250549, 28250550), (28250550, 28250551), (28250551, 28250552), (28250552, 28250553), (28250553, 28250554), (28250554, 28250555), (28250555, 28250556), (28250556, 28250557), (28250557, 28250558), (28250558, 28250559)]",False,False,"Adipose-Subcutaneous,Esophagus-Mucosa,Esophagus-Muscularis,Lung",chr1_28250548,"7,8,9,10,12,13,14,15,16,17,18,19" 3 | chr10,71984970,71984992,ENSG00000042286,AIFM2,1,-0.444886046,Esophagus-Mucosa_-0.44_1.00,1.19e-13,1.0,ATTT,AAAT,['Esophagus-Mucosa'],True,Esophagus-Mucosa,1,5,"[(71984971, 71984975), (71984975, 71984979), (71984979, 71984983), (71984983, 71984987), (71984987, 71984991)]",True,True,Esophagus-Mucosa,chr10_71984971,"1,2,3,4,6,7,8,9,10,11,12,13" 4 | 
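The regression step referenced in the README above can be made concrete with a minimal sketch: fit a line through predicted log2 fold-changes of gene expression as a function of STR repeat count. This is illustrative only (the full pipeline lives in score_tandem_repeats.py); the repeat counts below mirror the `repeats` column of the STR table, and the log2FC values are hypothetical stand-ins for model scores.

```python
# Minimal, illustrative sketch of the STR regression idea -- not the
# score_tandem_repeats.py implementation. The repeat counts mirror the
# 'repeats' column above; the log2FC values are hypothetical model scores.
import numpy as np
from scipy import stats

repeat_counts = np.array([1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13])
log2_fc = np.array([-0.41, -0.30, -0.22, -0.17, 0.02, 0.10,
                    0.13, 0.25, 0.28, 0.40, 0.41, 0.55])

# the slope quantifies the expression response to repeat expansion/contraction
slope, intercept, r_value, p_value, stderr = stats.linregress(repeat_counts, log2_fc)
print("slope=%.3f r=%.3f p=%.2e" % (slope, r_value, p_value))
```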
-------------------------------------------------------------------------------- /tutorials/latest/analyze_sv/data/chr6_41897087_SV.vcf: -------------------------------------------------------------------------------- 1 | chr6 41897088 chr6_41897088_GTTGGAGGTTGCAGTGAGCTGAGATCGTGCCACAGCACTCCAGCCTGGCAACGGAGTGAGACTCCATCTCAAAAAAAAAAAAAAAAAAAAAAAAGTGTCGCCTGGAAAGGCCTAGGGATCTCTGAGACCCTTTGGGCTGGGGGGATAGTGGGGTGCCTGAGATCAAAACGATTTTCCTAATAATACTGAGACATATCTGCATTGTCACTGTGATGATATTTGCACAATGATACAAAAGTAGCAATGGGTAAAACTGCTGCCTTAGCACAAATCAAGGCAACTGCACCAAGTTGTGCTAGAGGTCAAGGTATTCTTCACTGCTACAGTAAAAAAACACCTGTTTCAGGCCGGATGGGTGCAGTGGCTCACACCTGTAATCCCAACACTTTGGGAGGCCAAGGCAGGTGGATCACTTGAGGTCAGGAATTCGAGACCAGCCTGGCCAACATGGTGAAACCCCTCTCTCTACTAAAAATACAGAAATTAGCTGGGCGTGGTGGCACGCACCTGTAATCCCAGCTACTCGGGAAGCTGAGGCA_G_b38 GTTGGAGGTTGCAGTGAGCTGAGATCGTGCCACAGCACTCCAGCCTGGCAACGGAGTGAGACTCCATCTCAAAAAAAAAAAAAAAAAAAAAAAAGTGTCGCCTGGAAAGGCCTAGGGATCTCTGAGACCCTTTGGGCTGGGGGGATAGTGGGGTGCCTGAGATCAAAACGATTTTCCTAATAATACTGAGACATATCTGCATTGTCACTGTGATGATATTTGCACAATGATACAAAAGTAGCAATGGGTAAAACTGCTGCCTTAGCACAAATCAAGGCAACTGCACCAAGTTGTGCTAGAGGTCAAGGTATTCTTCACTGCTACAGTAAAAAAACACCTGTTTCAGGCCGGATGGGTGCAGTGGCTCACACCTGTAATCCCAACACTTTGGGAGGCCAAGGCAGGTGGATCACTTGAGGTCAGGAATTCGAGACCAGCCTGGCCAACATGGTGAAACCCCTCTCTCTACTAAAAATACAGAAATTAGCTGGGCGTGGTGGCACGCACCTGTAATCCCAGCTACTCGGGAAGCTGAGGCA G . . 2 | -------------------------------------------------------------------------------- /tutorials/latest/analyze_sv/download_dependencies_STR.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # create additional folder in borzoi data folders 4 | mkdir -p "data/model" 5 | mkdir -p "data/model/f0" 6 | mkdir -p "data/model/f1" 7 | mkdir -p "data/model/f2" 8 | mkdir -p "data/model/f3" 9 | 10 | # download dependencies and the model 11 | if [ -f "data/hg19.fa" ]; then 12 | echo "hg19.fa already exists." 13 | else 14 | wget -O - "ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/bigZips/hg19.fa.gz" | gunzip -c > "data/hg19.fa" 15 | fi 16 | 17 | if [ -f "data/gencode41_lift37_exons.bed" ]; then 18 | echo "gencode41_lift37_exons.bed already exists." 19 | else 20 | wget -O - "https://storage.googleapis.com/seqnn-share/helper/gencode41_lift37_exons.bed.gz" | gunzip -c > "data/gencode41_lift37_exons.bed" 21 | fi 22 | 23 | if [ -f "data/model/f0/model0_best.h5" ]; then 24 | echo "f0/model0_best.h5 already exists." 25 | else 26 | wget "https://storage.googleapis.com/seqnn-share/borzoi/f0/model0_best.h5" -O "data/model/f0/model0_best.h5" 27 | fi 28 | 29 | if [ -f "data/model/f1/model0_best.h5" ]; then 30 | echo "f1/model0_best.h5 already exists." 31 | else 32 | wget "https://storage.googleapis.com/seqnn-share/borzoi/f1/model0_best.h5" -O "data/model/f1/model0_best.h5" 33 | fi 34 | 35 | if [ -f "data/model/f2/model0_best.h5" ]; then 36 | echo "f2/model0_best.h5 already exists." 37 | else 38 | wget "https://storage.googleapis.com/seqnn-share/borzoi/f2/model0_best.h5" -O "data/model/f2/model0_best.h5" 39 | fi 40 | 41 | if [ -f "data/model/f3/model0_best.h5" ]; then 42 | echo "f3/model0_best.h5 already exists." 43 | else 44 | wget "https://storage.googleapis.com/seqnn-share/borzoi/f3/model0_best.h5" -O "data/model/f3/model0_best.h5" 45 | fi 46 | 47 | if [ -f "data/targets.txt" ]; then 48 | echo "targets.txt already exists." 49 | else 50 | wget "https://storage.googleapis.com/seqnn-share/borzoi/hg38/targets.txt" -O "data/targets.txt" 51 | fi 52 | 53 | if [ -f "data/params.json" ]; then 54 | echo "params.json already exists." 
55 | else 56 | wget "https://storage.googleapis.com/seqnn-share/borzoi/params.json" -O "data/params.json" 57 | fi 58 | -------------------------------------------------------------------------------- /tutorials/latest/analyze_sv/download_dependencies_SV.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # create additional folder in borzoi data folders 4 | mkdir -p "data/model" 5 | mkdir -p "data/model/f0" 6 | mkdir -p "data/model/f1" 7 | mkdir -p "data/model/f2" 8 | mkdir -p "data/model/f3" 9 | 10 | # download dependencies and the model 11 | if [ -f "data/hg38.fa" ]; then 12 | echo "hg38.fa already exists." 13 | else 14 | wget -O - "ftp://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz" | gunzip -c > "data/hg38.fa" 15 | fi 16 | 17 | if [ -f "data/gencode41_basic_exons.bed" ]; then 18 | echo "gencode41_basic_exons.bed already exists." 19 | else 20 | wget -O - "https://storage.googleapis.com/seqnn-share/helper/gencode41_basic_exons.bed.gz" | gunzip -c > "data/gencode41_basic_exons.bed" 21 | fi 22 | 23 | if [ -f "data/model/f0/model0_best.h5" ]; then 24 | echo "f0/model0_best.h5 already exists." 25 | else 26 | wget "https://storage.googleapis.com/seqnn-share/borzoi/f0/model0_best.h5" -O "data/model/f0/model0_best.h5" 27 | fi 28 | 29 | if [ -f "data/model/f1/model0_best.h5" ]; then 30 | echo "f1/model0_best.h5 already exists." 31 | else 32 | wget "https://storage.googleapis.com/seqnn-share/borzoi/f1/model0_best.h5" -O "data/model/f1/model0_best.h5" 33 | fi 34 | 35 | if [ -f "data/model/f2/model0_best.h5" ]; then 36 | echo "f2/model0_best.h5 already exists." 37 | else 38 | wget "https://storage.googleapis.com/seqnn-share/borzoi/f2/model0_best.h5" -O "data/model/f2/model0_best.h5" 39 | fi 40 | 41 | if [ -f "data/model/f3/model0_best.h5" ]; then 42 | echo "f3/model0_best.h5 already exists." 43 | else 44 | wget "https://storage.googleapis.com/seqnn-share/borzoi/f3/model0_best.h5" -O "data/model/f3/model0_best.h5" 45 | fi 46 | 47 | if [ -f "data/targets.txt" ]; then 48 | echo "targets.txt already exists." 49 | else 50 | wget "https://storage.googleapis.com/seqnn-share/borzoi/hg38/targets.txt" -O "data/targets.txt" 51 | fi 52 | 53 | if [ -f "data/params.json" ]; then 54 | echo "params.json already exists." 
55 | else 56 | wget "https://storage.googleapis.com/seqnn-share/borzoi/params.json" -O "data/params.json" 57 | fi 58 | -------------------------------------------------------------------------------- /tutorials/latest/analyze_sv/save_STR_vcf.py: -------------------------------------------------------------------------------- 1 | 2 | import re 3 | import os 4 | import argparse 5 | import numpy as np 6 | import pandas as pd 7 | from Bio import SeqIO 8 | 9 | 10 | def strip_tissue(tissues): 11 | tissue_list = [] 12 | for tissue in tissues: 13 | tissue_new = tissue.split("_")[0] 14 | tissue_list.append(tissue_new) 15 | return tissue_list 16 | 17 | def strip_score(tissues): 18 | score_list = [] 19 | for tissue in tissues: 20 | score = tissue.split("_")[1] 21 | score_list.append(float(score)) 22 | if len(score_list) == 1: 23 | return True 24 | else: 25 | # check if signs of all scores are the same 26 | if all(x > 0 for x in score_list) or all(x < 0 for x in score_list): 27 | return True 28 | else: 29 | return False 30 | 31 | # find tissue with the highest score 32 | def max_tissue(tissues): 33 | score_list = [] 34 | for tissue in tissues: 35 | score = tissue.split("_")[1] 36 | score_list.append(float(score)) 37 | max_index = score_list.index(max(score_list)) 38 | tissue_clean = tissues[max_index].split("_")[0] 39 | return tissue_clean 40 | 41 | # find motif occurence numbers with regex 42 | def find_motif(seq_dict, coords, motif): 43 | seq_to_search = seq_dict[coords[0]][coords[1]:coords[2]].upper() 44 | motif_dict = [] 45 | if len(motif)>1: 46 | matches = re.finditer(motif, seq_to_search) 47 | for match in matches: 48 | start = match.start() 49 | end = match.end() 50 | motif_dict.append((coords[1]+start, coords[1]+end)) 51 | else: 52 | if seq_to_search==motif*len(seq_to_search): 53 | for i in range(len(seq_to_search)): 54 | motif_dict.append((coords[1]+i, coords[1]+i+1)) 55 | 56 | return motif_dict 57 | 58 | 59 | def save_to_vcf(df, seq_dict, args): 60 | 61 | reduce_motifs = args.reduce 62 | extend_motifs = args.extend 63 | 64 | if not os.path.exists(args.output_dir): 65 | os.makedirs(args.output_dir) 66 | 67 | names_vcf = [] 68 | arr_repeats = [] 69 | 70 | for index, row in df.iterrows(): 71 | 72 | chrom = row['chrom'] 73 | start = row['str.start']-1 74 | end = row['str.end'] 75 | num_motifs = row['num_motifs'] 76 | first_start = row['motif_coords_0'][0][0] 77 | first_end = row['motif_coords_0'][0][1] 78 | last_end = row['motif_coords_0'][-1][1] 79 | motif_coords = row['motif_coords_0'] 80 | partial_start = row['start_partial'] 81 | partial_end = row['end_partial'] 82 | 83 | ref_allele_full = seq_dict[chrom][start:end].upper() 84 | motif = row['str.motif.forward'].upper() 85 | 86 | range_repeats = [] 87 | if num_motifs-reduce_motifs>1: 88 | range_repeats.extend(np.arange(num_motifs-reduce_motifs, num_motifs)) 89 | else: 90 | range_repeats.extend(np.arange(1, num_motifs)) 91 | range_repeats.extend(np.arange(num_motifs+1, num_motifs+extend_motifs)) 92 | 93 | for repeat in range_repeats: 94 | # if number of repeats is less than num_motifs, it's a deletion 95 | if repeat0.25 and betas concordant between tissues 143 | df = df[df['score']>0.25] 144 | df = df[df['score_concord']==True] 145 | 146 | # dictionary to store hg19 sequences 147 | seq_dict = {} 148 | 149 | with open(args.fasta, mode="r") as handle: 150 | # process each record in .fa file if there's more than one 151 | for record in SeqIO.parse(handle, "fasta"): 152 | identifier = record.id 153 | sequence = record.seq 154 | 
seq_dict[identifier] = str(sequence)
155 | 
156 |     # parse sequences chrom:start-end from hg19
157 |     num_motifs, motif_coords, start_partial, end_partial = [], [], [], []
158 | 
159 |     for index, row in df.iterrows():
160 |         chrom = row['chrom']
161 |         start = row['str.start']-1
162 |         end = row['str.end']
163 |         coords = (chrom, start, end)
164 |         motif = row['str.motif.forward'].upper()
165 |         motif_dict = find_motif(seq_dict, coords, motif)
166 |         if len(motif_dict)>0:
167 |             if motif_dict[0][0]==start:
168 |                 start_partial.append(False)
169 |             else:
170 |                 start_partial.append(True)
171 |             if motif_dict[-1][1]==end:
172 |                 end_partial.append(False)
173 |             else:
174 |                 end_partial.append(True)
175 |         else:
176 |             start_partial.append(False)
177 |             end_partial.append(False)
178 |         num_motifs.append(len(motif_dict))
179 |         motif_coords.append(motif_dict)
180 | 
181 |     df['num_motifs'], df['motif_coords_0'], df['start_partial'], df['end_partial'] = num_motifs, motif_coords, start_partial, end_partial
182 | 
183 |     # filter and retain only rows with >0 motifs
184 |     df = df[df['num_motifs']>0]
185 |     df['tissues'] = [','.join(x) for x in df['tissue_list']]
186 | 
187 |     # save to vcf
188 |     save_to_vcf(df, seq_dict, args)
189 | 
190 | 
191 | if __name__ == "__main__":
192 |     main()
-------------------------------------------------------------------------------- /tutorials/latest/analyze_sv/score_STRs.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | #python save_STR_vcf.py --input data/STR.csv \
4 | #    --fasta data/hg19.fa \
5 | #    --output_dir data/vcfs_STR
6 | 
7 | python score_tandem_repeats.py --table data/STR.csv \
8 |     --input data/vcfs_STR \
9 |     --fasta data/hg19.fa \
10 |     --model data/model \
11 |     --params data/params.json \
12 |     --targets data/targets.txt \
13 |     --gencode data/gencode41_lift37_exons.bed \
14 |     --output_dir out_STR
15 | 
-------------------------------------------------------------------------------- /tutorials/latest/interpret_sequence/README.md: --------------------------------------------------------------------------------
1 | ## Interpretation
2 | 
3 | This tutorial describes how to compute gradient saliency scores (sequence attributions) with respect to various statistics computed for a list of input genes specified in a .gtf file. This example relies on the Mini Borzoi model trained on sample K562 RNA-seq data from the [train_model tutorial](https://github.com/calico/borzoi/tree/main/tutorials/latest/train_model), which is clearly a much weaker model than the pre-trained, published Borzoi model.
4 | 
5 | To compute input gradients with respect to the log-sum of coverage across the exons of the example gene HBE1, run the script 'run_gradients_expr_HBE1.sh' (a minimal sketch of the statistic being differentiated follows the notes below).
6 | ```sh
7 | conda activate borzoi_py310
8 | cd ~/borzoi/tutorials/latest/interpret_sequence
9 | ./run_gradients_expr_HBE1.sh
10 | ```
11 | 
12 | *Notes*:
13 | - The track scale, squashing exponentiation, and clip-soft threshold are specified in the .py script arguments (flags: '--track_scale', '--track_transform', '--clip_soft'), and the values in the targets file are ignored. This means that the same data transformation parameters are applied to all tracks specified in the targets file. To calculate gradients for groups of tracks with different data transforms, separate these tracks into different targets files and execute the gradient script on each group separately.
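
As a concrete illustration of the statistic being differentiated, below is a minimal, self-contained TensorFlow sketch of "gradient of the log-sum of exon coverage with respect to the one-hot input". This is not the 'borzoi_satg_gene.py' implementation: the toy model, sequence length, and exon bin range are placeholders chosen only to make the sketch runnable.
```python
import tensorflow as tf

seq_len, n_bins, n_tracks = 8192, 256, 2  # toy sizes; Borzoi uses 393216 bp inputs

# stand-in for a trained Borzoi model: one-hot DNA in, binned coverage out
toy_model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(seq_len, 4)),
    tf.keras.layers.Conv1D(8, 11, padding="same", activation="gelu"),
    tf.keras.layers.MaxPooling1D(pool_size=seq_len // n_bins),
    tf.keras.layers.Dense(n_tracks, activation="softplus"),
])

# random one-hot sequence and a hypothetical set of bins overlapping the gene's exons
x = tf.one_hot(tf.random.uniform([1, seq_len], 0, 4, dtype=tf.int32), depth=4)
exon_bins = tf.range(96, 160)

with tf.GradientTape() as tape:
    tape.watch(x)                                       # x is a constant, so watch it explicitly
    pred = toy_model(x)                                 # (1, n_bins, n_tracks)
    exon_cov = tf.gather(pred, exon_bins, axis=1)       # coverage restricted to exon bins
    stat = tf.math.log(tf.reduce_sum(exon_cov) + 1e-6)  # log-sum of exon coverage

saliency = tape.gradient(stat, x)                       # (1, seq_len, 4) per-base scores
```
The actual script additionally handles strand averaging (e.g. the '--rc' flag) and writes the resulting scores to an .h5 file, which the accompanying notebook then loads.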
14 | -------------------------------------------------------------------------------- /tutorials/latest/interpret_sequence/explore_grads_k562_HBE1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "7030e9ad", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import sys\n", 11 | "import os\n", 12 | "import numpy as np\n", 13 | "import pandas as pd\n", 14 | "\n", 15 | "import h5py\n", 16 | "\n", 17 | "import matplotlib.pyplot as plt\n", 18 | "from scipy.ndimage import gaussian_filter1d\n", 19 | "\n", 20 | "from vis_helpers import *\n" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "id": "3bcaea3d", 27 | "metadata": {}, 28 | "outputs": [ 29 | { 30 | "name": "stdout", 31 | "output_type": "stream", 32 | "text": [ 33 | "scores_hyp.shape = (1, 1, 393216, 4)\n", 34 | "scores.shape = (1, 1, 393216, 4)\n" 35 | ] 36 | }, 37 | { 38 | "data": { 39 | "text/plain": [ 40 | "0" 41 | ] 42 | }, 43 | "execution_count": 2, 44 | "metadata": {}, 45 | "output_type": "execute_result" 46 | } 47 | ], 48 | "source": [ 49 | "#Load scores for the selected set of targets (grad)\n", 50 | "\n", 51 | "import gc\n", 52 | "\n", 53 | "seqs = None\n", 54 | "strands = None\n", 55 | "chrs = None\n", 56 | "starts = None\n", 57 | "ends = None\n", 58 | "genes = None\n", 59 | "\n", 60 | "all_scores_hyp = []\n", 61 | "all_scores = []\n", 62 | "\n", 63 | "gtex_tissues = ['liver']\n", 64 | "\n", 65 | "#Load score file\n", 66 | "score_file = h5py.File('k562_HBE1/scores_f0c0.h5', 'r')\n", 67 | "\n", 68 | "#Get scores and onehots\n", 69 | "scores = score_file['grads'][()][..., 0]\n", 70 | "seqs = score_file['seqs'][()]\n", 71 | "\n", 72 | "#Get auxiliary information\n", 73 | "strands = score_file['strand'][()]\n", 74 | "strands = np.array([strands[j].decode() for j in range(strands.shape[0])])\n", 75 | "\n", 76 | "chrs = score_file['chr'][()]\n", 77 | "chrs = np.array([chrs[j].decode() for j in range(chrs.shape[0])])\n", 78 | "\n", 79 | "starts = np.array(score_file['start'][()])\n", 80 | "ends = np.array(score_file['end'][()])\n", 81 | "\n", 82 | "genes = score_file['gene'][()]\n", 83 | "genes = np.array([genes[j].decode().split(\".\")[0] for j in range(genes.shape[0])])\n", 84 | "\n", 85 | "#Append hypothetical scores\n", 86 | "all_scores_hyp.append(scores[None, ...])\n", 87 | "\n", 88 | "#Append input-gated scores\n", 89 | "all_scores.append((scores * seqs)[None, ...])\n", 90 | "\n", 91 | "#Collect garbage\n", 92 | "gc.collect()\n", 93 | "\n", 94 | "#Collect final scores\n", 95 | "scores_hyp = np.concatenate(all_scores_hyp, axis=0)\n", 96 | "scores = np.concatenate(all_scores, axis=0)\n", 97 | "\n", 98 | "print(\"scores_hyp.shape = \" + str(scores_hyp.shape))\n", 99 | "print(\"scores.shape = \" + str(scores.shape))\n", 100 | "\n", 101 | "score_file = None\n", 102 | "\n", 103 | "#Collect garbage\n", 104 | "gc.collect()\n" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "id": "955bf762", 111 | "metadata": { 112 | "scrolled": false 113 | }, 114 | "outputs": [], 115 | "source": [ 116 | "#Enumerate and visualize attributions; k562 example HBE1\n", 117 | "\n", 118 | "save_index = []\n", 119 | "\n", 120 | "#Visualization parameters\n", 121 | "logo_width = 192\n", 122 | "\n", 123 | "top_n = 1\n", 124 | "\n", 125 | "use_gaussian = True\n", 126 | "min_padding = 65536\n", 127 | "gaussian_sigma = 8\n", 128 | "local_window = 1024\n", 129 | 
"\n", 130 | "main_tissue_ix = 0\n", 131 | "\n", 132 | "tissue_colors = ['darkblue']\n", 133 | "\n", 134 | "#Loop over examples\n", 135 | "for example_ix in range(top_n) :\n", 136 | " \n", 137 | " print(\"-- Example = \" + str(example_ix)+ \" --\")\n", 138 | " \n", 139 | " print(\" - \" + genes[example_ix] + \"(\" + str(strands[example_ix]) + \")\")\n", 140 | " print(\" - \" + chrs[example_ix] + \":\" + str(starts[example_ix]) + \"-\" + str(ends[example_ix]))\n", 141 | "\n", 142 | " #Grad analysis\n", 143 | " \n", 144 | " #Calculate min and max scores globally (for scales)\n", 145 | " min_val = np.min(scores[:, example_ix, ...])\n", 146 | " max_val = np.max(scores[:, example_ix, ...])\n", 147 | " \n", 148 | " print(\" -- min_val = \" + str(round(min_val, 4)))\n", 149 | " print(\" -- max_val = \" + str(round(max_val, 4)))\n", 150 | " \n", 151 | " max_abs_val = max(np.abs(min_val), np.abs(max_val))\n", 152 | "\n", 153 | " min_val -= 0.1 * max_abs_val\n", 154 | " max_val += 0.1 * max_abs_val\n", 155 | "\n", 156 | " print(\" - (Gradient score profiles per tissue) - \")\n", 157 | " \n", 158 | " #Gradient profiles across input sequence\n", 159 | " f, ax = plt.subplots(len(gtex_tissues), 1, figsize=(8, len(gtex_tissues) * 1.5))\n", 160 | " \n", 161 | " if len(gtex_tissues) == 1 :\n", 162 | " ax = [ax]\n", 163 | "\n", 164 | " #Loop over tissues\n", 165 | " for tissue_ix in range(len(gtex_tissues)) :\n", 166 | "\n", 167 | " #Get tissue scores\n", 168 | " score = scores[tissue_ix, example_ix, ...]\n", 169 | "\n", 170 | " l1 = ax[tissue_ix].plot(np.arange(seqs.shape[1]), np.sum(score, axis=-1), linewidth=1, linestyle='-', color=tissue_colors[tissue_ix], label=gtex_tissues[tissue_ix])\n", 171 | " \n", 172 | " plt.sca(ax[tissue_ix])\n", 173 | " \n", 174 | " plt.xlim(0, seqs.shape[1])\n", 175 | " plt.ylim(min_val, max_val)\n", 176 | " \n", 177 | " plt.legend(handles=[l1[0]], fontsize=8)\n", 178 | " \n", 179 | " plt.yticks([], [])\n", 180 | " plt.xticks([], [])\n", 181 | " \n", 182 | " plt.sca(ax[0])\n", 183 | " plt.title(\"Gradient Saliency for gene = '\" + genes[example_ix] + \"' (\" + str(strands[example_ix]) + \")\", fontsize=8)\n", 184 | " \n", 185 | " plt.sca(ax[len(gtex_tissues)-1])\n", 186 | " plt.xlabel(chrs[example_ix] + \":\" + str(starts[example_ix]) + \"-\" + str(ends[example_ix]), fontsize=8)\n", 187 | " \n", 188 | " plt.sca(plt.gca())\n", 189 | " plt.tight_layout()\n", 190 | " \n", 191 | " plt.show()\n", 192 | "\n", 193 | " #Apply gaussian filter\n", 194 | " smooth_score = np.sum(scores[main_tissue_ix, example_ix, ...], axis=-1)\n", 195 | " if use_gaussian :\n", 196 | " smooth_score = gaussian_filter1d(smooth_score.astype('float32'), sigma=gaussian_sigma, truncate=2).astype('float16')\n", 197 | " \n", 198 | " #Calculate min/max positions and (differential) values\n", 199 | " #max_pos = np.argmax(smooth_score[min_padding:-min_padding]) + min_padding\n", 200 | " \n", 201 | " max_pos = np.argmax(smooth_score[min_padding:-min_padding]) + min_padding\n", 202 | "\n", 203 | " print(\" - (Attribution at position of Max positive differential saliency) -\")\n", 204 | "\n", 205 | " print(\" - max_pos (rel) = \" + str(max_pos))\n", 206 | " print(\" - max_pos (abs) = \" + str(starts[example_ix] + max_pos))\n", 207 | " \n", 208 | " #Visualize contribution scores\n", 209 | " plot_start = max_pos - logo_width // 2\n", 210 | " plot_end = max_pos + logo_width // 2\n", 211 | " \n", 212 | " print(\" - \" + chrs[example_ix] + \":\" + str(starts[example_ix] + max_pos - logo_width // 2) + \"-\" + 
str(starts[example_ix] + max_pos + logo_width // 2))\n", 213 | "\n", 214 | " #Logo min/max value across tissues\n", 215 | " min_logo_val = np.min(scores[:, example_ix, plot_start:plot_end, :])\n", 216 | " max_logo_val = np.max(scores[:, example_ix, plot_start:plot_end, :])\n", 217 | "\n", 218 | " max_abs_logo_val = max(np.abs(min_logo_val), np.abs(max_logo_val))\n", 219 | "\n", 220 | " min_logo_val -= 0.02 * max_abs_logo_val\n", 221 | " max_logo_val += 0.02 * max_abs_logo_val\n", 222 | "\n", 223 | " print(\" - y_min = \" + str(round(min_logo_val, 8)))\n", 224 | " print(\" - y_max = \" + str(round(max_logo_val, 8)))\n", 225 | "\n", 226 | " #Loop over tissues\n", 227 | " for tissue_ix in range(len(gtex_tissues)) :\n", 228 | " print(gtex_tissues[tissue_ix])\n", 229 | "\n", 230 | " #Get tissue-specific scores\n", 231 | " score = scores[tissue_ix, example_ix, plot_start:plot_end, :]\n", 232 | "\n", 233 | " #Plot scores as sequence logo\n", 234 | " plot_seq_scores(\n", 235 | " score,\n", 236 | " y_min=min_logo_val,\n", 237 | " y_max=max_logo_val,\n", 238 | " figsize=(8, 1),\n", 239 | " plot_y_ticks=False,\n", 240 | " )\n", 241 | " \n", 242 | " print(\"--------------------\")\n", 243 | " print(\"\")\n" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "id": "67a3cf9d", 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [] 253 | } 254 | ], 255 | "metadata": { 256 | "kernelspec": { 257 | "display_name": "Python 3 (ipykernel)", 258 | "language": "python", 259 | "name": "python3" 260 | }, 261 | "language_info": { 262 | "codemirror_mode": { 263 | "name": "ipython", 264 | "version": 3 265 | }, 266 | "file_extension": ".py", 267 | "mimetype": "text/x-python", 268 | "name": "python", 269 | "nbconvert_exporter": "python", 270 | "pygments_lexer": "ipython3", 271 | "version": "3.8.15" 272 | } 273 | }, 274 | "nbformat": 4, 275 | "nbformat_minor": 5 276 | } 277 | -------------------------------------------------------------------------------- /tutorials/latest/interpret_sequence/run_gradients_expr_HBE1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | borzoi_satg_gene.py -o k562_HBE1 -f 0 -c 0 --rc --track_scale 0.3 --track_transform 0.5 --clip_soft 384.0 -t ../make_data/targets_human.txt ../train_model/params_mini.json ../train_model/mini_models HBE1_example.gtf 4 | -------------------------------------------------------------------------------- /tutorials/latest/interpret_sequence/vis_helpers.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import numpy as np 4 | 5 | import matplotlib.pyplot as plt 6 | 7 | import matplotlib.cm as cm 8 | import matplotlib.colors as colors 9 | 10 | import matplotlib as mpl 11 | from matplotlib.text import TextPath 12 | from matplotlib.patches import PathPatch, Rectangle 13 | from matplotlib.font_manager import FontProperties 14 | from matplotlib import gridspec 15 | from matplotlib.ticker import FormatStrFormatter 16 | 17 | #Helper function to draw a letter at a given position 18 | def dna_letter_at(letter, x, y, yscale=1, ax=None, color=None, alpha=1.0): 19 | 20 | fp = FontProperties(family="DejaVu Sans", weight="bold") 21 | globscale = 1.35 22 | LETTERS = { "T" : TextPath((-0.305, 0), "T", size=1, prop=fp), 23 | "G" : TextPath((-0.384, 0), "G", size=1, prop=fp), 24 | "A" : TextPath((-0.35, 0), "A", size=1, prop=fp), 25 | "C" : TextPath((-0.366, 0), "C", size=1, prop=fp), 26 | "UP" : TextPath((-0.488, 0), 
'$\\Uparrow$', size=1, prop=fp), 27 | "DN" : TextPath((-0.488, 0), '$\\Downarrow$', size=1, prop=fp), 28 | "(" : TextPath((-0.25, 0), "(", size=1, prop=fp), 29 | "." : TextPath((-0.125, 0), "-", size=1, prop=fp), 30 | ")" : TextPath((-0.1, 0), ")", size=1, prop=fp)} 31 | COLOR_SCHEME = {'G': 'orange',#'orange', 32 | 'A': 'green',#'red', 33 | 'C': 'blue',#'blue', 34 | 'T': 'red',#'darkgreen', 35 | 'UP': 'green', 36 | 'DN': 'red', 37 | '(': 'black', 38 | '.': 'black', 39 | ')': 'black'} 40 | 41 | 42 | text = LETTERS[letter] 43 | 44 | chosen_color = COLOR_SCHEME[letter] 45 | if color is not None : 46 | chosen_color = color 47 | 48 | t = mpl.transforms.Affine2D().scale(1*globscale, yscale*globscale) + \ 49 | mpl.transforms.Affine2D().translate(x,y) + ax.transData 50 | p = PathPatch(text, lw=0, fc=chosen_color, alpha=alpha, transform=t) 51 | if ax != None: 52 | ax.add_artist(p) 53 | return p 54 | 55 | #Function to plot sequence logo 56 | def plot_seq_scores(importance_scores, figsize=(16, 2), plot_y_ticks=True, y_min=None, y_max=None, save_figs=False, fig_name="default") : 57 | 58 | importance_scores = importance_scores.T 59 | 60 | fig = plt.figure(figsize=figsize) 61 | 62 | ref_seq = "" 63 | for j in range(importance_scores.shape[1]) : 64 | argmax_nt = np.argmax(np.abs(importance_scores[:, j])) 65 | 66 | if argmax_nt == 0 : 67 | ref_seq += "A" 68 | elif argmax_nt == 1 : 69 | ref_seq += "C" 70 | elif argmax_nt == 2 : 71 | ref_seq += "G" 72 | elif argmax_nt == 3 : 73 | ref_seq += "T" 74 | 75 | ax = plt.gca() 76 | 77 | for i in range(0, len(ref_seq)) : 78 | mutability_score = np.sum(importance_scores[:, i]) 79 | color = None 80 | dna_letter_at(ref_seq[i], i + 0.5, 0, mutability_score, ax, color=color) 81 | 82 | plt.sca(ax) 83 | plt.xticks([], []) 84 | plt.gca().yaxis.set_major_formatter(FormatStrFormatter('%.3f')) 85 | 86 | plt.xlim((0, len(ref_seq))) 87 | 88 | #plt.axis('off') 89 | 90 | if plot_y_ticks : 91 | plt.yticks(fontsize=12) 92 | else : 93 | plt.yticks([], []) 94 | 95 | if y_min is not None and y_max is not None : 96 | plt.ylim(y_min, y_max) 97 | elif y_min is not None : 98 | plt.ylim(y_min) 99 | else : 100 | plt.ylim( 101 | np.min(importance_scores) - 0.1 * np.max(np.abs(importance_scores)), 102 | np.max(importance_scores) + 0.1 * np.max(np.abs(importance_scores)) 103 | ) 104 | 105 | plt.axhline(y=0., color='black', linestyle='-', linewidth=1) 106 | 107 | #for axis in fig.axes : 108 | # axis.get_xaxis().set_visible(False) 109 | # axis.get_yaxis().set_visible(False) 110 | 111 | plt.tight_layout() 112 | 113 | if save_figs : 114 | plt.savefig(fig_name + ".png", transparent=True, dpi=300) 115 | plt.savefig(fig_name + ".eps") 116 | 117 | plt.show() 118 | 119 | #Function to visualize a pair of sequence logos 120 | def visualize_input_gradient_pair(att_grad_wt, att_grad_mut, plot_start=0, plot_end=100, save_figs=False, fig_name='') : 121 | 122 | scores_wt = att_grad_wt[plot_start:plot_end, :] 123 | scores_mut = att_grad_mut[plot_start:plot_end, :] 124 | 125 | y_min = min(np.min(scores_wt), np.min(scores_mut)) 126 | y_max = max(np.max(scores_wt), np.max(scores_mut)) 127 | 128 | y_max_abs = max(np.abs(y_min), np.abs(y_max)) 129 | 130 | y_min = y_min - 0.05 * y_max_abs 131 | y_max = y_max + 0.05 * y_max_abs 132 | 133 | if np.sum(scores_mut) != 0. 
:
134 |         print("--- WT ---")
135 | 
136 |         plot_seq_scores(
137 |             scores_wt, y_min=y_min, y_max=y_max,
138 |             figsize=(8, 1),
139 |             plot_y_ticks=False,
140 |             save_figs=save_figs,
141 |             fig_name=fig_name + '_wt',
142 |         )
143 | 
144 |     if np.sum(scores_mut) != 0. :
145 | 
146 |         print("--- Mut ---")
147 |         plot_seq_scores(
148 |             scores_mut, y_min=y_min, y_max=y_max,
149 |             figsize=(8, 1),
150 |             plot_y_ticks=False,
151 |             save_figs=save_figs,
152 |             fig_name=fig_name + '_mut',
153 |         )
154 | 
-------------------------------------------------------------------------------- /tutorials/latest/make_data/Makefile: --------------------------------------------------------------------------------
1 | FASTA_HUMAN=$$BORZOI_HG38/assembly/gnomad/hg38.ml.fa
2 | GAPS_HUMAN=$$BORZOI_HG38/assembly/ucsc/hg38_gaps.bed
3 | UMAP_HUMAN=$$BORZOI_HG38/mappability/umap_k36_t10_l32.bed
4 | BLACK_HUMAN=$$BORZOI_HG38/blacklist/blacklist_hg38_all.bed
5 | 
6 | FASTA_MOUSE=$$BORZOI_MM10/assembly/ucsc/mm10.ml.fa
7 | GAPS_MOUSE=$$BORZOI_MM10/assembly/ucsc/mm10_gaps.bed
8 | UMAP_MOUSE=$$BORZOI_MM10/mappability/umap_k36_t10_l32.bed
9 | BLACK_MOUSE=$$BORZOI_MM10/blacklist/blacklist_mm10_all.bed
10 | 
11 | ALIGN=$$BORZOI_HG38/align/hg38.mm10.syn.net.gz
12 | 
13 | OUT=data
14 | 
15 | # mini borzoi configuration
16 | LENGTH=393216
17 | TSTRIDE=131087 # 393216/3 + 15
18 | CROP=0
19 | WIDTH=32
20 | FOLDS=8
21 | 
22 | AOPTS=--break 2097152 -c $(CROP) --nf 524288 --no 393216 -l $(LENGTH) --stride $(TSTRIDE) -f $(FOLDS) --umap_t 0.5 -w $(WIDTH)
23 | DOPTS=-c $(CROP) -d 2 -f $(FOLDS) -l $(LENGTH) -p 64 -r 16 --umap_clip 0.5 -w $(WIDTH)
24 | 
25 | all: $(OUT)/hg38/tfrecords/train-0.tfr # $(OUT)/mm10/tfrecords/train-0.tfr
26 | 
27 | umap_human.bed:
28 | 	cat $(UMAP_HUMAN) $(BLACK_HUMAN) | awk 'BEGIN {OFS="\t"} {print $$1, $$2, $$3}' | bedtools sort -i - | bedtools merge -i - > umap_human.bed
29 | 
30 | umap_mouse.bed:
31 | 	cat $(UMAP_MOUSE) $(BLACK_MOUSE) | awk 'BEGIN {OFS="\t"} {print $$1, $$2, $$3}' | bedtools sort -i - | bedtools merge -i - > umap_mouse.bed
32 | 
33 | # targets file is already generated in this example
34 | #targets_human.txt targets_mouse.txt:
35 | #	./make_targets.py
36 | 
37 | $(OUT)/hg38/sequences.bed $(OUT)/mm10/sequences.bed: umap_human.bed umap_mouse.bed
38 | 	hound_data_align.py -a hg38,mm10 -g $(GAPS_HUMAN),$(GAPS_MOUSE) -u umap_human.bed,umap_mouse.bed $(AOPTS) -o $(OUT) $(ALIGN) $(FASTA_HUMAN),$(FASTA_MOUSE)
39 | 
40 | $(OUT)/hg38/tfrecords/train-0.tfr: $(OUT)/hg38/sequences.bed targets_human.txt
41 | 	hound_data.py --restart $(DOPTS) -b $(BLACK_HUMAN) -o $(OUT)/hg38 $(FASTA_HUMAN) -u umap_human.bed targets_human.txt
42 | 
43 | # no mouse data in this example
44 | #$(OUT)/mm10/tfrecords/train-0.tfr: $(OUT)/mm10/sequences.bed targets_mouse.txt
45 | #	hound_data.py --restart $(DOPTS) -b $(BLACK_MOUSE) -o $(OUT)/mm10 $(FASTA_MOUSE) -u umap_mouse.bed targets_mouse.txt
46 | 
-------------------------------------------------------------------------------- /tutorials/latest/make_data/README.md: --------------------------------------------------------------------------------
1 | ## Data Processing
2 | 
3 | This tutorial describes how to process a .bigwig sequencing experiment into compressed .w5 format, merge replicates, generate QC metrics, and finally create TFRecord files containing binned coverage values suitable for training Borzoi models. We will exemplify this for the ENCODE K562 RNA-seq experiment [ENCSR000AEL](https://www.encodeproject.org/experiments/ENCSR000AEL/).
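
For orientation before diving in: a .w5 file is simply an HDF5 container of per-base coverage, so once the steps below have produced one it can be inspected with a few lines of Python. The sketch below assumes the common layout of one 1-D dataset per chromosome; see 'bw_h5.py' for the authoritative schema.
```python
import h5py
import numpy as np

# inspect the merged .w5 coverage file generated later in this tutorial
# (assumed layout: one per-chromosome coverage dataset; check bw_h5.py)
with h5py.File("human/rna/encode/ENCSR000AEL/summary/coverage+.w5", "r") as w5:
    print(list(w5.keys())[:5])                    # e.g. ['chr1', 'chr10', ...]
    cov = w5["chr1"][1_000_000:1_001_000]         # bp-resolution coverage slice
    print(float(np.mean(cov.astype("float32"))))  # mean coverage in the window
```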
4 | 5 | First, activate the conda environment and run the script 'download_dependencies.sh' to download required auxiliary files. 6 | ```sh 7 | conda activate borzoi_py310 8 | cd ~/borzoi/tutorials/latest/make_data 9 | ./download_dependencies.sh 10 | ``` 11 | 12 | Next, run the script 'download_bw.sh' to download sample ENCODE .bigwig files and arrange them in a folder structure. 13 | ```sh 14 | ./download_bw.sh 15 | ``` 16 | 17 | Then run script 'process_w5.sh' to generate compressed .w5 files (hdf5) from the input .bigwig files, merge the two replicates, and calculate basic QC metrics. This .sh script internally calls 'bw_h5.py' to generate .w5 files, 'w5_merge.py' to merge replicates, and 'w5_qc.py' to calculate QC metrics. 18 | ```sh 19 | ./process_w5.sh 20 | ``` 21 | 22 | Finally, run the Makefile to create genome-wide binned coverage tracks, stored as compressed TFRecords. 23 | ```sh 24 | make 25 | ``` 26 | 27 | In this example, the Makefile creates 8 cross-validation folds of TFRecords with input sequences of length 393216 bp, generated with a genome-wide stride of 131087 bp (which is ~1/3 of the sequence length, but shifts the bin boundaries, too). The output coverage tracks corresponding to each input sequence are not cropped in the latest version of Borzoi models. This results in 12288 coverage bins per 393kb sequence. The specific .w5 tracks to include in the TFRecord generation, and the scales and pooling transforms applied to the bins of each experiment, are given in the targets file 'targets_human.txt'. Below is a description of the columns in this file. 28 | 29 | *targets_human.txt*: 30 | - (unnamed) => integer index of each track (must start from 0 when training a new model). 31 | - 'identifier' => unique identifier of each experiment (and strand). 32 | - 'file' => local file path to .w5 file. 33 | - 'clip' => hard clipping threshold to be applied to each bin, after soft-clipping. 34 | - 'clip_soft' => soft clipping (squashing) threshold. 35 | - 'scale' => scale value applied to each bp-level position before clipping. 36 | - 'sum_stat' => type of bin-level pooling operation ('sum_sqrt' = sum and square-root). 37 | - 'strand_pair' => integer index of the other stranded track of an experiment (same index as current row if unstranded). 38 | - 'description' => text description of experiment. 39 | 40 | *Notes*: 41 | - See [here](https://github.com/calico/borzoi-paper/tree/main/data/training) for a description of the scripts called by the Makefile to create TFRecords. 42 | - In the latest version of Borzoi models, a modified hg38 fasta genome is used in the Makefile where the allele with highest overall frequency (from gnomAD) is substituted at each position. 
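
To make the column semantics above concrete, here is a small NumPy sketch of how a single track's bin values could be derived from bp-level coverage with these parameters. The square-root form of the soft clip is an assumption made for illustration; the authoritative transforms live in the TFRecord-generation scripts invoked by the Makefile.
```python
import numpy as np

def bin_coverage(bp_cov, scale=0.3, width=32, clip_soft=384.0, clip=768.0):
    x = scale * bp_cov.astype("float64")                # 'scale': per-bp scaling
    x = x.reshape(-1, width).sum(axis=1)                # pool bp values into 32-bp bins
    x = np.sqrt(x)                                      # 'sum_sqrt': sum, then square-root
    over = x > clip_soft                                # 'clip_soft': squash large bins
    x[over] = clip_soft + np.sqrt(x[over] - clip_soft)  # (assumed functional form)
    return np.minimum(x, clip)                          # 'clip': hard threshold last

bins = bin_coverage(np.random.poisson(5.0, size=393216).astype(float))
print(bins.shape)  # (12288,) bins for one 393216 bp sequence
```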
43 | 
-------------------------------------------------------------------------------- /tutorials/latest/make_data/download_bw.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # download example data from ENCODE (ENCSR000AEL - K562 RNA-seq); 2 replicates
4 | 
5 | # define ENCODE ID
6 | ENC_ID='ENCSR000AEL'
7 | 
8 | # define remote urls
9 | URL_P_REP1='https://www.encodeproject.org/files/ENCFF980ZHM/@@download/ENCFF980ZHM.bigWig'
10 | URL_M_REP1='https://www.encodeproject.org/files/ENCFF533LJF/@@download/ENCFF533LJF.bigWig'
11 | 
12 | URL_P_REP2='https://www.encodeproject.org/files/ENCFF335LVS/@@download/ENCFF335LVS.bigWig'
13 | URL_M_REP2='https://www.encodeproject.org/files/ENCFF257NOL/@@download/ENCFF257NOL.bigWig'
14 | 
15 | # define ENCODE file IDs
16 | FILE_P_REP1='ENCFF980ZHM'
17 | FILE_M_REP1='ENCFF533LJF'
18 | 
19 | FILE_P_REP2='ENCFF335LVS'
20 | FILE_M_REP2='ENCFF257NOL'
21 | 
22 | # create folder for bigwig files
23 | mkdir -p "human/rna/encode/$ENC_ID/rep1"
24 | mkdir -p "human/rna/encode/$ENC_ID/rep2"
25 | 
26 | 
27 | # download bigwig files; rep1
28 | if [ -f "human/rna/encode/$ENC_ID/rep1/$FILE_P_REP1.bigWig" ]; then
29 |     echo "example RNA-seq data already downloaded (rep 1)."
30 | else
31 |     wget $URL_P_REP1 -O "human/rna/encode/$ENC_ID/rep1/$FILE_P_REP1.bigWig"
32 |     wget $URL_M_REP1 -O "human/rna/encode/$ENC_ID/rep1/$FILE_M_REP1.bigWig"
33 | fi
34 | 
35 | # download bigwig files; rep2
36 | if [ -f "human/rna/encode/$ENC_ID/rep2/$FILE_P_REP2.bigWig" ]; then
37 |     echo "example RNA-seq data already downloaded (rep 2)."
38 | else
39 |     wget $URL_P_REP2 -O "human/rna/encode/$ENC_ID/rep2/$FILE_P_REP2.bigWig"
40 |     wget $URL_M_REP2 -O "human/rna/encode/$ENC_ID/rep2/$FILE_M_REP2.bigWig"
41 | fi
42 | 
-------------------------------------------------------------------------------- /tutorials/latest/make_data/download_dependencies.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # create additional folders in borzoi data folders
4 | mkdir -p "$BORZOI_HG38/assembly/ucsc"
5 | mkdir -p "$BORZOI_HG38/assembly/gnomad"
6 | mkdir -p "$BORZOI_HG38/mappability"
7 | mkdir -p "$BORZOI_HG38/blacklist"
8 | mkdir -p "$BORZOI_HG38/align"
9 | 
10 | mkdir -p "$BORZOI_MM10/assembly/ucsc"
11 | mkdir -p "$BORZOI_MM10/mappability"
12 | mkdir -p "$BORZOI_MM10/blacklist"
13 | 
14 | 
15 | # download and uncompress auxiliary files required for Makefile (hg38)
16 | if [ -f "$BORZOI_HG38/assembly/ucsc/hg38_gaps.bed" ]; then
17 |     echo "hg38_gaps.bed already exists."
18 | else
19 |     wget -O - https://storage.googleapis.com/seqnn-share/helper/dependencies/hg38_gaps.bed.gz | gunzip -c > "$BORZOI_HG38/assembly/ucsc/hg38_gaps.bed"
20 | fi
21 | 
22 | if [ -f "$BORZOI_HG38/mappability/umap_k36_t10_l32.bed" ]; then
23 |     echo "umap_k36_t10_l32.bed (hg38) already exists."
24 | else
25 |     wget -O - https://storage.googleapis.com/seqnn-share/helper/dependencies/umap_k36_t10_l32_hg38.bed.gz | gunzip -c > "$BORZOI_HG38/mappability/umap_k36_t10_l32.bed"
26 | fi
27 | 
28 | if [ -f "$BORZOI_HG38/blacklist/blacklist_hg38_all.bed" ]; then
29 |     echo "blacklist_hg38_all.bed already exists."
30 | else
31 |     wget -O - https://storage.googleapis.com/seqnn-share/helper/dependencies/blacklist_hg38_all.bed.gz | gunzip -c > "$BORZOI_HG38/blacklist/blacklist_hg38_all.bed"
32 | fi
33 | 
34 | if [ -f "$BORZOI_HG38/align/hg38.mm10.syn.net.gz" ]; then
35 |     echo "hg38.mm10.syn.net.gz (hg38/mm10 alignment) already exists."
36 | else 37 | wget https://storage.googleapis.com/seqnn-share/helper/dependencies/hg38.mm10.syn.net.gz -O "$BORZOI_HG38/align/hg38.mm10.syn.net.gz" 38 | fi 39 | 40 | 41 | # download and uncompress auxiliary files required for Makefile (mm10) 42 | if [ -f "$BORZOI_MM10/assembly/ucsc/mm10_gaps.bed" ]; then 43 | echo "mm10_gaps.bed already exists." 44 | else 45 | wget -O - https://storage.googleapis.com/seqnn-share/helper/dependencies/mm10_gaps.bed.gz | gunzip -c > "$BORZOI_MM10/assembly/ucsc/mm10_gaps.bed" 46 | fi 47 | 48 | if [ -f "$BORZOI_MM10/mappability/umap_k36_t10_l32.bed" ]; then 49 | echo "umap_k36_t10_l32.bed (mm10) already exists." 50 | else 51 | wget -O - https://storage.googleapis.com/seqnn-share/helper/dependencies/umap_k36_t10_l32_mm10.bed.gz | gunzip -c > "$BORZOI_MM10/mappability/umap_k36_t10_l32.bed" 52 | fi 53 | 54 | if [ -f "$BORZOI_MM10/blacklist/blacklist_mm10_all.bed" ]; then 55 | echo "blacklist_mm10_all.bed already exists." 56 | else 57 | wget -O - https://storage.googleapis.com/seqnn-share/helper/dependencies/blacklist_mm10_all.bed.gz | gunzip -c > "$BORZOI_MM10/blacklist/blacklist_mm10_all.bed" 58 | fi 59 | 60 | 61 | # download and uncompress pre-compiled umap bed files 62 | if [ -f umap_human.bed ]; then 63 | echo "umap_human.bed already exists." 64 | else 65 | wget -O - https://storage.googleapis.com/seqnn-share/helper/dependencies/umap_human.bed.gz | gunzip -c > umap_human.bed 66 | fi 67 | 68 | if [ -f umap_mouse.bed ]; then 69 | echo "umap_mouse.bed already exists." 70 | else 71 | wget -O - https://storage.googleapis.com/seqnn-share/helper/dependencies/umap_mouse.bed.gz | gunzip -c > umap_mouse.bed 72 | fi 73 | 74 | 75 | # download and index hg38 ml genome 76 | if [ -f "$BORZOI_HG38/assembly/ucsc/hg38.ml.fa" ]; then 77 | echo "hg38.ml.fa already exists." 78 | else 79 | wget -O - https://storage.googleapis.com/seqnn-share/helper/dependencies/hg38.ml.fa.gz | gunzip -c > "$BORZOI_HG38/assembly/ucsc/hg38.ml.fa" 80 | idx_genome.py "$BORZOI_HG38/assembly/ucsc/hg38.ml.fa" 81 | fi 82 | 83 | # download and index hg38 ml genome (gnomad major alleles) 84 | if [ -f "$BORZOI_HG38/assembly/gnomad/hg38.ml.fa" ]; then 85 | echo "hg38.ml.fa (gnomad) already exists." 86 | else 87 | wget -O - https://storage.googleapis.com/seqnn-share/helper/dependencies/hg38_gnomad.ml.fa.gz | gunzip -c > "$BORZOI_HG38/assembly/gnomad/hg38.ml.fa" 88 | idx_genome.py "$BORZOI_HG38/assembly/gnomad/hg38.ml.fa" 89 | fi 90 | 91 | # download and index mm10 ml genome 92 | if [ -f "$BORZOI_MM10/assembly/ucsc/mm10.ml.fa" ]; then 93 | echo "mm10.ml.fa already exists." 
94 | else 95 | wget -O - https://storage.googleapis.com/seqnn-share/helper/dependencies/mm10.ml.fa.gz | gunzip -c > "$BORZOI_MM10/assembly/ucsc/mm10.ml.fa" 96 | idx_genome.py "$BORZOI_MM10/assembly/ucsc/mm10.ml.fa" 97 | fi 98 | -------------------------------------------------------------------------------- /tutorials/latest/make_data/process_w5.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # merge bigwig replicates, generate .w5 files and run qc 4 | 5 | # define ENCODE ID 6 | ENC_ID='ENCSR000AEL' 7 | 8 | # define ENCODE file IDs 9 | FILE_P_REP1='ENCFF980ZHM' 10 | FILE_M_REP1='ENCFF533LJF' 11 | 12 | FILE_P_REP2='ENCFF335LVS' 13 | FILE_M_REP2='ENCFF257NOL' 14 | 15 | # create folder for merged replicate files 16 | mkdir -p "human/rna/encode/$ENC_ID/summary" 17 | 18 | 19 | # step 1: generate per-replicate .w5 files 20 | 21 | # rep1 22 | if [ -f "human/rna/encode/$ENC_ID/rep1/$FILE_P_REP1+.w5" ]; then 23 | echo "example RNA-seq .w5 already exists (rep 1)." 24 | else 25 | bw_h5.py -z "human/rna/encode/$ENC_ID/rep1/$FILE_P_REP1.bigWig" "human/rna/encode/$ENC_ID/rep1/$FILE_P_REP1+.w5" 26 | bw_h5.py -z "human/rna/encode/$ENC_ID/rep1/$FILE_M_REP1.bigWig" "human/rna/encode/$ENC_ID/rep1/$FILE_M_REP1-.w5" 27 | fi 28 | 29 | # rep2 30 | if [ -f "human/rna/encode/$ENC_ID/rep2/$FILE_P_REP2+.w5" ]; then 31 | echo "example RNA-seq .w5 already exists (rep 2)." 32 | else 33 | bw_h5.py -z "human/rna/encode/$ENC_ID/rep2/$FILE_P_REP2.bigWig" "human/rna/encode/$ENC_ID/rep2/$FILE_P_REP2+.w5" 34 | bw_h5.py -z "human/rna/encode/$ENC_ID/rep2/$FILE_M_REP2.bigWig" "human/rna/encode/$ENC_ID/rep2/$FILE_M_REP2-.w5" 35 | fi 36 | 37 | 38 | # step 2: merge replicates 39 | 40 | if [ -f "human/rna/encode/$ENC_ID/summary/coverage+.w5" ]; then 41 | echo "example RNA-seq .w5 already exists (merged)." 42 | else 43 | w5_merge.py -w -s mean -z "human/rna/encode/$ENC_ID/summary/coverage+.w5" "human/rna/encode/$ENC_ID/rep1/$FILE_P_REP1+.w5" "human/rna/encode/$ENC_ID/rep2/$FILE_P_REP2+.w5" 44 | w5_merge.py -w -s mean -z "human/rna/encode/$ENC_ID/summary/coverage-.w5" "human/rna/encode/$ENC_ID/rep1/$FILE_M_REP1-.w5" "human/rna/encode/$ENC_ID/rep2/$FILE_M_REP2-.w5" 45 | fi 46 | 47 | 48 | # step 3: run qc on each replicate and the merged file 49 | 50 | if [ -f "human/rna/encode/$ENC_ID/summary/covqc/means.txt" ]; then 51 | echo "qc statistics already exist." 
52 | else
53 |     # rep1
54 |     w5_qc.py -b "$BORZOI_HG38/blacklist/blacklist_hg38_all.bed" -o "human/rna/encode/$ENC_ID/rep1/covqc" "human/rna/encode/$ENC_ID/rep1/$FILE_P_REP1+.w5"
55 |     w5_qc.py -b "$BORZOI_HG38/blacklist/blacklist_hg38_all.bed" -o "human/rna/encode/$ENC_ID/rep1/covqc_m" "human/rna/encode/$ENC_ID/rep1/$FILE_M_REP1-.w5"
56 | 
57 |     # rep2
58 |     w5_qc.py -b "$BORZOI_HG38/blacklist/blacklist_hg38_all.bed" -o "human/rna/encode/$ENC_ID/rep2/covqc" "human/rna/encode/$ENC_ID/rep2/$FILE_P_REP2+.w5"
59 |     w5_qc.py -b "$BORZOI_HG38/blacklist/blacklist_hg38_all.bed" -o "human/rna/encode/$ENC_ID/rep2/covqc_m" "human/rna/encode/$ENC_ID/rep2/$FILE_M_REP2-.w5"
60 | 
61 |     # summary
62 |     w5_qc.py -b "$BORZOI_HG38/blacklist/blacklist_hg38_all.bed" -o "human/rna/encode/$ENC_ID/summary/covqc" "human/rna/encode/$ENC_ID/summary/coverage+.w5"
63 |     w5_qc.py -b "$BORZOI_HG38/blacklist/blacklist_hg38_all.bed" -o "human/rna/encode/$ENC_ID/summary/covqc_m" "human/rna/encode/$ENC_ID/summary/coverage-.w5"
64 | fi
65 | 
66 | 
-------------------------------------------------------------------------------- /tutorials/latest/make_data/targets_human.txt: --------------------------------------------------------------------------------
1 | 	identifier	file	clip	clip_soft	scale	sum_stat	strand_pair	description
2 | 0	ENCFF980ZHM+	human/rna/encode/ENCSR000AEL/summary/coverage+.w5	768	384	0.3	sum_sqrt	1	RNA:K562
3 | 1	ENCFF980ZHM-	human/rna/encode/ENCSR000AEL/summary/coverage-.w5	768	384	0.3	sum_sqrt	0	RNA:K562
4 | 
-------------------------------------------------------------------------------- /tutorials/latest/score_variants/README.md: --------------------------------------------------------------------------------
1 | ## Variant Scoring
2 | 
3 | This tutorial describes how to predict variant effect scores for a small set of SNVs defined in a .vcf file. This example relies on the Mini Borzoi model trained on sample K562 RNA-seq data from the [train_model tutorial](https://github.com/calico/borzoi/tree/main/tutorials/latest/train_model), which is clearly a much weaker model than the pre-trained, published Borzoi model. For examples showcasing variant effect prediction at a larger scale with the pre-trained model (e.g. fine-mapped eQTL classification benchmarks), we refer the user to the [borzoi-paper repository](https://github.com/calico/borzoi-paper/tree/main). Additionally, we refer the user to the **legacy** version of [this tutorial](https://github.com/calico/borzoi/tree/main/tutorials/legacy/score_variants), which uses the pre-trained, published model.
4 | 
5 | First, to calculate **gene-specific expression** scores, run the script 'score_expr_sed.sh'. Two different statistics are computed: (1) logSED (gene expression log fold change), and (2) logD2 (bin-level L2 norm across the coverage profile intersecting the exons of the gene).
6 | ```sh
7 | conda activate borzoi_py310
8 | cd ~/borzoi/tutorials/latest/score_variants
9 | ./score_expr_sed.sh
10 | ```
11 | 
12 | To calculate **gene-agnostic expression** scores, run the script 'score_expr_sad.sh'. One statistic is computed: logD2 (bin-level L2 norm across the entire predicted coverage track).
13 | ```sh
14 | ./score_expr_sad.sh
15 | ```
16 | 
17 | To calculate **gene-specific polyadenylation** scores, run the script 'score_polya.sh'. One statistic is computed: COVR (3' coverage ratio across pA junctions of the target gene).
18 | ```sh
19 | ./score_polya.sh
20 | ```
21 | 
22 | To calculate **gene-specific splicing** scores, run the script 'score_splice.sh'.
One statistic is computed: nDi (normalized maximum absolute difference in coverage bins across the target gene span). 23 | ```sh 24 | ./score_splice.sh 25 | ``` 26 | 27 | Finally, the jupyter notebook 'run_variant_scripts.ipynb' is provided for convenience to execute all above scripts. The notebook also exemplifies how to navigate the variant prediction hdf5 files and print some example scores. 28 | -------------------------------------------------------------------------------- /tutorials/latest/score_variants/run_variant_scripts.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "f5d0f9fb", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import os\n", 11 | "import sys\n", 12 | "import h5py\n" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "id": "7a94cbf8", 19 | "metadata": { 20 | "scrolled": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "#Calculate gene-specific variant effect scores\n", 25 | "\n", 26 | "!./score_expr_sed.sh\n" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "id": "1047ff0f", 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "#Print an example variant effect prediction for a SNP-gene pair (gene-specific expression)\n", 37 | "\n", 38 | "sed_h5 = h5py.File('snp_sed/f0c0/sed.h5', 'r')\n", 39 | "\n", 40 | "row_ix = 63\n", 41 | "target_ix = 0\n", 42 | "\n", 43 | "print(\"score: 'logSED', snp: '\" + str(sed_h5['snp'][sed_h5['si'][row_ix]].decode()) + \"', gene: '\" + str(sed_h5['gene'][sed_h5['si'][row_ix]].decode()) + \"', track: '\" + str(sed_h5['target_labels'][target_ix].decode()) + \"' => \" + str(round(sed_h5['logSED'][row_ix, target_ix], 4)))\n" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "id": "f105ecd9", 50 | "metadata": { 51 | "scrolled": true 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "#Calculate gene-agnostic variant effect scores\n", 56 | "\n", 57 | "!./score_expr_sad.sh\n" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "id": "96e4f7cb", 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "#Print an example variant effect prediction for a SNP (gene-agnostic expression)\n", 68 | "\n", 69 | "sad_h5 = h5py.File('snp_sad/f0c0/sad.h5', 'r')\n", 70 | "\n", 71 | "snp_ix = 1\n", 72 | "target_ix = 0\n", 73 | "\n", 74 | "print(\"score: 'logD2', snp: '\" + str(sad_h5['snp'][snp_ix].decode()) + \"', track: '\" + str(sad_h5['target_labels'][target_ix].decode()) + \"' => \" + str(round(sad_h5['logD2'][snp_ix, target_ix], 4)))\n" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "id": "c56efaef", 81 | "metadata": { 82 | "scrolled": true 83 | }, 84 | "outputs": [], 85 | "source": [ 86 | "#Calculate splice variant effect scores\n", 87 | "\n", 88 | "!./score_splice.sh\n" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "id": "980993fc", 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "#Print an example variant effect prediction for a SNP-gene pair (splicing)\n", 99 | "\n", 100 | "sed_h5 = h5py.File('snp_splice/f0c0/sed.h5', 'r')\n", 101 | "\n", 102 | "row_ix = 116\n", 103 | "target_ix = 755\n", 104 | "\n", 105 | "print(\"score: 'nDi', snp: '\" + str(sed_h5['snp'][sed_h5['si'][row_ix]].decode()) + \"', gene: '\" + str(sed_h5['gene'][sed_h5['si'][row_ix]].decode()) + \"', track: '\" + 
str(sed_h5['target_labels'][target_ix].decode()) + \"' => \" + str(round(sed_h5['nDi'][row_ix, target_ix], 4)))\n"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": null,
111 | "id": "05cccfb6",
112 | "metadata": {
113 | "scrolled": true
114 | },
115 | "outputs": [],
116 | "source": [
117 | "#Calculate polyadenylation variant effect scores\n",
118 | "\n",
119 | "!./score_polya.sh\n"
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": null,
125 | "id": "43ac562f",
126 | "metadata": {},
127 | "outputs": [],
128 | "source": [
129 | "#Print an example variant effect prediction for a SNP-gene pair (polyadenylation)\n",
130 | "\n",
131 | "sed_h5 = h5py.File('snp_polya/f0c0/sed.h5', 'r')\n",
132 | "\n",
133 | "row_ix = 47\n",
134 | "target_ix = 100\n",
135 | "\n",
136 | "print(\"score: 'COVR', snp: '\" + str(sed_h5['snp'][sed_h5['si'][row_ix]].decode()) + \"', gene: '\" + str(sed_h5['gene'][sed_h5['si'][row_ix]].decode()) + \"', track: '\" + str(sed_h5['target_labels'][target_ix].decode()) + \"' => \" + str(round(sed_h5['COVR'][row_ix, target_ix], 4)))\n"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": null,
142 | "id": "0ba23572",
143 | "metadata": {},
144 | "outputs": [],
145 | "source": []
146 | }
147 | ],
148 | "metadata": {
149 | "kernelspec": {
150 | "display_name": "Python 3 (ipykernel)",
151 | "language": "python",
152 | "name": "python3"
153 | },
154 | "language_info": {
155 | "codemirror_mode": {
156 | "name": "ipython",
157 | "version": 3
158 | },
159 | "file_extension": ".py",
160 | "mimetype": "text/x-python",
161 | "name": "python",
162 | "nbconvert_exporter": "python",
163 | "pygments_lexer": "ipython3",
164 | "version": "3.8.15"
165 | }
166 | },
167 | "nbformat": 4,
168 | "nbformat_minor": 5
169 | }
170 | 
-------------------------------------------------------------------------------- /tutorials/latest/score_variants/score_expr_sad.sh: --------------------------------------------------------------------------------
1 | #!/bin/sh
2 | 
3 | mkdir -p snp_sad/f0c0
4 | 
5 | borzoi_sad.py -o snp_sad/f0c0 --rc --stats logD2 -t ../make_data/targets_human.txt ../train_model/params_mini.json ../train_model/mini_models/f0c0/train/model_best.h5 snps_expr.vcf
6 | 
-------------------------------------------------------------------------------- /tutorials/latest/score_variants/score_expr_sed.sh: --------------------------------------------------------------------------------
1 | #!/bin/sh
2 | 
3 | mkdir -p snp_sed/f0c0
4 | 
5 | borzoi_sed.py -o snp_sed/f0c0 --rc --stats logSED,logD2 -t ../make_data/targets_human.txt ../train_model/params_mini.json ../train_model/mini_models/f0c0/train/model_best.h5 snps_expr.vcf
6 | 
-------------------------------------------------------------------------------- /tutorials/latest/score_variants/score_polya.sh: --------------------------------------------------------------------------------
1 | #!/bin/sh
2 | 
3 | mkdir -p snp_polya/f0c0
4 | 
5 | borzoi_sed_paqtl_cov.py -o snp_polya/f0c0 --rc --stats COVR -t ../make_data/targets_human.txt ../train_model/params_mini.json ../train_model/mini_models/f0c0/train/model_best.h5 snps_polya.vcf
6 | 
-------------------------------------------------------------------------------- /tutorials/latest/score_variants/score_splice.sh: --------------------------------------------------------------------------------
1 | #!/bin/sh
2 | 
3 | mkdir -p snp_splice/f0c0
4 | 
5 | borzoi_sed.py -o snp_splice/f0c0 --span --no_untransform --rc --stats nDi -t
../make_data/targets_human.txt ../train_model/params_mini.json ../train_model/mini_models/f0c0/train/model_best.h5 snps_splice.vcf 6 | -------------------------------------------------------------------------------- /tutorials/latest/score_variants/snps_expr.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.2 2 | chr1 43110773 chr1_43110773_G_A_b38 G A . . 3 | chr1 43120331 chr1_43120331_C_T_b38 C T . . 4 | chr1 46309111 chr1_46309111_A_G_b38 A G . . 5 | chr1 52632886 chr1_52632886_A_C_b38 A C . . 6 | chr1 54053434 chr1_54053434_G_A_b38 G A . . 7 | -------------------------------------------------------------------------------- /tutorials/latest/score_variants/snps_polya.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.2 2 | ##INFO= 3 | ##INFO= 4 | ##INFO= 5 | #CHROM POS ID REF ALT QUAL FILTER INFO 6 | chr1 11790946 chr1_11790946_G_C G C . . MT=ENSG00000177000.grp_2.downstream.ENST00000641805;PD=924;PI=chr1_11790946_G_C 7 | chr1 150160094 chr1_150160094_C_G C G . . MT=ENSG00000023902.grp_1.downstream.ENST00000369126;PD=29;PI=chr1_150160094_C_G 8 | chr16 57665101 chr16_57665101_A_G A G . . MT=ENSG00000205336.grp_1.downstream.ENST00000568908;PD=73;PI=chr16_57665101_A_G 9 | chr16 80976052 chr16_80976052_T_G T G . . MT=ENSG00000103121.grp_2.downstream.ENST00000565925;PD=24;PI=chr16_80976052_T_G 10 | chr16 88857261 chr16_88857261_T_C T C . . MT=ENSG00000167515.grp_2.downstream.ENST00000564547;PD=3851;PI=chr16_88857261_T_C -------------------------------------------------------------------------------- /tutorials/latest/score_variants/snps_splice.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.2 2 | ##INFO= 3 | ##INFO= 4 | ##INFO= 5 | #CHROM POS ID REF ALT QUAL FILTER INFO 6 | chr1 1665061 chr1_1665061_C_T C T . . MT=ENSG00000189339.grp_2.contained.ENST00000611123;SD=959;PI=chr1_1665061_C_T 7 | chr1 1689221 chr1_1689221_G_A G A . . MT=ENSG00000189339.grp_1.contained.ENST00000614300;SD=1753;PI=chr1_1689221_G_A 8 | chr1 50655526 chr1_50655526_T_C T C . . MT=ENSG00000185104.grp_2.contained.ENST00000396153;SD=3;PI=chr1_50655526_T_C 9 | chr1 109489368 chr1_109489368_C_G C G . . MT=ENSG00000143537.grp_2.contained.ENST00000360674;SD=1;PI=chr1_155060832_G_A 10 | chr1 156236330 chr1_156236330_G_A G A . . MT=ENSG00000160783.grp_1.contained.ENST00000368279;SD=17;PI=chr1_156236330_G_A 11 | -------------------------------------------------------------------------------- /tutorials/latest/train_model/README.md: -------------------------------------------------------------------------------- 1 | ## Model Training 2 | 3 | This tutorial describes how to train smaller Borzoi models on the example RNA-seq experiment processed in the [make_data tutorial](https://github.com/calico/borzoi/tree/main/tutorials/latest/make_data). 4 | 5 | To train a 'Mini Borzoi' ensemble (~40M parameters, 2 cross-validation folds), run the script 'train_mini.sh'. The model parameters are specified in 'params_mini.json'. This model can be trained with a batch size of 2 on a 24GB NVIDIA Titan RTX or RTX4090 GPU. 6 | ```sh 7 | conda activate borzoi_py310 8 | cd ~/borzoi/tutorials/latest/train_model 9 | ./train_mini.sh 10 | ``` 11 | 12 | Alternatively, to train an even smaller 'Micro Borzoi' ensemble (~5M parameters), run the script 'train_micro.sh'. 
This model can fit into the above GPU cards with a batch size of 4, which means the learning rate can be doubled and each epoch finished in half the time. 13 | ```sh 14 | ./train_micro.sh 15 | ``` 16 | 17 | *Notes*: 18 | - See [here](https://github.com/calico/borzoi-paper/tree/main/model) for a description of the scripts called internally by the training .sh script. 19 | - Rather than cropping the output predictions before applying the training loss, in the latest version of Borzoi models a smooth position-specific loss weight is applied that penalizes prediction errors less at the left/right boundaries. 20 | -------------------------------------------------------------------------------- /tutorials/latest/train_model/params_micro.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "batch_size": 4, 4 | "shuffle_buffer": 256, 5 | "optimizer": "adam", 6 | "learning_rate": 0.0002, 7 | "loss": "poisson_mn", 8 | "total_weight": 0.2, 9 | "weight_range": 8, 10 | "weight_exp": 6, 11 | "warmup_steps": 10000, 12 | "global_clipnorm": 0.2, 13 | "adam_beta1": 0.9, 14 | "adam_beta2": 0.999, 15 | "patience": 30, 16 | "train_epochs_min": 130, 17 | "train_epochs_max": 180 18 | }, 19 | "model": { 20 | "seq_length": 393216, 21 | "augment_rc": true, 22 | "augment_shift": 3, 23 | "activation": "gelu", 24 | "norm_type": "batch", 25 | "bn_momentum": 0.9, 26 | "kernel_initializer": "lecun_normal", 27 | "l2_scale": 1.0e-6, 28 | "trunk": [ 29 | { 30 | "name": "conv_dna", 31 | "filters": 128, 32 | "kernel_size": 11, 33 | "norm_type": null, 34 | "activation": "linear", 35 | "pool_size": 2 36 | }, 37 | { 38 | "name": "res_tower", 39 | "filters_init": 160, 40 | "filters_end": 320, 41 | "divisible_by": 8, 42 | "kernel_size": 5, 43 | "num_convs": 1, 44 | "pool_size": 2, 45 | "repeat": 6 46 | }, 47 | { 48 | "name": "transformer_tower", 49 | "key_size": 32, 50 | "heads": 4, 51 | "num_position_features": 32, 52 | "dropout": 0.1, 53 | "attention_dropout": 0.01, 54 | "mha_l2_scale": 1.0e-8, 55 | "l2_scale": 1.0e-8, 56 | "kernel_initializer": "he_normal", 57 | "repeat": 4 58 | }, 59 | { 60 | "name": "unet_conv", 61 | "kernel_size": 3 62 | }, 63 | { 64 | "name": "unet_conv", 65 | "kernel_size": 3 66 | } 67 | ], 68 | "head_human": { 69 | "name": "final", 70 | "units": 2, 71 | "activation": "softplus" 72 | } 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /tutorials/latest/train_model/params_mini.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "batch_size": 2, 4 | "shuffle_buffer": 256, 5 | "optimizer": "adam", 6 | "learning_rate": 0.0001, 7 | "loss": "poisson_mn", 8 | "total_weight": 0.2, 9 | "weight_range": 8, 10 | "weight_exp": 6, 11 | "warmup_steps": 20000, 12 | "global_clipnorm": 0.1, 13 | "adam_beta1": 0.9, 14 | "adam_beta2": 0.999, 15 | "patience": 30, 16 | "train_epochs_min": 130, 17 | "train_epochs_max": 180 18 | }, 19 | "model": { 20 | "seq_length": 393216, 21 | "augment_rc": true, 22 | "augment_shift": 3, 23 | "activation": "gelu", 24 | "norm_type": "batch", 25 | "bn_momentum": 0.9, 26 | "kernel_initializer": "lecun_normal", 27 | "l2_scale": 5.0e-7, 28 | "trunk": [ 29 | { 30 | "name": "conv_dna", 31 | "filters": 320, 32 | "kernel_size": 11, 33 | "norm_type": null, 34 | "activation": "linear", 35 | "pool_size": 2 36 | }, 37 | { 38 | "name": "res_tower", 39 | "filters_init": 384, 40 | "filters_end": 768, 41 | "divisible_by": 16, 42 | "kernel_size": 
5,
43 |       "num_convs": 1,
44 |       "pool_size": 2,
45 |       "repeat": 6
46 |     },
47 |     {
48 |       "name": "transformer_tower",
49 |       "key_size": 64,
50 |       "heads": 4,
51 |       "num_position_features": 32,
52 |       "dropout": 0.2,
53 |       "mha_l2_scale": 1.0e-8,
54 |       "l2_scale": 1.0e-8,
55 |       "kernel_initializer": "he_normal",
56 |       "repeat": 8
57 |     },
58 |     {
59 |       "name": "unet_conv",
60 |       "kernel_size": 3
61 |     },
62 |     {
63 |       "name": "unet_conv",
64 |       "kernel_size": 3
65 |     }
66 |   ],
67 |   "head_human": {
68 |     "name": "final",
69 |     "units": 2,
70 |     "activation": "softplus"
71 |   }
72 | }
73 | }
74 | 
-------------------------------------------------------------------------------- /tutorials/latest/train_model/train_micro.sh: --------------------------------------------------------------------------------
1 | #!/bin/sh
2 | 
3 | westminster_train_folds.py -e borzoi_py310 -f 2 -c 1 -q rtx4090 -o micro_models params_micro.json ../make_data/data/hg38
4 | 
-------------------------------------------------------------------------------- /tutorials/latest/train_model/train_mini.sh: --------------------------------------------------------------------------------
1 | #!/bin/sh
2 | 
3 | westminster_train_folds.py -e borzoi_py310 -f 2 -c 1 -q rtx4090 -o mini_models params_mini.json ../make_data/data/hg38
4 | 
-------------------------------------------------------------------------------- /tutorials/legacy/interpret_sequence/README.md: --------------------------------------------------------------------------------
1 | ## Interpretation
2 | 
3 | This tutorial describes how to compute gradient saliency scores (sequence attributions) with respect to various statistics computed for a list of input genes specified in a .gtf file. This example uses the pre-trained, published Borzoi model to compute gradients. To download this model, run the script 'download_models.sh' in the 'borzoi' root folder.
4 | 
5 | First, to compute input gradients with respect to the log-sum of coverage across the exons of the target gene, run the script 'run_gradients_expr_CFHR2.sh'.
6 | ```sh
7 | conda activate borzoi_py310
8 | cd ~/borzoi/tutorials/legacy/interpret_sequence
9 | ./run_gradients_expr_CFHR2.sh
10 | ```
11 | 
12 | To compute input gradients with respect to the log-ratio of coverage immediately upstream and downstream of the distal polyA site of the target gene, run the script 'run_gradients_polya_CD99.sh'.
13 | ```sh
14 | ./run_gradients_polya_CD99.sh
15 | ```
16 | 
17 | To compute input gradients with respect to the log-ratio of coverage of an exon of the target gene relative to intronic coverage, run the script 'run_gradients_splice_GCFC2.sh'.
18 | ```sh
19 | ./run_gradients_splice_GCFC2.sh
20 | ```
21 | Currently, the splicing gradient script chooses one exon at random to compute gradients for. While this approach was favorable for the specific analysis of the manuscript, we acknowledge that this is not particularly useful to users wanting to investigate an exon of their choice. We plan on updating this script soon to allow users to specify which exon to calculate gradients for.
22 | 
23 | *Notes*:
24 | - The track scale, squashing exponentiation, and clip-soft threshold are specified in the .py script arguments (flags: '--track_scale', '--track_transform', '--clip_soft'), and the values in the targets file are ignored. This means that the same data transformation parameters are applied to all tracks specified in the targets file.
To calculate gradients for groups of tracks with different data transforms, separate these tracks into different targets files, and execute the gradient script on each group separately. 25 | - The legacy data transforms are activated in all above .sh scripts with the flag '--untransform_old'. 26 | -------------------------------------------------------------------------------- /tutorials/legacy/interpret_sequence/run_gradients_expr_CFHR2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | borzoi_satg_gene.py -o ../../../examples/saved_models/gtex_CFHR2 -f 3 -c 0 --rc --untransform_old --track_scale 0.01 --track_transform 0.75 --clip_soft 384.0 -t ../../../examples/targets_gtex_liver.txt ../../../examples/params_pred.json ../../../examples/saved_models ../../../examples/CFHR2_example.gtf 4 | -------------------------------------------------------------------------------- /tutorials/legacy/interpret_sequence/run_gradients_polya_CD99.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | borzoi_satg_polya.py -o ../../../examples/saved_models/gtex_CD99 -f 3 -c 0 --rc --untransform_old --track_scale 0.01 --track_transform 0.75 --clip_soft 384.0 -t ../../../examples/targets_gtex.txt ../../../examples/params_pred.json ../../../examples/saved_models ../../../examples/CD99_example.gtf 4 | -------------------------------------------------------------------------------- /tutorials/legacy/interpret_sequence/run_gradients_splice_GCFC2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | borzoi_satg_splice.py -o ../../../examples/saved_models/gtex_GCFC2 -f 3 -c 0 --rc --untransform_old --track_scale 0.01 --track_transform 0.75 --clip_soft 384.0 -t ../../../examples/targets_gtex.txt ../../../examples/params_pred.json ../../../examples/saved_models ../../../examples/GCFC2_example.gtf 4 | -------------------------------------------------------------------------------- /tutorials/legacy/interpret_sequence/vis_helpers.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import numpy as np 4 | 5 | import matplotlib.pyplot as plt 6 | 7 | import matplotlib.cm as cm 8 | import matplotlib.colors as colors 9 | 10 | import matplotlib as mpl 11 | from matplotlib.text import TextPath 12 | from matplotlib.patches import PathPatch, Rectangle 13 | from matplotlib.font_manager import FontProperties 14 | from matplotlib import gridspec 15 | from matplotlib.ticker import FormatStrFormatter 16 | 17 | #Helper function to draw a letter at a given position 18 | def dna_letter_at(letter, x, y, yscale=1, ax=None, color=None, alpha=1.0): 19 | 20 | fp = FontProperties(family="DejaVu Sans", weight="bold") 21 | globscale = 1.35 22 | LETTERS = { "T" : TextPath((-0.305, 0), "T", size=1, prop=fp), 23 | "G" : TextPath((-0.384, 0), "G", size=1, prop=fp), 24 | "A" : TextPath((-0.35, 0), "A", size=1, prop=fp), 25 | "C" : TextPath((-0.366, 0), "C", size=1, prop=fp), 26 | "UP" : TextPath((-0.488, 0), '$\\Uparrow$', size=1, prop=fp), 27 | "DN" : TextPath((-0.488, 0), '$\\Downarrow$', size=1, prop=fp), 28 | "(" : TextPath((-0.25, 0), "(", size=1, prop=fp), 29 | "." 
: TextPath((-0.125, 0), "-", size=1, prop=fp), 30 | ")" : TextPath((-0.1, 0), ")", size=1, prop=fp)} 31 | COLOR_SCHEME = {'G': 'orange',#'orange', 32 | 'A': 'green',#'red', 33 | 'C': 'blue',#'blue', 34 | 'T': 'red',#'darkgreen', 35 | 'UP': 'green', 36 | 'DN': 'red', 37 | '(': 'black', 38 | '.': 'black', 39 | ')': 'black'} 40 | 41 | 42 | text = LETTERS[letter] 43 | 44 | chosen_color = COLOR_SCHEME[letter] 45 | if color is not None : 46 | chosen_color = color 47 | 48 | t = mpl.transforms.Affine2D().scale(1*globscale, yscale*globscale) + \ 49 | mpl.transforms.Affine2D().translate(x,y) + ax.transData 50 | p = PathPatch(text, lw=0, fc=chosen_color, alpha=alpha, transform=t) 51 | if ax != None: 52 | ax.add_artist(p) 53 | return p 54 | 55 | #Function to plot sequence logo 56 | def plot_seq_scores(importance_scores, figsize=(16, 2), plot_y_ticks=True, y_min=None, y_max=None, save_figs=False, fig_name="default") : 57 | 58 | importance_scores = importance_scores.T 59 | 60 | fig = plt.figure(figsize=figsize) 61 | 62 | ref_seq = "" 63 | for j in range(importance_scores.shape[1]) : 64 | argmax_nt = np.argmax(np.abs(importance_scores[:, j])) 65 | 66 | if argmax_nt == 0 : 67 | ref_seq += "A" 68 | elif argmax_nt == 1 : 69 | ref_seq += "C" 70 | elif argmax_nt == 2 : 71 | ref_seq += "G" 72 | elif argmax_nt == 3 : 73 | ref_seq += "T" 74 | 75 | ax = plt.gca() 76 | 77 | for i in range(0, len(ref_seq)) : 78 | mutability_score = np.sum(importance_scores[:, i]) 79 | color = None 80 | dna_letter_at(ref_seq[i], i + 0.5, 0, mutability_score, ax, color=color) 81 | 82 | plt.sca(ax) 83 | plt.xticks([], []) 84 | plt.gca().yaxis.set_major_formatter(FormatStrFormatter('%.3f')) 85 | 86 | plt.xlim((0, len(ref_seq))) 87 | 88 | #plt.axis('off') 89 | 90 | if plot_y_ticks : 91 | plt.yticks(fontsize=12) 92 | else : 93 | plt.yticks([], []) 94 | 95 | if y_min is not None and y_max is not None : 96 | plt.ylim(y_min, y_max) 97 | elif y_min is not None : 98 | plt.ylim(y_min) 99 | else : 100 | plt.ylim( 101 | np.min(importance_scores) - 0.1 * np.max(np.abs(importance_scores)), 102 | np.max(importance_scores) + 0.1 * np.max(np.abs(importance_scores)) 103 | ) 104 | 105 | plt.axhline(y=0., color='black', linestyle='-', linewidth=1) 106 | 107 | #for axis in fig.axes : 108 | # axis.get_xaxis().set_visible(False) 109 | # axis.get_yaxis().set_visible(False) 110 | 111 | plt.tight_layout() 112 | 113 | if save_figs : 114 | plt.savefig(fig_name + ".png", transparent=True, dpi=300) 115 | plt.savefig(fig_name + ".eps") 116 | 117 | plt.show() 118 | 119 | #Function to visualize a pair of sequence logos 120 | def visualize_input_gradient_pair(att_grad_wt, att_grad_mut, plot_start=0, plot_end=100, save_figs=False, fig_name='') : 121 | 122 | scores_wt = att_grad_wt[plot_start:plot_end, :] 123 | scores_mut = att_grad_mut[plot_start:plot_end, :] 124 | 125 | y_min = min(np.min(scores_wt), np.min(scores_mut)) 126 | y_max = max(np.max(scores_wt), np.max(scores_mut)) 127 | 128 | y_max_abs = max(np.abs(y_min), np.abs(y_max)) 129 | 130 | y_min = y_min - 0.05 * y_max_abs 131 | y_max = y_max + 0.05 * y_max_abs 132 | 133 | if np.sum(scores_mut) != 0. : 134 | print("--- WT ---") 135 | 136 | plot_seq_scores( 137 | scores_wt, y_min=y_min, y_max=y_max, 138 | figsize=(8, 1), 139 | plot_y_ticks=False, 140 | save_figs=save_figs, 141 | fig_name=fig_name + '_wt', 142 | ) 143 | 144 | if np.sum(scores_mut) != 0. 
: 145 | 146 | print("--- Mut ---") 147 | plot_seq_scores( 148 | scores_mut, y_min=y_min, y_max=y_max, 149 | figsize=(8, 1), 150 | plot_y_ticks=False, 151 | save_figs=save_figs, 152 | fig_name=fig_name + '_mut', 153 | ) 154 | -------------------------------------------------------------------------------- /tutorials/legacy/make_data/Makefile: -------------------------------------------------------------------------------- 1 | FASTA_HUMAN=$$BORZOI_HG38/assembly/ucsc/hg38.ml.fa 2 | GAPS_HUMAN=$$BORZOI_HG38/assembly/ucsc/hg38_gaps.bed 3 | UMAP_HUMAN=$$BORZOI_HG38/mappability/umap_k36_t10_l32.bed 4 | BLACK_HUMAN=$$BORZOI_HG38/blacklist/blacklist_hg38_all.bed 5 | 6 | FASTA_MOUSE=$$BORZOI_MM10/assembly/ucsc/mm10.ml.fa 7 | GAPS_MOUSE=$$BORZOI_MM10/assembly/ucsc/mm10_gaps.bed 8 | UMAP_MOUSE=$$BORZOI_MM10/mappability/umap_k36_t10_l32.bed 9 | BLACK_MOUSE=$$BORZOI_MM10/blacklist/blacklist_mm10_all.bed 10 | 11 | ALIGN=$$BORZOI_HG38/align/hg38.mm10.syn.net.gz 12 | 13 | OUT=data 14 | 15 | # mini borzoi configuration 16 | LENGTH=393216 17 | TSTRIDE=65551 # (393216-2*98304)/3 + 15 18 | CROP=98304 19 | WIDTH=32 20 | FOLDS=8 21 | 22 | AOPTS=--break 2097152 -c $(CROP) --nf 524288 --no 393216 -l $(LENGTH) --stride $(TSTRIDE) -f $(FOLDS) --umap_t 0.5 -w $(WIDTH) 23 | DOPTS=-c $(CROP) -d 2 -f $(FOLDS) -l $(LENGTH) -p 64 -r 16 --umap_clip 0.5 -w $(WIDTH) --transform_old 24 | 25 | all: $(OUT)/hg38/tfrecords/train-0.tfr # $(OUT)/mm10/tfrecords/train-0.tfr 26 | 27 | umap_human.bed: 28 | 	cat $(UMAP_HUMAN) $(BLACK_HUMAN) | awk 'BEGIN {OFS="\t"} {print $$1, $$2, $$3}' | bedtools sort -i - | bedtools merge -i - > umap_human.bed 29 | 30 | umap_mouse.bed: 31 | 	cat $(UMAP_MOUSE) $(BLACK_MOUSE) | awk 'BEGIN {OFS="\t"} {print $$1, $$2, $$3}' | bedtools sort -i - | bedtools merge -i - > umap_mouse.bed 32 | 33 | # targets file is already generated in this example 34 | #targets_human.txt targets_mouse.txt: 35 | #	./make_targets.py 36 | 37 | $(OUT)/hg38/sequences.bed $(OUT)/mm10/sequences.bed: umap_human.bed umap_mouse.bed 38 | 	hound_data_align.py -a hg38,mm10 -g $(GAPS_HUMAN),$(GAPS_MOUSE) -u umap_human.bed,umap_mouse.bed $(AOPTS) -o $(OUT) $(ALIGN) $(FASTA_HUMAN),$(FASTA_MOUSE) 39 | 40 | $(OUT)/hg38/tfrecords/train-0.tfr: $(OUT)/hg38/sequences.bed targets_human.txt 41 | 	hound_data.py --restart $(DOPTS) -b $(BLACK_HUMAN) -o $(OUT)/hg38 $(FASTA_HUMAN) -u umap_human.bed targets_human.txt 42 | 43 | # no mouse data in this example 44 | #$(OUT)/mm10/tfrecords/train-0.tfr: $(OUT)/mm10/sequences.bed targets_mouse.txt 45 | #	hound_data.py --restart $(DOPTS) -b $(BLACK_MOUSE) -o $(OUT)/mm10 $(FASTA_MOUSE) -u umap_mouse.bed targets_mouse.txt 46 | -------------------------------------------------------------------------------- /tutorials/legacy/make_data/README.md: -------------------------------------------------------------------------------- 1 | ## Data Processing 2 | 3 | This tutorial describes how to process a .bigwig sequencing experiment into compressed .w5 format, merge replicates, generate QC metrics, and finally create TFRecord files containing binned coverage values suitable for training Borzoi models. We will exemplify this for the ENCODE K562 RNA-seq experiment [ENCSR000AEL](https://www.encodeproject.org/experiments/ENCSR000AEL/). 4 | 5 | First, activate the conda environment and run the script 'download_dependencies.sh' to download required auxiliary files.
6 | ```sh 7 | conda activate borzoi_py310 8 | cd ~/borzoi/tutorials/legacy/make_data 9 | ./download_dependencies.sh 10 | ``` 11 | 12 | Next, run the script 'download_bw.sh' to download sample ENCODE .bigwig files and arrange them in a folder structure. 13 | ```sh 14 | ./download_bw.sh 15 | ``` 16 | 17 | Then run the script 'process_w5.sh' to generate compressed .w5 files (hdf5) from the input .bigwig files, merge the two replicates, and calculate basic QC metrics. This .sh script internally calls 'bw_h5.py' to generate .w5 files, 'w5_merge.py' to merge replicates, and 'w5_qc.py' to calculate QC metrics. 18 | ```sh 19 | ./process_w5.sh 20 | ``` 21 | 22 | Finally, run the Makefile to create genome-wide binned coverage tracks, stored as compressed TFRecords. 23 | ```sh 24 | make 25 | ``` 26 | 27 | In this example, the Makefile creates 8 cross-validation folds of TFRecords with input sequences of length 393216 bp, generated with a genome-wide stride of 65551 bp (which is ~1/3 of the cropped sequence length, but shifts the bin boundaries, too). The output coverage tracks corresponding to each input sequence are cropped by 98304 bp on each side, before pooling the measurements into 32 bp bins. This results in 6144 coverage bins per 393kb sequence. The specific .w5 tracks to include in the TFRecord generation, and the scales and pooling transforms applied to the bins of each experiment, are given in the targets file 'targets_human.txt'. Below is a description of the columns in this file; a short code sketch of the resulting bin transform follows the notes. 28 | 29 | *targets_human.txt*: 30 | - (unnamed) => integer index of each track (must start from 0 when training a new model). 31 | - 'identifier' => unique identifier of each experiment (and strand). 32 | - 'file' => local file path to .w5 file. 33 | - 'clip' => hard clipping threshold to be applied to each bin, after soft-clipping. 34 | - 'clip_soft' => soft clipping (squashing) threshold. 35 | - 'scale' => scale value applied to each 32 bp bin after clipping. 36 | - 'sum_stat' => type of bin-level pooling operation ('sum_sqrt' = sum and exponentiate by 3/4). 37 | - 'strand_pair' => integer index of the other stranded track of an experiment (same index as current row if unstranded). 38 | - 'description' => text description of experiment. 39 | 40 | *Notes*: 41 | - See [here](https://github.com/calico/borzoi-paper/tree/main/data/training) for a description of the scripts called by the Makefile to create TFRecords. 42 | - Of note, the **legacy** settings are activated in these data processing scripts with the flag '--transform_old' in the Makefile. 43 | - The **legacy** approach crops the coverage tracks, a practice we have since abandoned in favor of a position-specific loss scale.
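To make the column semantics concrete, below is a minimal numpy sketch of the bin-level transform implied by the column descriptions above. This is illustrative only: the authoritative implementation lives in the data scripts called by the Makefile, and both the exact squashing form (square-root compression above 'clip_soft') and the order of operations are assumptions on our part.

```python
import numpy as np

# Illustrative sketch (not the pipeline code): transform one track's per-bp
# coverage using the example row of targets_human.txt (scale=0.3,
# sum_stat='sum_sqrt', clip_soft=384, clip=768, 32 bp bins).
def transform_track(cov_bp, width=32, scale=0.3, clip_soft=384, clip=768):
    # pool per-bp coverage into 32 bp bins ('sum'), then exponentiate by 3/4 ('sum_sqrt')
    n_bins = len(cov_bp) // width
    bins = cov_bp[: n_bins * width].reshape(n_bins, width).sum(axis=1) ** 0.75
    # soft-clip: assumed square-root compression of values above 'clip_soft'
    over = bins > clip_soft
    bins[over] = clip_soft + np.sqrt(bins[over] - clip_soft)
    # hard-clip at 'clip', then apply the track 'scale'
    return np.clip(bins, None, clip) * scale

# one cropped training sequence: (393216 - 2*98304) bp -> 6144 bins
cov = np.random.default_rng(0).poisson(2.0, size=393216 - 2 * 98304).astype(float)
print(transform_track(cov).shape)  # (6144,)
```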
44 | -------------------------------------------------------------------------------- /tutorials/legacy/make_data/download_bw.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # download example data from ENCODE (ENCSR000AEL - K562 RNA-seq); 2 replicates 4 | 5 | # define ENCODE ID 6 | ENC_ID='ENCSR000AEL' 7 | 8 | # define remote urls 9 | URL_P_REP1='https://www.encodeproject.org/files/ENCFF980ZHM/@@download/ENCFF980ZHM.bigWig' 10 | URL_M_REP1='https://www.encodeproject.org/files/ENCFF533LJF/@@download/ENCFF533LJF.bigWig' 11 | 12 | URL_P_REP2='https://www.encodeproject.org/files/ENCFF335LVS/@@download/ENCFF335LVS.bigWig' 13 | URL_M_REP2='https://www.encodeproject.org/files/ENCFF257NOL/@@download/ENCFF257NOL.bigWig' 14 | 15 | # define ENCODE file IDs 16 | FILE_P_REP1='ENCFF980ZHM' 17 | FILE_M_REP1='ENCFF533LJF' 18 | 19 | FILE_P_REP2='ENCFF335LVS' 20 | FILE_M_REP2='ENCFF257NOL' 21 | 22 | # create folder for bigwig files 23 | mkdir -p "human/rna/encode/$ENC_ID/rep1" 24 | mkdir -p "human/rna/encode/$ENC_ID/rep2" 25 | 26 | 27 | # download bigwig files; rep1 28 | if [ -f "human/rna/encode/$ENC_ID/rep1/$FILE_P_REP1.bigWig" ]; then 29 | echo "example RNA-seq data already downloaded (rep 1)." 30 | else 31 | wget $URL_P_REP1 -O "human/rna/encode/$ENC_ID/rep1/$FILE_P_REP1.bigWig" 32 | wget $URL_M_REP1 -O "human/rna/encode/$ENC_ID/rep1/$FILE_M_REP1.bigWig" 33 | fi 34 | 35 | # download bigwig files; rep2 36 | if [ -f "human/rna/encode/$ENC_ID/rep2/$FILE_P_REP2.bigWig" ]; then 37 | echo "example RNA-seq data already downloaded (rep 2)." 38 | else 39 | wget $URL_P_REP2 -O "human/rna/encode/$ENC_ID/rep2/$FILE_P_REP2.bigWig" 40 | wget $URL_M_REP2 -O "human/rna/encode/$ENC_ID/rep2/$FILE_M_REP2.bigWig" 41 | fi 42 | -------------------------------------------------------------------------------- /tutorials/legacy/make_data/download_dependencies.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # create additional folders in borzoi data folders 4 | mkdir -p "$BORZOI_HG38/assembly/ucsc" 5 | mkdir -p "$BORZOI_HG38/assembly/gnomad" 6 | mkdir -p "$BORZOI_HG38/mappability" 7 | mkdir -p "$BORZOI_HG38/blacklist" 8 | mkdir -p "$BORZOI_HG38/align" 9 | 10 | mkdir -p "$BORZOI_MM10/assembly/ucsc" 11 | mkdir -p "$BORZOI_MM10/mappability" 12 | mkdir -p "$BORZOI_MM10/blacklist" 13 | 14 | 15 | # download and uncompress auxiliary files required for Makefile (hg38) 16 | if [ -f "$BORZOI_HG38/assembly/ucsc/hg38_gaps.bed" ]; then 17 | echo "hg38_gaps.bed already exists." 18 | else 19 | wget -O - https://storage.googleapis.com/seqnn-share/helper/dependencies/hg38_gaps.bed.gz | gunzip -c > "$BORZOI_HG38/assembly/ucsc/hg38_gaps.bed" 20 | fi 21 | 22 | if [ -f "$BORZOI_HG38/mappability/umap_k36_t10_l32.bed" ]; then 23 | echo "umap_k36_t10_l32.bed (hg38) already exists." 24 | else 25 | wget -O - https://storage.googleapis.com/seqnn-share/helper/dependencies/umap_k36_t10_l32_hg38.bed.gz | gunzip -c > "$BORZOI_HG38/mappability/umap_k36_t10_l32.bed" 26 | fi 27 | 28 | if [ -f "$BORZOI_HG38/blacklist/blacklist_hg38_all.bed" ]; then 29 | echo "blacklist_hg38_all.bed already exists." 30 | else 31 | wget -O - https://storage.googleapis.com/seqnn-share/helper/dependencies/blacklist_hg38_all.bed.gz | gunzip -c > "$BORZOI_HG38/blacklist/blacklist_hg38_all.bed" 32 | fi 33 | 34 | if [ -f "$BORZOI_HG38/align/hg38.mm10.syn.net.gz" ]; then 35 | echo "hg38.mm10.syn.net.gz (hg38/mm10 alignment net) already exists."
36 | else 37 | wget https://storage.googleapis.com/seqnn-share/helper/dependencies/hg38.mm10.syn.net.gz -O "$BORZOI_HG38/align/hg38.mm10.syn.net.gz" 38 | fi 39 | 40 | 41 | # download and uncompress auxiliary files required for Makefile (mm10) 42 | if [ -f "$BORZOI_MM10/assembly/ucsc/mm10_gaps.bed" ]; then 43 | echo "mm10_gaps.bed already exists." 44 | else 45 | wget -O - https://storage.googleapis.com/seqnn-share/helper/dependencies/mm10_gaps.bed.gz | gunzip -c > "$BORZOI_MM10/assembly/ucsc/mm10_gaps.bed" 46 | fi 47 | 48 | if [ -f "$BORZOI_MM10/mappability/umap_k36_t10_l32.bed" ]; then 49 | echo "umap_k36_t10_l32.bed (mm10) already exists." 50 | else 51 | wget -O - https://storage.googleapis.com/seqnn-share/helper/dependencies/umap_k36_t10_l32_mm10.bed.gz | gunzip -c > "$BORZOI_MM10/mappability/umap_k36_t10_l32.bed" 52 | fi 53 | 54 | if [ -f "$BORZOI_MM10/blacklist/blacklist_mm10_all.bed" ]; then 55 | echo "blacklist_mm10_all.bed already exists." 56 | else 57 | wget -O - https://storage.googleapis.com/seqnn-share/helper/dependencies/blacklist_mm10_all.bed.gz | gunzip -c > "$BORZOI_MM10/blacklist/blacklist_mm10_all.bed" 58 | fi 59 | 60 | 61 | # download and uncompress pre-compiled umap bed files 62 | if [ -f umap_human.bed ]; then 63 | echo "umap_human.bed already exists." 64 | else 65 | wget -O - https://storage.googleapis.com/seqnn-share/helper/dependencies/umap_human.bed.gz | gunzip -c > umap_human.bed 66 | fi 67 | 68 | if [ -f umap_mouse.bed ]; then 69 | echo "umap_mouse.bed already exists." 70 | else 71 | wget -O - https://storage.googleapis.com/seqnn-share/helper/dependencies/umap_mouse.bed.gz | gunzip -c > umap_mouse.bed 72 | fi 73 | 74 | 75 | # download and index hg38 ml genome 76 | if [ -f "$BORZOI_HG38/assembly/ucsc/hg38.ml.fa" ]; then 77 | echo "hg38.ml.fa already exists." 78 | else 79 | wget -O - https://storage.googleapis.com/seqnn-share/helper/dependencies/hg38.ml.fa.gz | gunzip -c > "$BORZOI_HG38/assembly/ucsc/hg38.ml.fa" 80 | idx_genome.py "$BORZOI_HG38/assembly/ucsc/hg38.ml.fa" 81 | fi 82 | 83 | # download and index hg38 ml genome (gnomad major alleles) 84 | if [ -f "$BORZOI_HG38/assembly/gnomad/hg38.ml.fa" ]; then 85 | echo "hg38.ml.fa (gnomad) already exists." 86 | else 87 | wget -O - https://storage.googleapis.com/seqnn-share/helper/dependencies/hg38_gnomad.ml.fa.gz | gunzip -c > "$BORZOI_HG38/assembly/gnomad/hg38.ml.fa" 88 | idx_genome.py "$BORZOI_HG38/assembly/gnomad/hg38.ml.fa" 89 | fi 90 | 91 | # download and index mm10 ml genome 92 | if [ -f "$BORZOI_MM10/assembly/ucsc/mm10.ml.fa" ]; then 93 | echo "mm10.ml.fa already exists." 
94 | else 95 | wget -O - https://storage.googleapis.com/seqnn-share/helper/dependencies/mm10.ml.fa.gz | gunzip -c > "$BORZOI_MM10/assembly/ucsc/mm10.ml.fa" 96 | idx_genome.py "$BORZOI_MM10/assembly/ucsc/mm10.ml.fa" 97 | fi 98 | -------------------------------------------------------------------------------- /tutorials/legacy/make_data/process_w5.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # merge bigwig replicates, generate .w5 files and run qc 4 | 5 | # define ENCODE ID 6 | ENC_ID='ENCSR000AEL' 7 | 8 | # define ENCODE file IDs 9 | FILE_P_REP1='ENCFF980ZHM' 10 | FILE_M_REP1='ENCFF533LJF' 11 | 12 | FILE_P_REP2='ENCFF335LVS' 13 | FILE_M_REP2='ENCFF257NOL' 14 | 15 | # create folder for merged replicate files 16 | mkdir -p "human/rna/encode/$ENC_ID/summary" 17 | 18 | 19 | # step 1: generate per-replicate .w5 files 20 | 21 | # rep1 22 | if [ -f "human/rna/encode/$ENC_ID/rep1/$FILE_P_REP1+.w5" ]; then 23 | echo "example RNA-seq .w5 already exists (rep 1)." 24 | else 25 | bw_h5.py -z "human/rna/encode/$ENC_ID/rep1/$FILE_P_REP1.bigWig" "human/rna/encode/$ENC_ID/rep1/$FILE_P_REP1+.w5" 26 | bw_h5.py -z "human/rna/encode/$ENC_ID/rep1/$FILE_M_REP1.bigWig" "human/rna/encode/$ENC_ID/rep1/$FILE_M_REP1-.w5" 27 | fi 28 | 29 | # rep2 30 | if [ -f "human/rna/encode/$ENC_ID/rep2/$FILE_P_REP2+.w5" ]; then 31 | echo "example RNA-seq .w5 already exists (rep 2)." 32 | else 33 | bw_h5.py -z "human/rna/encode/$ENC_ID/rep2/$FILE_P_REP2.bigWig" "human/rna/encode/$ENC_ID/rep2/$FILE_P_REP2+.w5" 34 | bw_h5.py -z "human/rna/encode/$ENC_ID/rep2/$FILE_M_REP2.bigWig" "human/rna/encode/$ENC_ID/rep2/$FILE_M_REP2-.w5" 35 | fi 36 | 37 | 38 | # step 2: merge replicates 39 | 40 | if [ -f "human/rna/encode/$ENC_ID/summary/coverage+.w5" ]; then 41 | echo "example RNA-seq .w5 already exists (merged)." 42 | else 43 | w5_merge.py -w -s mean -z "human/rna/encode/$ENC_ID/summary/coverage+.w5" "human/rna/encode/$ENC_ID/rep1/$FILE_P_REP1+.w5" "human/rna/encode/$ENC_ID/rep2/$FILE_P_REP2+.w5" 44 | w5_merge.py -w -s mean -z "human/rna/encode/$ENC_ID/summary/coverage-.w5" "human/rna/encode/$ENC_ID/rep1/$FILE_M_REP1-.w5" "human/rna/encode/$ENC_ID/rep2/$FILE_M_REP2-.w5" 45 | fi 46 | 47 | 48 | # step 3: run qc on each replicate and the merged file 49 | 50 | if [ -f "human/rna/encode/$ENC_ID/summary/covqc/means.txt" ]; then 51 | echo "qc statistics already exist." 
52 | else 53 | # rep1 54 | w5_qc.py -b "$BORZOI_HG38/blacklist/blacklist_hg38_all.bed" -o "human/rna/encode/$ENC_ID/rep1/covqc" "human/rna/encode/$ENC_ID/rep1/$FILE_P_REP1+.w5" 55 | w5_qc.py -b "$BORZOI_HG38/blacklist/blacklist_hg38_all.bed" -o "human/rna/encode/$ENC_ID/rep1/covqc_m" "human/rna/encode/$ENC_ID/rep1/$FILE_M_REP1-.w5" 56 | 57 | # rep2 58 | w5_qc.py -b "$BORZOI_HG38/blacklist/blacklist_hg38_all.bed" -o "human/rna/encode/$ENC_ID/rep2/covqc" "human/rna/encode/$ENC_ID/rep2/$FILE_P_REP2+.w5" 59 | w5_qc.py -b "$BORZOI_HG38/blacklist/blacklist_hg38_all.bed" -o "human/rna/encode/$ENC_ID/rep2/covqc_m" "human/rna/encode/$ENC_ID/rep2/$FILE_M_REP2-.w5" 60 | 61 | # summary 62 | w5_qc.py -b "$BORZOI_HG38/blacklist/blacklist_hg38_all.bed" -o "human/rna/encode/$ENC_ID/summary/covqc" "human/rna/encode/$ENC_ID/summary/coverage+.w5" 63 | w5_qc.py -b "$BORZOI_HG38/blacklist/blacklist_hg38_all.bed" -o "human/rna/encode/$ENC_ID/summary/covqc_m" "human/rna/encode/$ENC_ID/summary/coverage-.w5" 64 | fi 65 | 66 | -------------------------------------------------------------------------------- /tutorials/legacy/make_data/targets_human.txt: -------------------------------------------------------------------------------- 1 | identifier file clip clip_soft scale sum_stat strand_pair description 2 | 0 ENCFF980ZHM+ human/rna/encode/ENCSR000AEL/summary/coverage+.w5 768 384 0.3 sum_sqrt 1 RNA:K562 3 | 1 ENCFF980ZHM- human/rna/encode/ENCSR000AEL/summary/coverage-.w5 768 384 0.3 sum_sqrt 0 RNA:K562 4 | -------------------------------------------------------------------------------- /tutorials/legacy/score_variants/README.md: -------------------------------------------------------------------------------- 1 | ## Variant Scoring 2 | 3 | This tutorial describes how to predict variant effect scores for a small set of SNVs defined in a .vcf file. For examples showcasing variant effect prediction at a larger scale (e.g. fine-mapped eQTL classification benchmarks), we refer the user to the [borzoi-paper repository](https://github.com/calico/borzoi-paper/tree/main). This example uses the pre-trained, published Borzoi model to predict variant effects. To download this model, run the script 'download_models.sh' in the 'borzoi' root folder. 4 | 5 | First, to calculate **gene-specific expression** scores, run the script 'score_expr_sed.sh'. Two different statistics are computed: (1) logSED (gene expression log fold change), and (2) logD2 (bin-level L2 norm across the coverage profile intersecting the exons of the gene). 6 | ```sh 7 | conda activate borzoi_py310 8 | cd ~/borzoi/tutorials/legacy/score_variants 9 | ./score_expr_sed.sh 10 | ``` 11 | 12 | To calculate **gene-agnostic expression** scores, run the script 'score_expr_sad.sh'. One statistic is computed: logD2 (bin-level L2 norm across the entire predicted coverage track). 13 | ```sh 14 | ./score_expr_sad.sh 15 | ``` 16 | 17 | To calculate **gene-specific polyadenylation** scores, run the script 'score_polya.sh'. One statistic is computed: COVR (3' coverage ratio across pA junctions of the target gene). 18 | ```sh 19 | ./score_polya.sh 20 | ``` 21 | 22 | To calculate **gene-specific splicing** scores, run the script 'score_splice.sh'. One statistic is computed: nDi (normalized maximum absolute difference in coverage bins across the target gene span). 23 | ```sh 24 | ./score_splice.sh 25 | ``` 26 | 27 | Finally, the jupyter notebook 'run_variant_scripts.ipynb' is provided to conveniently execute all of the above scripts.
The notebook also exemplifies how to navigate the variant prediction hdf5 files and print some example scores. 28 | 29 | *Notes*: 30 | - The legacy data transforms are activated in all above .sh scripts with the flag '-u'. 31 | 32 | -------------------------------------------------------------------------------- /tutorials/legacy/score_variants/run_variant_scripts.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "f5d0f9fb", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import os\n", 11 | "import sys\n", 12 | "import h5py\n" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "id": "7a94cbf8", 19 | "metadata": { 20 | "scrolled": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "#Calculate gene-specific variant effect scores\n", 25 | "\n", 26 | "!./score_expr_sed.sh\n" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "id": "1047ff0f", 33 | "metadata": {}, 34 | "outputs": [ 35 | { 36 | "name": "stdout", 37 | "output_type": "stream", 38 | "text": [ 39 | "score: 'logSED', snp: 'chr1_46309111_A_G_b38', gene: 'ENSG00000237090.1', track: 'RNA:adipose_tissue' => -0.2551\n" 40 | ] 41 | } 42 | ], 43 | "source": [ 44 | "#Print an example variant effect prediction for a SNP-gene pair (gene-specific expression)\n", 45 | "\n", 46 | "sed_h5 = h5py.File('snp_sed/f3c0/sed.h5', 'r')\n", 47 | "\n", 48 | "row_ix = 63\n", 49 | "target_ix = 0\n", 50 | "\n", 51 | "print(\"score: 'logSED', snp: '\" + str(sed_h5['snp'][sed_h5['si'][row_ix]].decode()) + \"', gene: '\" + str(sed_h5['gene'][sed_h5['si'][row_ix]].decode()) + \"', track: '\" + str(sed_h5['target_labels'][target_ix].decode()) + \"' => \" + str(round(sed_h5['logSED'][row_ix, target_ix], 4)))\n" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "id": "f105ecd9", 58 | "metadata": { 59 | "scrolled": true 60 | }, 61 | "outputs": [], 62 | "source": [ 63 | "#Calculate gene-agnostic variant effect scores\n", 64 | "\n", 65 | "!./score_expr_sad.sh\n" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 3, 71 | "id": "96e4f7cb", 72 | "metadata": {}, 73 | "outputs": [ 74 | { 75 | "name": "stdout", 76 | "output_type": "stream", 77 | "text": [ 78 | "score: 'logD2', snp: 'chr1_43120331_C_T_b38', track: 'RNA:adipose_tissue' => 0.1057\n" 79 | ] 80 | } 81 | ], 82 | "source": [ 83 | "#Print an example variant effect prediction for a SNP (gene-agnostic expression)\n", 84 | "\n", 85 | "sad_h5 = h5py.File('snp_sad/f3c0/sad.h5', 'r')\n", 86 | "\n", 87 | "snp_ix = 1\n", 88 | "target_ix = 0\n", 89 | "\n", 90 | "print(\"score: 'logD2', snp: '\" + str(sad_h5['snp'][snp_ix].decode()) + \"', track: '\" + str(sad_h5['target_labels'][target_ix].decode()) + \"' => \" + str(round(sad_h5['logD2'][snp_ix, target_ix], 4)))\n" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "id": "c56efaef", 97 | "metadata": { 98 | "scrolled": true 99 | }, 100 | "outputs": [], 101 | "source": [ 102 | "#Calculate splice variant effect scores\n", 103 | "\n", 104 | "!./score_splice.sh\n" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 4, 110 | "id": "980993fc", 111 | "metadata": {}, 112 | "outputs": [ 113 | { 114 | "name": "stdout", 115 | "output_type": "stream", 116 | "text": [ 117 | "score: 'nDi', snp: 'chr1_156236330_G_A', gene: 'ENSG00000225905.1', track: 'RNA:foreskin fibroblast male 
newborn' => 0.0022\n" 118 | ] 119 | } 120 | ], 121 | "source": [ 122 | "#Print an example variant effect prediction for a SNP-gene pair (splicing)\n", 123 | "\n", 124 | "sed_h5 = h5py.File('snp_splice/f3c0/sed.h5', 'r')\n", 125 | "\n", 126 | "row_ix = 116\n", 127 | "target_ix = 755\n", 128 | "\n", 129 | "print(\"score: 'nDi', snp: '\" + str(sed_h5['snp'][sed_h5['si'][row_ix]].decode()) + \"', gene: '\" + str(sed_h5['gene'][sed_h5['si'][row_ix]].decode()) + \"', track: '\" + str(sed_h5['target_labels'][target_ix].decode()) + \"' => \" + str(round(sed_h5['nDi'][row_ix, target_ix], 4)))\n" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "id": "05cccfb6", 136 | "metadata": { 137 | "scrolled": true 138 | }, 139 | "outputs": [], 140 | "source": [ 141 | "#Calculate polyadenylation variant effect scores\n", 142 | "\n", 143 | "!./score_polya.sh\n" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 5, 149 | "id": "43ac562f", 150 | "metadata": {}, 151 | "outputs": [ 152 | { 153 | "name": "stdout", 154 | "output_type": "stream", 155 | "text": [ 156 | "score: 'COVR', snp: 'chr16_80976052_T_G', gene: 'ENSG00000132879.14', track: 'RNA:HeLa-S3 nuclear fraction' => 0.0628\n" 157 | ] 158 | } 159 | ], 160 | "source": [ 161 | "#Print an example variant effect prediction for a SNP-gene pair (polyadenylation)\n", 162 | "\n", 163 | "sed_h5 = h5py.File('snp_polya/f3c0/sed.h5', 'r')\n", 164 | "\n", 165 | "row_ix = 47\n", 166 | "target_ix = 100\n", 167 | "\n", 168 | "print(\"score: 'COVR', snp: '\" + str(sed_h5['snp'][sed_h5['si'][row_ix]].decode()) + \"', gene: '\" + str(sed_h5['gene'][sed_h5['si'][row_ix]].decode()) + \"', track: '\" + str(sed_h5['target_labels'][target_ix].decode()) + \"' => \" + str(round(sed_h5['COVR'][row_ix, target_ix], 4)))\n" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "id": "0ba23572", 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [] 178 | } 179 | ], 180 | "metadata": { 181 | "kernelspec": { 182 | "display_name": "Python 3 (ipykernel)", 183 | "language": "python", 184 | "name": "python3" 185 | }, 186 | "language_info": { 187 | "codemirror_mode": { 188 | "name": "ipython", 189 | "version": 3 190 | }, 191 | "file_extension": ".py", 192 | "mimetype": "text/x-python", 193 | "name": "python", 194 | "nbconvert_exporter": "python", 195 | "pygments_lexer": "ipython3", 196 | "version": "3.8.15" 197 | } 198 | }, 199 | "nbformat": 4, 200 | "nbformat_minor": 5 201 | } 202 | -------------------------------------------------------------------------------- /tutorials/legacy/score_variants/score_expr_sad.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | mkdir -p snp_sad/f3c0 4 | 5 | borzoi_sad.py -o snp_sad/f3c0 --rc --stats logD2 -u -t ../../../examples/targets_human.txt ../../../examples/params_pred.json ../../../examples/saved_models/f3c0/train/model0_best.h5 snps_expr.vcf 6 | -------------------------------------------------------------------------------- /tutorials/legacy/score_variants/score_expr_sed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | mkdir -p snp_sed/f3c0 4 | 5 | borzoi_sed.py -o snp_sed/f3c0 --rc --stats logSED,logD2 -u -t ../../../examples/targets_gtex.txt ../../../examples/params_pred.json ../../../examples/saved_models/f3c0/train/model0_best.h5 snps_expr.vcf 6 | 
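In addition to the notebook, a small, hypothetical helper like the following can be used for a quick look at any of the .h5 files written by the scoring scripts; the dataset names ('target_labels' and the per-statistic score arrays of shape rows x tracks) follow those used in 'run_variant_scripts.ipynb', and the helper itself is not part of the repository.

```python
import h5py
import numpy as np

# Hypothetical quick-look helper for the variant score files produced above.
def summarize_scores(h5_path, score_key):
    with h5py.File(h5_path, "r") as h5:
        scores = h5[score_key][:]  # (snp or snp-gene rows) x (tracks)
        print(h5_path, "datasets:", sorted(h5.keys()))
        print(score_key, "shape:", scores.shape)
        # track with the largest mean absolute effect, as a quick sanity check
        best = int(np.argmax(np.abs(scores).mean(axis=0)))
        print("largest mean |%s| in track: %s" % (score_key, h5["target_labels"][best].decode()))

summarize_scores("snp_sed/f3c0/sed.h5", "logSED")
summarize_scores("snp_sad/f3c0/sad.h5", "logD2")
```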
-------------------------------------------------------------------------------- /tutorials/legacy/score_variants/score_polya.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | mkdir -p snp_polya/f3c0 4 | 5 | borzoi_sed_paqtl_cov.py -o snp_polya/f3c0 --rc --stats COVR -u -t ../../../examples/targets_rna.txt ../../../examples/params_pred.json ../../../examples/saved_models/f3c0/train/model0_best.h5 snps_polya.vcf 6 | -------------------------------------------------------------------------------- /tutorials/legacy/score_variants/score_splice.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | mkdir -p snp_splice/f3c0 4 | 5 | borzoi_sed.py -o snp_splice/f3c0 --span --no_untransform --rc --stats nDi -u -t ../../../examples/targets_rna.txt ../../../examples/params_pred.json ../../../examples/saved_models/f3c0/train/model0_best.h5 snps_splice.vcf 6 | -------------------------------------------------------------------------------- /tutorials/legacy/score_variants/snps_expr.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.2 2 | chr1 43110773 chr1_43110773_G_A_b38 G A . . 3 | chr1 43120331 chr1_43120331_C_T_b38 C T . . 4 | chr1 46309111 chr1_46309111_A_G_b38 A G . . 5 | chr1 52632886 chr1_52632886_A_C_b38 A C . . 6 | chr1 54053434 chr1_54053434_G_A_b38 G A . . 7 | -------------------------------------------------------------------------------- /tutorials/legacy/score_variants/snps_polya.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.2 2 | ##INFO= 3 | ##INFO= 4 | ##INFO= 5 | #CHROM POS ID REF ALT QUAL FILTER INFO 6 | chr1 11790946 chr1_11790946_G_C G C . . MT=ENSG00000177000.grp_2.downstream.ENST00000641805;PD=924;PI=chr1_11790946_G_C 7 | chr1 150160094 chr1_150160094_C_G C G . . MT=ENSG00000023902.grp_1.downstream.ENST00000369126;PD=29;PI=chr1_150160094_C_G 8 | chr16 57665101 chr16_57665101_A_G A G . . MT=ENSG00000205336.grp_1.downstream.ENST00000568908;PD=73;PI=chr16_57665101_A_G 9 | chr16 80976052 chr16_80976052_T_G T G . . MT=ENSG00000103121.grp_2.downstream.ENST00000565925;PD=24;PI=chr16_80976052_T_G 10 | chr16 88857261 chr16_88857261_T_C T C . . MT=ENSG00000167515.grp_2.downstream.ENST00000564547;PD=3851;PI=chr16_88857261_T_C -------------------------------------------------------------------------------- /tutorials/legacy/score_variants/snps_splice.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.2 2 | ##INFO= 3 | ##INFO= 4 | ##INFO= 5 | #CHROM POS ID REF ALT QUAL FILTER INFO 6 | chr1 1665061 chr1_1665061_C_T C T . . MT=ENSG00000189339.grp_2.contained.ENST00000611123;SD=959;PI=chr1_1665061_C_T 7 | chr1 1689221 chr1_1689221_G_A G A . . MT=ENSG00000189339.grp_1.contained.ENST00000614300;SD=1753;PI=chr1_1689221_G_A 8 | chr1 50655526 chr1_50655526_T_C T C . . MT=ENSG00000185104.grp_2.contained.ENST00000396153;SD=3;PI=chr1_50655526_T_C 9 | chr1 109489368 chr1_109489368_C_G C G . . MT=ENSG00000143537.grp_2.contained.ENST00000360674;SD=1;PI=chr1_155060832_G_A 10 | chr1 156236330 chr1_156236330_G_A G A . . 
MT=ENSG00000160783.grp_1.contained.ENST00000368279;SD=17;PI=chr1_156236330_G_A 11 | -------------------------------------------------------------------------------- /tutorials/legacy/train_model/README.md: -------------------------------------------------------------------------------- 1 | ## Model Training 2 | 3 | This tutorial describes how to train smaller Borzoi models on the example RNA-seq experiment processed in the [make_data tutorial](https://github.com/calico/borzoi/tree/main/tutorials/legacy/make_data). 4 | 5 | To train a 'Mini Borzoi' ensemble (~40M parameters, 2 cross-validation folds), run the script 'train_mini.sh'. The model parameters are specified in 'params_mini.json'. This model can be trained with a batch size of 2 on a 24GB NVIDIA Titan RTX or RTX4090 GPU. 6 | ```sh 7 | conda activate borzoi_py310 8 | cd ~/borzoi/tutorials/legacy/train_model 9 | ./train_mini.sh 10 | ``` 11 | 12 | Alternatively, to train an even smaller 'Micro Borzoi' ensemble (~5M parameters), run the script 'train_micro.sh'. This model fits on the above GPU cards with a batch size of 4, which means the learning rate can be doubled and each epoch finishes in half the time. 13 | ```sh 14 | ./train_micro.sh 15 | ``` 16 | 17 | *Notes*: 18 | - See [here](https://github.com/calico/borzoi-paper/tree/main/model) for a description of the scripts called internally by the training .sh script. 19 | - The **legacy** model crops the predicted tracks (see layer 'Cropping1D' in the parameters file). In this example, the input sequence has length 393216 bp, and the cropping layer removes 3072 bins of 32 bp from each side, resulting in 6144 bins (see the arithmetic sketch below). 20 | - In the **legacy** architecture, there is an extra (superfluous) linear convolution applied in each 'unet_conv' layer (see the bool 'upsample_conv' in the parameters file). This additional convolution has since been removed.
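As a quick sanity check on the cropping arithmetic in the note above, the following snippet (a hypothetical helper, not part of the repository) reproduces the 6144-bin figure from the values in the parameter files:

```python
# Verify that a 393,216 bp input at 32 bp per bin, cropped by 3,072 bins on
# each side, yields 6,144 predicted bins (matching the 98,304 bp crop used
# in the make_data tutorial: 98304 / 32 = 3072).
seq_length = 393216  # "seq_length" in params_micro.json / params_mini.json
bin_width = 32       # pooling width used when creating the TFRecords
crop_bins = 3072     # "cropping" value of the 'Cropping1D' layer

total_bins = seq_length // bin_width      # 12288 bins before cropping
output_bins = total_bins - 2 * crop_bins  # 6144 bins after cropping
assert output_bins == 6144
print(total_bins, output_bins)
```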
21 | -------------------------------------------------------------------------------- /tutorials/legacy/train_model/params_micro.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "batch_size": 4, 4 | "shuffle_buffer": 256, 5 | "optimizer": "adam", 6 | "learning_rate": 0.0002, 7 | "loss": "poisson_mn", 8 | "total_weight": 0.2, 9 | "warmup_steps": 10000, 10 | "global_clipnorm": 0.2, 11 | "adam_beta1": 0.9, 12 | "adam_beta2": 0.999, 13 | "patience": 30, 14 | "train_epochs_min": 130, 15 | "train_epochs_max": 180 16 | }, 17 | "model": { 18 | "seq_length": 393216, 19 | "augment_rc": true, 20 | "augment_shift": 3, 21 | "activation": "gelu", 22 | "norm_type": "batch", 23 | "bn_momentum": 0.9, 24 | "kernel_initializer": "lecun_normal", 25 | "l2_scale": 1.0e-6, 26 | "trunk": [ 27 | { 28 | "name": "conv_dna", 29 | "filters": 128, 30 | "kernel_size": 11, 31 | "norm_type": null, 32 | "activation": "linear", 33 | "pool_size": 2 34 | }, 35 | { 36 | "name": "res_tower", 37 | "filters_init": 160, 38 | "filters_end": 320, 39 | "divisible_by": 8, 40 | "kernel_size": 5, 41 | "num_convs": 1, 42 | "pool_size": 2, 43 | "repeat": 6 44 | }, 45 | { 46 | "name": "transformer_tower", 47 | "key_size": 32, 48 | "heads": 4, 49 | "num_position_features": 32, 50 | "dropout": 0.1, 51 | "attention_dropout": 0.01, 52 | "mha_l2_scale": 1.0e-8, 53 | "l2_scale": 1.0e-8, 54 | "kernel_initializer": "he_normal", 55 | "repeat": 4 56 | }, 57 | { 58 | "name": "unet_conv", 59 | "kernel_size": 3, 60 | "upsample_conv": true 61 | }, 62 | { 63 | "name": "unet_conv", 64 | "kernel_size": 3, 65 | "upsample_conv": true 66 | }, 67 | { 68 | "name": "Cropping1D", 69 | "cropping": 3072 70 | } 71 | ], 72 | "head_human": { 73 | "name": "final", 74 | "units": 2, 75 | "activation": "softplus" 76 | } 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /tutorials/legacy/train_model/params_mini.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "batch_size": 2, 4 | "shuffle_buffer": 256, 5 | "optimizer": "adam", 6 | "learning_rate": 0.0001, 7 | "loss": "poisson_mn", 8 | "total_weight": 0.2, 9 | "warmup_steps": 20000, 10 | "global_clipnorm": 0.1, 11 | "adam_beta1": 0.9, 12 | "adam_beta2": 0.999, 13 | "patience": 30, 14 | "train_epochs_min": 130, 15 | "train_epochs_max": 180 16 | }, 17 | "model": { 18 | "seq_length": 393216, 19 | "augment_rc": true, 20 | "augment_shift": 3, 21 | "activation": "gelu", 22 | "norm_type": "batch", 23 | "bn_momentum": 0.9, 24 | "kernel_initializer": "lecun_normal", 25 | "l2_scale": 1.0e-6, 26 | "trunk": [ 27 | { 28 | "name": "conv_dna", 29 | "filters": 320, 30 | "kernel_size": 11, 31 | "norm_type": null, 32 | "activation": "linear", 33 | "pool_size": 2 34 | }, 35 | { 36 | "name": "res_tower", 37 | "filters_init": 384, 38 | "filters_end": 768, 39 | "divisible_by": 16, 40 | "kernel_size": 5, 41 | "num_convs": 1, 42 | "pool_size": 2, 43 | "repeat": 6 44 | }, 45 | { 46 | "name": "transformer_tower", 47 | "key_size": 64, 48 | "heads": 4, 49 | "num_position_features": 32, 50 | "dropout": 0.2, 51 | "mha_l2_scale": 1.0e-8, 52 | "l2_scale": 1.0e-8, 53 | "kernel_initializer": "he_normal", 54 | "repeat": 8 55 | }, 56 | { 57 | "name": "unet_conv", 58 | "kernel_size": 3, 59 | "upsample_conv": true 60 | }, 61 | { 62 | "name": "unet_conv", 63 | "kernel_size": 3, 64 | "upsample_conv": true 65 | }, 66 | { 67 | "name": "Cropping1D", 68 | "cropping": 3072 69 | } 70 | ], 71 | 
"head_human": { 72 | "name": "final", 73 | "units": 2, 74 | "activation": "softplus" 75 | } 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /tutorials/legacy/train_model/train_micro.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | westminster_train_folds.py -e borzoi_py310 -f 2 -c 1 -q rtx4090 -o micro_models params_micro.json ../make_data/data/hg38 4 | -------------------------------------------------------------------------------- /tutorials/legacy/train_model/train_mini.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | westminster_train_folds.py -e borzoi_py310 -f 2 -c 1 -q rtx4090 -o mini_models params_mini.json ../make_data/data/hg38 4 | --------------------------------------------------------------------------------